msaas-scraper-framework 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msaas_scraper_framework-0.1.0/.gitignore +21 -0
- msaas_scraper_framework-0.1.0/PKG-INFO +13 -0
- msaas_scraper_framework-0.1.0/pyproject.toml +40 -0
- msaas_scraper_framework-0.1.0/src/scraper_framework/__init__.py +52 -0
- msaas_scraper_framework-0.1.0/src/scraper_framework/base.py +169 -0
- msaas_scraper_framework-0.1.0/src/scraper_framework/filters.py +135 -0
- msaas_scraper_framework-0.1.0/src/scraper_framework/rate_limiter.py +110 -0
- msaas_scraper_framework-0.1.0/src/scraper_framework/registry.py +119 -0
- msaas_scraper_framework-0.1.0/src/scraper_framework/result.py +58 -0
- msaas_scraper_framework-0.1.0/tests/__init__.py +0 -0
- msaas_scraper_framework-0.1.0/tests/test_base.py +418 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
node_modules/
|
|
2
|
+
dist/
|
|
3
|
+
.next/
|
|
4
|
+
.turbo/
|
|
5
|
+
*.pyc
|
|
6
|
+
__pycache__/
|
|
7
|
+
.venv/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
.pytest_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.env
|
|
12
|
+
.env.local
|
|
13
|
+
.env.*.local
|
|
14
|
+
.DS_Store
|
|
15
|
+
coverage/
|
|
16
|
+
|
|
17
|
+
# Runtime artifacts
|
|
18
|
+
logs_llm/
|
|
19
|
+
vectors.db
|
|
20
|
+
vectors.db-shm
|
|
21
|
+
vectors.db-wal
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msaas-scraper-framework
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generic web scraper framework with registry, rate limiting, and composable filters
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: httpx>=0.27.0
|
|
7
|
+
Requires-Dist: msaas-api-core
|
|
8
|
+
Requires-Dist: msaas-errors
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: httpx>=0.27.0; extra == 'dev'
|
|
11
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
12
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
13
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "msaas-scraper-framework"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Generic web scraper framework with registry, rate limiting, and composable filters"
|
|
5
|
+
requires-python = ">=3.12"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"httpx>=0.27.0",
|
|
8
|
+
"msaas-errors",
|
|
9
|
+
"msaas-api-core",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
dev = [
|
|
14
|
+
"pytest>=8.0",
|
|
15
|
+
"pytest-asyncio>=0.24",
|
|
16
|
+
"httpx>=0.27.0",
|
|
17
|
+
"ruff>=0.8",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["hatchling"]
|
|
22
|
+
build-backend = "hatchling.build"
|
|
23
|
+
|
|
24
|
+
[tool.hatch.build.targets.wheel]
|
|
25
|
+
packages = ["src/scraper_framework"]
|
|
26
|
+
|
|
27
|
+
[tool.ruff]
|
|
28
|
+
target-version = "py312"
|
|
29
|
+
line-length = 100
|
|
30
|
+
|
|
31
|
+
[tool.ruff.lint]
|
|
32
|
+
select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
|
|
33
|
+
|
|
34
|
+
[tool.pytest.ini_options]
|
|
35
|
+
testpaths = ["tests"]
|
|
36
|
+
asyncio_mode = "auto"
|
|
37
|
+
|
|
38
|
+
[tool.uv.sources]
|
|
39
|
+
msaas-api-core = { workspace = true }
|
|
40
|
+
msaas-errors = { workspace = true }
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""scraper-framework: Generic web scraper framework with registry, rate limiting, and filters.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from scraper_framework import BaseScraper, ScraperResult, ScraperRegistry
|
|
6
|
+
|
|
7
|
+
registry = ScraperRegistry()
|
|
8
|
+
|
|
9
|
+
@registry.register("example")
|
|
10
|
+
class ExampleScraper(BaseScraper):
|
|
11
|
+
source = "example"
|
|
12
|
+
id_prefix = "ex"
|
|
13
|
+
|
|
14
|
+
def _run(self) -> ScraperResult:
|
|
15
|
+
with self.make_client() as client:
|
|
16
|
+
resp = client.get("https://example.com/api")
|
|
17
|
+
self.result.total_fetched += len(resp.json())
|
|
18
|
+
self.sleep()
|
|
19
|
+
return self.result
|
|
20
|
+
|
|
21
|
+
scraper = registry.get("example")
|
|
22
|
+
result = scraper.run()
|
|
23
|
+
print(result.to_dict())
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from scraper_framework.base import BaseScraper
|
|
27
|
+
from scraper_framework.filters import (
|
|
28
|
+
all_match,
|
|
29
|
+
any_match,
|
|
30
|
+
keyword_filter,
|
|
31
|
+
length_filter,
|
|
32
|
+
negate,
|
|
33
|
+
none_match,
|
|
34
|
+
regex_filter,
|
|
35
|
+
)
|
|
36
|
+
from scraper_framework.rate_limiter import RateLimiter
|
|
37
|
+
from scraper_framework.registry import ScraperRegistry
|
|
38
|
+
from scraper_framework.result import ScraperResult
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"BaseScraper",
|
|
42
|
+
"RateLimiter",
|
|
43
|
+
"ScraperRegistry",
|
|
44
|
+
"ScraperResult",
|
|
45
|
+
"all_match",
|
|
46
|
+
"any_match",
|
|
47
|
+
"keyword_filter",
|
|
48
|
+
"length_filter",
|
|
49
|
+
"negate",
|
|
50
|
+
"none_match",
|
|
51
|
+
"regex_filter",
|
|
52
|
+
]
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Generic base scraper class.
|
|
2
|
+
|
|
3
|
+
Provides the abstract skeleton that concrete scrapers implement. Handles
|
|
4
|
+
lifecycle (setup, run, teardown), HTTP client creation, rate limiting, and
|
|
5
|
+
result tracking.
|
|
6
|
+
|
|
7
|
+
Example subclass::
|
|
8
|
+
|
|
9
|
+
from scraper_framework import BaseScraper, ScraperResult
|
|
10
|
+
|
|
11
|
+
class MyScraper(BaseScraper):
|
|
12
|
+
source = "my-source"
|
|
13
|
+
id_prefix = "ms"
|
|
14
|
+
|
|
15
|
+
def _run(self) -> ScraperResult:
|
|
16
|
+
with self.make_client() as client:
|
|
17
|
+
resp = client.get("https://example.com/api/items")
|
|
18
|
+
items = resp.json()
|
|
19
|
+
self.result.total_fetched += len(items)
|
|
20
|
+
# ... process items ...
|
|
21
|
+
self.sleep()
|
|
22
|
+
return self.result
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import logging
|
|
28
|
+
from abc import ABC, abstractmethod
|
|
29
|
+
from collections.abc import Callable
|
|
30
|
+
from datetime import datetime, timezone
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
import httpx
|
|
34
|
+
|
|
35
|
+
from scraper_framework.rate_limiter import RateLimiter
|
|
36
|
+
from scraper_framework.result import ScraperResult
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class BaseScraper(ABC):
    """Abstract base class for all scrapers.

    Subclasses **must** set ``source`` and implement :meth:`_run`.

    Class attributes:
        source: Identifier for this scraper (e.g. ``"greenhouse"``).
        id_prefix: Short prefix for generated record IDs (e.g. ``"gh"``).
        rate_limit_range: Default ``(min, max)`` seconds between requests.
    """

    source: str = ""
    id_prefix: str = ""
    rate_limit_range: tuple[float, float] = (2.0, 5.0)

    def __init__(self, *, config: dict[str, Any] | None = None) -> None:
        self.config: dict[str, Any] = config or {}
        self.result = ScraperResult(source=self.source)
        low, high = self.rate_limit_range
        self.rate_limiter = RateLimiter(min_seconds=low, max_seconds=high)
        # Bind a structlog logger when the library is installed; otherwise
        # degrade gracefully to the standard-library logger.
        try:
            import structlog

            self.log = structlog.get_logger(f"scraper.{self.source}").bind(source=self.source)
        except ImportError:
            self.log = logging.getLogger(f"scraper.{self.source}")

    # ------------------------------------------------------------------
    # Lifecycle hooks (override in subclasses if needed)
    # ------------------------------------------------------------------

    def setup(self) -> None:
        """Hook invoked before :meth:`_run`; override for one-time initialisation."""

    def teardown(self) -> None:
        """Hook invoked after :meth:`_run`, even on error; override for cleanup."""

    # ------------------------------------------------------------------
    # Abstract
    # ------------------------------------------------------------------

    @abstractmethod
    def _run(self) -> ScraperResult:
        """Execute the scraping logic.

        Must populate ``self.result`` and return it. Implementations should
        call ``self.sleep()`` between HTTP requests to respect rate limits.
        """

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def run(
        self,
        *,
        on_before: Callable[["BaseScraper"], None] | None = None,
        on_after: Callable[["BaseScraper", ScraperResult], None] | None = None,
    ) -> ScraperResult:
        """Run the full lifecycle: setup, scrape, teardown, finalise result.

        Args:
            on_before: Optional callback invoked after :meth:`setup` but
                before :meth:`_run`.
            on_after: Optional callback invoked with the result after
                :meth:`_run` (runs even when :meth:`_run` raised).

        Returns:
            The populated :class:`ScraperResult`.
        """
        self.setup()
        if on_before:
            on_before(self)
        try:
            self._run()
        except Exception:
            # Record the failure and re-raise so callers can decide policy.
            self.result.errors += 1
            self.log.exception("Scraper %s failed", self.source)
            raise
        finally:
            self.teardown()
            self.result.finish()
            if on_after:
                on_after(self, self.result)
        return self.result

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------

    def sleep(self, range_override: tuple[float, float] | None = None) -> float:
        """Rate-limiting sleep between requests.

        Args:
            range_override: Optional ``(min, max)`` seconds replacing the
                default rate limit range for this call.

        Returns:
            Actual seconds slept.
        """
        return self.rate_limiter.wait(override=range_override)

    def make_client(self, **kwargs: Any) -> httpx.Client:
        """Create an :class:`httpx.Client` with sensible defaults.

        Defaults are a 30-second timeout and redirect following; any
        keyword argument passed here overrides its default.
        """
        return httpx.Client(**{"timeout": 30.0, "follow_redirects": True, **kwargs})

    def make_async_client(self, **kwargs: Any) -> httpx.AsyncClient:
        """Create an :class:`httpx.AsyncClient` with the same defaults as
        :meth:`make_client`.
        """
        return httpx.AsyncClient(**{"timeout": 30.0, "follow_redirects": True, **kwargs})

    @staticmethod
    def now_iso() -> str:
        """Current UTC timestamp as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Composable filter functions for scraper pipelines.
|
|
2
|
+
|
|
3
|
+
Filters are plain functions with the signature ``(text: str) -> bool`` so they
|
|
4
|
+
can be combined freely with :func:`all_match`, :func:`any_match`, and
|
|
5
|
+
:func:`none_match`.
|
|
6
|
+
|
|
7
|
+
Example::
|
|
8
|
+
|
|
9
|
+
from scraper_framework.filters import (
|
|
10
|
+
regex_filter, keyword_filter, all_match, none_match,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
is_senior = keyword_filter({"senior", "staff", "lead"})
|
|
14
|
+
not_excluded = none_match(keyword_filter({"intern", "junior"}))
|
|
15
|
+
has_python = regex_filter(r"\\bpython\\b")
|
|
16
|
+
|
|
17
|
+
title = "Senior Python Engineer"
|
|
18
|
+
if is_senior(title) and not_excluded(title) and has_python(title):
|
|
19
|
+
print("Relevant!")
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import re
|
|
25
|
+
from collections.abc import Callable
|
|
26
|
+
|
|
27
|
+
type TextFilter = Callable[[str], bool]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Primitive filter builders
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def keyword_filter(
    keywords: set[str],
    *,
    case_sensitive: bool = False,
) -> Callable[[str], bool]:
    """Build a filter that passes when at least one keyword occurs in the text.

    Matching is plain substring containment; for word-boundary matching use
    :func:`regex_filter` instead.

    Args:
        keywords: Candidate substrings to search for.
        case_sensitive: When False (the default), both the keywords and the
            input text are lowercased before comparison.
    """
    if case_sensitive:

        def _match(text: str) -> bool:
            return any(word in text for word in keywords)
    else:
        # Pre-normalise once so each call only lowercases the input text.
        lowered_keywords = {word.lower() for word in keywords}

        def _match(text: str) -> bool:
            haystack = text.lower()
            return any(word in haystack for word in lowered_keywords)

    return _match
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def regex_filter(pattern: str, *, flags: int = re.IGNORECASE) -> Callable[[str], bool]:
    """Build a filter that passes when *pattern* matches anywhere in the text.

    Args:
        pattern: Regular expression source string.
        flags: Flags forwarded to :func:`re.compile`
            (case-insensitive by default).
    """
    # Compile once at build time; each call then only runs the search.
    rx = re.compile(pattern, flags)

    def _match(text: str) -> bool:
        return bool(rx.search(text))

    return _match
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def length_filter(*, min_length: int = 0, max_length: int | None = None) -> TextFilter:
|
|
79
|
+
"""Return a filter that checks text length.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
min_length: Minimum number of characters.
|
|
83
|
+
max_length: Maximum number of characters (``None`` for unlimited).
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
def _match(text: str) -> bool:
|
|
87
|
+
n = len(text)
|
|
88
|
+
if n < min_length:
|
|
89
|
+
return False
|
|
90
|
+
if max_length is not None and n > max_length:
|
|
91
|
+
return False
|
|
92
|
+
return True
|
|
93
|
+
|
|
94
|
+
return _match
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# Combinators
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def all_match(*filters: Callable[[str], bool]) -> Callable[[str], bool]:
    """Combine filters so the result passes only when every sub-filter passes."""

    def _match(text: str) -> bool:
        # Short-circuit on the first failing sub-filter.
        for check in filters:
            if not check(text):
                return False
        return True

    return _match
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def any_match(*filters: Callable[[str], bool]) -> Callable[[str], bool]:
    """Combine filters so the result passes if at least one sub-filter passes."""

    def _match(text: str) -> bool:
        # Short-circuit on the first passing sub-filter.
        for check in filters:
            if check(text):
                return True
        return False

    return _match
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def none_match(*filters: Callable[[str], bool]) -> Callable[[str], bool]:
    """Combine filters so the result passes only when every sub-filter fails."""

    def _match(text: str) -> bool:
        return all(not check(text) for check in filters)

    return _match
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def negate(f: Callable[[str], bool]) -> Callable[[str], bool]:
    """Invert a filter: the wrapped filter passes exactly when *f* fails."""

    def _inverted(text: str) -> bool:
        return not f(text)

    return _inverted
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Rate limiting utilities for scrapers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import random
|
|
7
|
+
import time
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class RateLimiter:
    """Jittered rate limiter for polite scraping.

    Each wait blocks for a duration drawn uniformly from
    ``[min_seconds, max_seconds]``, spreading request timing to avoid
    thundering-herd patterns. Usable from both sync and async code.

    Example::

        limiter = RateLimiter(min_seconds=1.0, max_seconds=3.0)
        for url in urls:
            fetch(url)
            limiter.wait()

    Async usage::

        limiter = RateLimiter(min_seconds=0.5, max_seconds=1.5)
        for url in urls:
            await fetch(url)
            await limiter.async_wait()
    """

    min_seconds: float = 2.0
    max_seconds: float = 5.0
    _request_count: int = field(default=0, init=False, repr=False)
    _last_request_at: datetime | None = field(default=None, init=False, repr=False)

    def __post_init__(self) -> None:
        # Fail loudly on an invalid range rather than sleeping nonsense.
        if self.min_seconds < 0:
            raise ValueError(f"min_seconds must be >= 0, got {self.min_seconds}")
        if self.max_seconds < self.min_seconds:
            raise ValueError(
                f"max_seconds ({self.max_seconds}) must be >= "
                f"min_seconds ({self.min_seconds})"
            )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @property
    def request_count(self) -> int:
        """How many waits have completed so far."""
        return self._request_count

    @property
    def last_request_at(self) -> datetime | None:
        """UTC time of the most recent completed wait, or ``None``."""
        return self._last_request_at

    def wait(self, override: tuple[float, float] | None = None) -> float:
        """Block the current thread for a jittered duration.

        Args:
            override: Optional ``(min, max)`` pair replacing the configured
                range for this single call.

        Returns:
            The actual number of seconds slept.
        """
        pause = self._pick_duration(override)
        time.sleep(pause)
        self._record()
        return pause

    async def async_wait(self, override: tuple[float, float] | None = None) -> float:
        """Awaitable counterpart of :meth:`wait`.

        Args:
            override: Optional ``(min, max)`` pair replacing the configured
                range for this single call.

        Returns:
            The actual number of seconds slept.
        """
        pause = self._pick_duration(override)
        await asyncio.sleep(pause)
        self._record()
        return pause

    def reset(self) -> None:
        """Zero the wait counter and forget the last-request timestamp."""
        self._request_count = 0
        self._last_request_at = None

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _pick_duration(self, override: tuple[float, float] | None) -> float:
        # A provided override tuple wins over the instance defaults.
        low, high = override if override else (self.min_seconds, self.max_seconds)
        return random.uniform(low, high)

    def _record(self) -> None:
        self._request_count += 1
        self._last_request_at = datetime.now(timezone.utc)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Generic scraper registry and factory.
|
|
2
|
+
|
|
3
|
+
The registry is a simple name-to-class mapping that supports both imperative
|
|
4
|
+
registration and a decorator-based approach.
|
|
5
|
+
|
|
6
|
+
Example::
|
|
7
|
+
|
|
8
|
+
from scraper_framework import BaseScraper, ScraperResult
|
|
9
|
+
from scraper_framework.registry import ScraperRegistry
|
|
10
|
+
|
|
11
|
+
registry = ScraperRegistry()
|
|
12
|
+
|
|
13
|
+
@registry.register("my-source")
|
|
14
|
+
class MyScraper(BaseScraper):
|
|
15
|
+
source = "my-source"
|
|
16
|
+
id_prefix = "ms"
|
|
17
|
+
|
|
18
|
+
def _run(self) -> ScraperResult:
|
|
19
|
+
...
|
|
20
|
+
|
|
21
|
+
# Later
|
|
22
|
+
scraper = registry.get("my-source")
|
|
23
|
+
result = scraper.run()
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
from scraper_framework.base import BaseScraper
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ScraperRegistry:
|
|
34
|
+
"""Thread-safe registry mapping scraper names to their classes.
|
|
35
|
+
|
|
36
|
+
Supports two registration styles:
|
|
37
|
+
|
|
38
|
+
1. **Imperative** -- ``registry.add("name", MyScraperClass)``
|
|
39
|
+
2. **Decorator** -- ``@registry.register("name")``
|
|
40
|
+
|
|
41
|
+
Retrieval always returns a *new instance* via :meth:`get`.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(self) -> None:
|
|
45
|
+
self._registry: dict[str, type[BaseScraper]] = {}
|
|
46
|
+
|
|
47
|
+
# ------------------------------------------------------------------
|
|
48
|
+
# Registration
|
|
49
|
+
# ------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
def add(self, name: str, cls: type[BaseScraper]) -> None:
|
|
52
|
+
"""Register a scraper class under *name*.
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
name: Unique identifier (e.g. ``"greenhouse"``).
|
|
56
|
+
cls: A concrete subclass of :class:`BaseScraper`.
|
|
57
|
+
|
|
58
|
+
Raises:
|
|
59
|
+
TypeError: If *cls* is not a subclass of ``BaseScraper``.
|
|
60
|
+
ValueError: If *name* is already registered.
|
|
61
|
+
"""
|
|
62
|
+
if not (isinstance(cls, type) and issubclass(cls, BaseScraper)):
|
|
63
|
+
msg = f"Expected a BaseScraper subclass, got {cls!r}"
|
|
64
|
+
raise TypeError(msg)
|
|
65
|
+
if name in self._registry:
|
|
66
|
+
msg = f"Scraper '{name}' is already registered"
|
|
67
|
+
raise ValueError(msg)
|
|
68
|
+
self._registry[name] = cls
|
|
69
|
+
|
|
70
|
+
def register(self, name: str):
|
|
71
|
+
"""Decorator that registers a scraper class.
|
|
72
|
+
|
|
73
|
+
Usage::
|
|
74
|
+
|
|
75
|
+
@registry.register("my-source")
|
|
76
|
+
class MyScraper(BaseScraper):
|
|
77
|
+
...
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
def decorator[T: type[BaseScraper]](cls: T) -> T:
|
|
81
|
+
self.add(name, cls) # type: ignore[arg-type]
|
|
82
|
+
return cls
|
|
83
|
+
|
|
84
|
+
return decorator
|
|
85
|
+
|
|
86
|
+
# ------------------------------------------------------------------
|
|
87
|
+
# Retrieval
|
|
88
|
+
# ------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def get(self, name: str, *, config: dict[str, Any] | None = None) -> BaseScraper:
|
|
91
|
+
"""Instantiate and return a scraper by name.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
name: The registered scraper name.
|
|
95
|
+
config: Optional config dict passed to the scraper constructor.
|
|
96
|
+
|
|
97
|
+
Raises:
|
|
98
|
+
KeyError: If *name* is not registered.
|
|
99
|
+
"""
|
|
100
|
+
cls = self._registry.get(name)
|
|
101
|
+
if cls is None:
|
|
102
|
+
available = ", ".join(sorted(self._registry)) or "(none)"
|
|
103
|
+
msg = f"Unknown scraper: '{name}'. Available: {available}"
|
|
104
|
+
raise KeyError(msg)
|
|
105
|
+
return cls(config=config)
|
|
106
|
+
|
|
107
|
+
def list_names(self) -> list[str]:
|
|
108
|
+
"""Return a sorted list of all registered scraper names."""
|
|
109
|
+
return sorted(self._registry)
|
|
110
|
+
|
|
111
|
+
def __contains__(self, name: str) -> bool:
|
|
112
|
+
return name in self._registry
|
|
113
|
+
|
|
114
|
+
def __len__(self) -> int:
|
|
115
|
+
return len(self._registry)
|
|
116
|
+
|
|
117
|
+
def __repr__(self) -> str:
|
|
118
|
+
names = ", ".join(sorted(self._registry))
|
|
119
|
+
return f"ScraperRegistry([{names}])"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Scraper result tracking."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
|
|
8
|
+
class ScraperResult:
|
|
9
|
+
"""Accumulated result from a single scraper run.
|
|
10
|
+
|
|
11
|
+
Tracks fetch counts, insert counts, errors, and timing automatically.
|
|
12
|
+
Call ``finish()`` or rely on ``BaseScraper.run()`` to close the timer.
|
|
13
|
+
|
|
14
|
+
Example::
|
|
15
|
+
|
|
16
|
+
result = ScraperResult(source="my-source")
|
|
17
|
+
result.total_fetched += 10
|
|
18
|
+
result.new_inserted += 3
|
|
19
|
+
result.finish()
|
|
20
|
+
print(result.to_dict())
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
source: str
|
|
24
|
+
total_fetched: int = 0
|
|
25
|
+
new_inserted: int = 0
|
|
26
|
+
errors: int = 0
|
|
27
|
+
started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
|
28
|
+
finished_at: datetime | None = None
|
|
29
|
+
metadata: dict[str, object] = field(default_factory=dict)
|
|
30
|
+
|
|
31
|
+
# ------------------------------------------------------------------
|
|
32
|
+
# Lifecycle
|
|
33
|
+
# ------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
def finish(self) -> None:
|
|
36
|
+
"""Mark the run as finished with the current UTC timestamp."""
|
|
37
|
+
self.finished_at = datetime.now(timezone.utc)
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def duration_seconds(self) -> float:
|
|
41
|
+
"""Elapsed seconds between start and finish (or now if still running)."""
|
|
42
|
+
end = self.finished_at or datetime.now(timezone.utc)
|
|
43
|
+
return round((end - self.started_at).total_seconds(), 1)
|
|
44
|
+
|
|
45
|
+
# ------------------------------------------------------------------
|
|
46
|
+
# Serialisation
|
|
47
|
+
# ------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
def to_dict(self) -> dict[str, object]:
|
|
50
|
+
"""Return a JSON-safe dictionary of the result."""
|
|
51
|
+
return {
|
|
52
|
+
"source": self.source,
|
|
53
|
+
"total_fetched": self.total_fetched,
|
|
54
|
+
"new_inserted": self.new_inserted,
|
|
55
|
+
"errors": self.errors,
|
|
56
|
+
"duration_seconds": self.duration_seconds,
|
|
57
|
+
"metadata": self.metadata,
|
|
58
|
+
}
|
|
File without changes
|
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
"""Tests for the scraper framework core components."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from unittest.mock import MagicMock, patch
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
from scraper_framework import (
|
|
11
|
+
BaseScraper,
|
|
12
|
+
RateLimiter,
|
|
13
|
+
ScraperRegistry,
|
|
14
|
+
ScraperResult,
|
|
15
|
+
all_match,
|
|
16
|
+
any_match,
|
|
17
|
+
keyword_filter,
|
|
18
|
+
length_filter,
|
|
19
|
+
negate,
|
|
20
|
+
none_match,
|
|
21
|
+
regex_filter,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# =========================================================================
|
|
26
|
+
# Fixtures
|
|
27
|
+
# =========================================================================
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class StubScraper(BaseScraper):
    """Minimal concrete scraper for testing."""

    source = "stub"
    id_prefix = "st"
    rate_limit_range = (0.0, 0.0)  # No delay in tests

    def _run(self) -> ScraperResult:
        # Simulate a successful run: five records fetched, three stored.
        self.result.total_fetched = 5
        self.result.new_inserted = 3
        return self.result
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class FailingScraper(BaseScraper):
    """Scraper that raises during _run."""

    source = "failing"
    id_prefix = "fl"
    rate_limit_range = (0.0, 0.0)

    def _run(self) -> ScraperResult:
        # Always raise so tests can exercise BaseScraper's error path
        # (error counting, teardown, result finalisation).
        msg = "connection refused"
        raise ConnectionError(msg)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# =========================================================================
|
|
56
|
+
# ScraperResult
|
|
57
|
+
# =========================================================================
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class TestScraperResult:
    """Unit tests for the ScraperResult dataclass."""

    def test_defaults(self) -> None:
        """A fresh result starts with zeroed counters and no finish time."""
        result = ScraperResult(source="test")
        assert result.source == "test"
        assert result.total_fetched == 0
        assert result.new_inserted == 0
        assert result.errors == 0
        assert result.finished_at is None
        assert isinstance(result.started_at, datetime)

    def test_finish(self) -> None:
        """finish() stamps a finished_at no earlier than started_at."""
        result = ScraperResult(source="test")
        result.finish()
        assert result.finished_at is not None
        assert result.finished_at >= result.started_at

    def test_duration_seconds_running(self) -> None:
        """duration_seconds is computable while the run is still open."""
        result = ScraperResult(source="test")
        duration = result.duration_seconds
        assert duration >= 0.0

    def test_duration_seconds_finished(self) -> None:
        """duration_seconds is non-negative after finish()."""
        result = ScraperResult(source="test")
        result.finish()
        duration = result.duration_seconds
        assert duration >= 0.0

    def test_to_dict(self) -> None:
        """to_dict() reflects counters, metadata, and a float duration."""
        result = ScraperResult(source="test")
        result.total_fetched = 10
        result.new_inserted = 4
        result.errors = 1
        result.metadata["custom"] = "value"
        result.finish()

        d = result.to_dict()
        assert d["source"] == "test"
        assert d["total_fetched"] == 10
        assert d["new_inserted"] == 4
        assert d["errors"] == 1
        assert isinstance(d["duration_seconds"], float)
        assert d["metadata"] == {"custom": "value"}

    def test_metadata_isolation(self) -> None:
        """Each instance gets its own metadata dict (no shared default)."""
        r1 = ScraperResult(source="a")
        r2 = ScraperResult(source="b")
        r1.metadata["key"] = 1
        assert "key" not in r2.metadata
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# =========================================================================
|
|
111
|
+
# BaseScraper
|
|
112
|
+
# =========================================================================
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class TestBaseScraper:
    """Lifecycle, callbacks, failure handling, rate limiting, and HTTP-client
    helpers of BaseScraper, exercised through the Stub/Failing test doubles."""

    def test_run_lifecycle(self) -> None:
        """run() fetches, inserts, and finalizes the result."""
        scraper = StubScraper()
        result = scraper.run()
        assert result.total_fetched == 5
        assert result.new_inserted == 3
        assert result.finished_at is not None

    def test_run_with_config(self) -> None:
        """A config dict passed to the constructor is stored verbatim."""
        scraper = StubScraper(config={"key": "value"})
        assert scraper.config == {"key": "value"}

    def test_run_calls_setup_and_teardown(self) -> None:
        """run() invokes setup() before and teardown() after scraping."""
        scraper = StubScraper()
        scraper.setup = MagicMock()  # type: ignore[method-assign]
        scraper.teardown = MagicMock()  # type: ignore[method-assign]

        scraper.run()

        scraper.setup.assert_called_once()
        scraper.teardown.assert_called_once()

    def test_run_on_before_callback(self) -> None:
        """on_before receives the scraper instance before scraping starts."""
        scraper = StubScraper()
        callback = MagicMock()

        scraper.run(on_before=callback)

        callback.assert_called_once_with(scraper)

    def test_run_on_after_callback(self) -> None:
        """on_after receives both the scraper and the finished result."""
        scraper = StubScraper()
        callback = MagicMock()

        result = scraper.run(on_after=callback)

        callback.assert_called_once_with(scraper, result)

    def test_failing_scraper_increments_errors_and_raises(self) -> None:
        """A scrape failure is re-raised but still recorded on the result."""
        scraper = FailingScraper()
        with pytest.raises(ConnectionError, match="connection refused"):
            scraper.run()
        assert scraper.result.errors == 1
        # teardown is still called and result is finished
        assert scraper.result.finished_at is not None

    def test_teardown_called_on_failure(self) -> None:
        """teardown() runs even when scrape() raises."""
        scraper = FailingScraper()
        scraper.teardown = MagicMock()  # type: ignore[method-assign]

        with pytest.raises(ConnectionError):
            scraper.run()

        scraper.teardown.assert_called_once()

    def test_sleep_delegates_to_rate_limiter(self) -> None:
        """sleep() forwards to rate_limiter.wait with no override by default."""
        scraper = StubScraper()
        with patch.object(scraper.rate_limiter, "wait", return_value=0.0) as mock_wait:
            scraper.sleep()
        mock_wait.assert_called_once_with(override=None)

    def test_sleep_with_override(self) -> None:
        """sleep(range_override=...) passes the range through as `override`."""
        scraper = StubScraper()
        with patch.object(scraper.rate_limiter, "wait", return_value=0.0) as mock_wait:
            scraper.sleep(range_override=(1.0, 2.0))
        mock_wait.assert_called_once_with(override=(1.0, 2.0))

    def test_make_client(self) -> None:
        """make_client() defaults: 30s read timeout, redirects followed."""
        scraper = StubScraper()
        client = scraper.make_client()
        assert client.timeout.read == 30.0
        assert client.follow_redirects is True
        client.close()

    def test_make_client_custom_timeout(self) -> None:
        """An explicit timeout overrides the 30s default."""
        scraper = StubScraper()
        client = scraper.make_client(timeout=60.0)
        assert client.timeout.read == 60.0
        client.close()

    def test_make_async_client(self) -> None:
        """make_async_client() mirrors the sync client's defaults."""
        import asyncio

        scraper = StubScraper()
        client = scraper.make_async_client()
        assert client.timeout.read == 30.0
        assert client.follow_redirects is True
        # Close the async transport so the test doesn't leak the client and
        # trip a ResourceWarning; aclose() is a coroutine, so drive it with
        # a short-lived event loop.
        asyncio.run(client.aclose())

    def test_now_iso(self) -> None:
        """now_iso() produces a parseable ISO timestamp pinned to UTC."""
        ts = StubScraper.now_iso()
        parsed = datetime.fromisoformat(ts)
        assert parsed.tzinfo == timezone.utc

    def test_logger_name(self) -> None:
        """The scraper logger is namespaced (or at minimum usable)."""
        scraper = StubScraper()
        # structlog loggers don't expose .name; check the logger works instead
        try:
            assert scraper.log.name == "scraper.stub"
        except AttributeError:
            # structlog BoundLogger — verify it's functional
            assert scraper.log is not None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# =========================================================================
|
|
217
|
+
# RateLimiter
|
|
218
|
+
# =========================================================================
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class TestRateLimiter:
    """Defaults, request accounting, overrides, validation, and the async path."""

    def test_defaults(self) -> None:
        """A fresh limiter has the documented defaults and no history."""
        rl = RateLimiter()
        assert rl.min_seconds == 2.0
        assert rl.max_seconds == 5.0
        assert rl.request_count == 0
        assert rl.last_request_at is None

    def test_wait_records_count(self) -> None:
        """Each wait() bumps the counter and stamps the last-request time."""
        rl = RateLimiter(min_seconds=0.0, max_seconds=0.0)
        for _ in range(2):
            rl.wait()
        assert rl.request_count == 2
        assert rl.last_request_at is not None

    def test_wait_with_override(self) -> None:
        """An override range takes precedence over the configured bounds."""
        rl = RateLimiter(min_seconds=10.0, max_seconds=20.0)
        slept = rl.wait(override=(0.0, 0.0))
        assert slept == 0.0

    def test_reset(self) -> None:
        """reset() clears both the counter and the last-request timestamp."""
        rl = RateLimiter(min_seconds=0.0, max_seconds=0.0)
        rl.wait()
        rl.reset()
        assert rl.request_count == 0
        assert rl.last_request_at is None

    def test_invalid_min_seconds(self) -> None:
        """Negative minimums are rejected at construction time."""
        with pytest.raises(ValueError, match="min_seconds must be >= 0"):
            RateLimiter(min_seconds=-1.0)

    def test_max_less_than_min(self) -> None:
        """max_seconds below min_seconds is rejected at construction time."""
        with pytest.raises(ValueError, match="max_seconds"):
            RateLimiter(min_seconds=5.0, max_seconds=1.0)

    @pytest.mark.asyncio
    async def test_async_wait(self) -> None:
        """async_wait() behaves like wait(): returns the slept duration and counts."""
        rl = RateLimiter(min_seconds=0.0, max_seconds=0.0)
        slept = await rl.async_wait()
        assert slept == 0.0
        assert rl.request_count == 1
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# =========================================================================
|
|
265
|
+
# ScraperRegistry
|
|
266
|
+
# =========================================================================
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class TestScraperRegistry:
    """Registration, lookup, decorator use, validation, and container protocol."""

    def test_add_and_get(self) -> None:
        """add() then get() yields an instance of the registered class."""
        reg = ScraperRegistry()
        reg.add("stub", StubScraper)
        assert isinstance(reg.get("stub"), StubScraper)

    def test_get_with_config(self) -> None:
        """get(config=...) threads the config into the new instance."""
        reg = ScraperRegistry()
        reg.add("stub", StubScraper)
        instance = reg.get("stub", config={"key": "val"})
        assert instance.config == {"key": "val"}

    def test_register_decorator(self) -> None:
        """@register(name) records the decorated class under that name."""
        reg = ScraperRegistry()

        @reg.register("decorated")
        class DecoratedScraper(StubScraper):
            source = "decorated"

        assert "decorated" in reg
        assert isinstance(reg.get("decorated"), DecoratedScraper)

    def test_get_unknown_raises(self) -> None:
        """Looking up an unregistered name raises KeyError."""
        reg = ScraperRegistry()
        with pytest.raises(KeyError, match="Unknown scraper: 'nope'"):
            reg.get("nope")

    def test_duplicate_add_raises(self) -> None:
        """Registering the same name twice raises ValueError."""
        reg = ScraperRegistry()
        reg.add("stub", StubScraper)
        with pytest.raises(ValueError, match="already registered"):
            reg.add("stub", StubScraper)

    def test_add_non_scraper_raises(self) -> None:
        """Only BaseScraper subclasses may be registered."""
        reg = ScraperRegistry()
        with pytest.raises(TypeError, match="Expected a BaseScraper subclass"):
            reg.add("bad", dict)  # type: ignore[arg-type]

    def test_list_names(self) -> None:
        """list_names() returns names sorted, regardless of insertion order."""
        reg = ScraperRegistry()
        reg.add("beta", StubScraper)

        class AnotherScraper(StubScraper):
            source = "alpha"

        reg.add("alpha", AnotherScraper)
        assert reg.list_names() == ["alpha", "beta"]

    def test_contains(self) -> None:
        """`in` reports membership by registered name."""
        reg = ScraperRegistry()
        reg.add("stub", StubScraper)
        assert "stub" in reg
        assert "missing" not in reg

    def test_len(self) -> None:
        """len() tracks the number of registered scrapers."""
        reg = ScraperRegistry()
        assert len(reg) == 0
        reg.add("stub", StubScraper)
        assert len(reg) == 1

    def test_repr(self) -> None:
        """repr() lists registered names sorted inside the class name."""
        reg = ScraperRegistry()
        reg.add("b", StubScraper)

        class A(StubScraper):
            source = "a"

        reg.add("a", A)
        assert repr(reg) == "ScraperRegistry([a, b])"
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# =========================================================================
|
|
343
|
+
# Filters
|
|
344
|
+
# =========================================================================
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
class TestFilters:
    """Individual filter factories and their boolean composition helpers."""

    def test_keyword_filter_case_insensitive(self) -> None:
        """By default, keyword matching ignores case."""
        accept = keyword_filter({"senior", "staff"})
        assert accept("Senior Engineer") is True
        assert accept("STAFF developer") is True
        assert accept("Junior Developer") is False

    def test_keyword_filter_case_sensitive(self) -> None:
        """case_sensitive=True makes the match exact-case."""
        accept = keyword_filter({"Senior"}, case_sensitive=True)
        assert accept("Senior Engineer") is True
        assert accept("senior engineer") is False

    def test_regex_filter(self) -> None:
        """regex_filter respects the pattern, including word boundaries."""
        accept = regex_filter(r"\bpython\b")
        assert accept("Python Developer") is True
        assert accept("pythonic code") is False  # word boundary
        assert accept("Java Engineer") is False

    def test_length_filter(self) -> None:
        """length_filter accepts strings within [min_length, max_length]."""
        accept = length_filter(min_length=3, max_length=10)
        assert accept("ab") is False
        assert accept("abc") is True
        assert accept("0123456789") is True
        assert accept("01234567890") is False

    def test_length_filter_no_max(self) -> None:
        """Omitting max_length leaves the upper bound open."""
        accept = length_filter(min_length=1)
        assert accept("") is False
        assert accept("x" * 10000) is True

    def test_all_match(self) -> None:
        """all_match requires every sub-filter to accept."""
        accept = all_match(
            keyword_filter({"senior"}),
            regex_filter(r"\bpython\b"),
        )
        assert accept("Senior Python Developer") is True
        assert accept("Senior Java Developer") is False
        assert accept("Junior Python Developer") is False

    def test_any_match(self) -> None:
        """any_match accepts when at least one sub-filter accepts."""
        accept = any_match(
            keyword_filter({"python"}),
            keyword_filter({"rust"}),
        )
        assert accept("Python Dev") is True
        assert accept("Rust Dev") is True
        assert accept("Java Dev") is False

    def test_none_match(self) -> None:
        """none_match accepts only when no sub-filter accepts."""
        accept = none_match(keyword_filter({"intern", "junior"}))
        assert accept("Senior Engineer") is True
        assert accept("Junior Developer") is False
        assert accept("Intern position") is False

    def test_negate(self) -> None:
        """negate inverts a single filter's verdict."""
        accept = negate(keyword_filter({"intern"}))
        assert accept("Senior Engineer") is True
        assert accept("Summer Intern") is False

    def test_composable_pipeline(self) -> None:
        """Full pipeline: senior + skills, no exclusions."""
        is_senior = keyword_filter({"senior", "staff", "lead"})
        not_excluded = none_match(keyword_filter({"intern", "junior"}))
        has_tech = regex_filter(r"\b(python|typescript|react)\b")

        pipeline = all_match(is_senior, not_excluded, has_tech)

        assert pipeline("Senior Python Engineer") is True
        assert pipeline("Lead React Developer") is True
        assert pipeline("Junior Python Developer") is False
        assert pipeline("Senior Marketing Manager") is False
        assert pipeline("Staff Intern Program") is False
|