msaas-scraper-framework 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
+ node_modules/
+ dist/
+ .next/
+ .turbo/
+ *.pyc
+ __pycache__/
+ .venv/
+ *.egg-info/
+ .pytest_cache/
+ .ruff_cache/
+ .env
+ .env.local
+ .env.*.local
+ .DS_Store
+ coverage/
+
+ # Runtime artifacts
+ logs_llm/
+ vectors.db
+ vectors.db-shm
+ vectors.db-wal
@@ -0,0 +1,13 @@
+ Metadata-Version: 2.4
+ Name: msaas-scraper-framework
+ Version: 0.1.0
+ Summary: Generic web scraper framework with registry, rate limiting, and composable filters
+ Requires-Python: >=3.12
+ Requires-Dist: httpx>=0.27.0
+ Requires-Dist: msaas-api-core
+ Requires-Dist: msaas-errors
+ Provides-Extra: dev
+ Requires-Dist: httpx>=0.27.0; extra == 'dev'
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
+ Requires-Dist: pytest>=8.0; extra == 'dev'
+ Requires-Dist: ruff>=0.8; extra == 'dev'
@@ -0,0 +1,40 @@
+ [project]
+ name = "msaas-scraper-framework"
+ version = "0.1.0"
+ description = "Generic web scraper framework with registry, rate limiting, and composable filters"
+ requires-python = ">=3.12"
+ dependencies = [
+     "httpx>=0.27.0",
+     "msaas-errors",
+     "msaas-api-core",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0",
+     "pytest-asyncio>=0.24",
+     "httpx>=0.27.0",
+     "ruff>=0.8",
+ ]
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/scraper_framework"]
+
+ [tool.ruff]
+ target-version = "py312"
+ line-length = 100
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ asyncio_mode = "auto"
+
+ [tool.uv.sources]
+ msaas-api-core = { workspace = true }
+ msaas-errors = { workspace = true }
@@ -0,0 +1,52 @@
+ """scraper-framework: Generic web scraper framework with registry, rate limiting, and filters.
+
+ Usage::
+
+     from scraper_framework import BaseScraper, ScraperResult, ScraperRegistry
+
+     registry = ScraperRegistry()
+
+     @registry.register("example")
+     class ExampleScraper(BaseScraper):
+         source = "example"
+         id_prefix = "ex"
+
+         def _run(self) -> ScraperResult:
+             with self.make_client() as client:
+                 resp = client.get("https://example.com/api")
+                 self.result.total_fetched += len(resp.json())
+                 self.sleep()
+             return self.result
+
+     scraper = registry.get("example")
+     result = scraper.run()
+     print(result.to_dict())
+ """
+
+ from scraper_framework.base import BaseScraper
+ from scraper_framework.filters import (
+     all_match,
+     any_match,
+     keyword_filter,
+     length_filter,
+     negate,
+     none_match,
+     regex_filter,
+ )
+ from scraper_framework.rate_limiter import RateLimiter
+ from scraper_framework.registry import ScraperRegistry
+ from scraper_framework.result import ScraperResult
+
+ __all__ = [
+     "BaseScraper",
+     "RateLimiter",
+     "ScraperRegistry",
+     "ScraperResult",
+     "all_match",
+     "any_match",
+     "keyword_filter",
+     "length_filter",
+     "negate",
+     "none_match",
+     "regex_filter",
+ ]
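For orientation, a minimal end-to-end sketch of how the public API above composes. This is not part of the packaged sources; the scraper name, keyword sets, and placeholder titles are invented for illustration.

from scraper_framework import (
    BaseScraper,
    ScraperRegistry,
    ScraperResult,
    all_match,
    keyword_filter,
    none_match,
)

registry = ScraperRegistry()

@registry.register("demo")
class DemoScraper(BaseScraper):
    source = "demo"
    id_prefix = "dm"

    def _run(self) -> ScraperResult:
        # Placeholder titles instead of a live HTTP call.
        titles = ["Senior Python Engineer", "Marketing Intern"]
        relevant = all_match(
            keyword_filter({"senior", "staff"}),
            none_match(keyword_filter({"intern"})),
        )
        self.result.total_fetched = len(titles)
        self.result.new_inserted = sum(1 for t in titles if relevant(t))
        return self.result

print(registry.get("demo").run().to_dict())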
@@ -0,0 +1,169 @@
+ """Generic base scraper class.
+
+ Provides the abstract skeleton that concrete scrapers implement. Handles
+ lifecycle (setup, run, teardown), HTTP client creation, rate limiting, and
+ result tracking.
+
+ Example subclass::
+
+     from scraper_framework import BaseScraper, ScraperResult
+
+     class MyScraper(BaseScraper):
+         source = "my-source"
+         id_prefix = "ms"
+
+         def _run(self) -> ScraperResult:
+             with self.make_client() as client:
+                 resp = client.get("https://example.com/api/items")
+                 items = resp.json()
+                 self.result.total_fetched += len(items)
+                 # ... process items ...
+                 self.sleep()
+             return self.result
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable
+ from datetime import datetime, timezone
+ from typing import Any
+
+ import httpx
+
+ from scraper_framework.rate_limiter import RateLimiter
+ from scraper_framework.result import ScraperResult
+
+
+ class BaseScraper(ABC):
+     """Abstract base class for all scrapers.
+
+     Subclasses **must** set ``source`` and implement :meth:`_run`.
+
+     Class attributes:
+         source: Identifier for this scraper (e.g. ``"greenhouse"``).
+         id_prefix: Short prefix for generated record IDs (e.g. ``"gh"``).
+         rate_limit_range: Default ``(min, max)`` seconds between requests.
+     """
+
+     source: str = ""
+     id_prefix: str = ""
+     rate_limit_range: tuple[float, float] = (2.0, 5.0)
+
+     def __init__(self, *, config: dict[str, Any] | None = None) -> None:
+         self.config: dict[str, Any] = config or {}
+         self.result = ScraperResult(source=self.source)
+         self.rate_limiter = RateLimiter(
+             min_seconds=self.rate_limit_range[0],
+             max_seconds=self.rate_limit_range[1],
+         )
+         try:
+             import structlog
+             self.log = structlog.get_logger(f"scraper.{self.source}").bind(source=self.source)
+         except ImportError:
+             self.log = logging.getLogger(f"scraper.{self.source}")
+
+     # ------------------------------------------------------------------
+     # Lifecycle hooks (override in subclasses if needed)
+     # ------------------------------------------------------------------
+
+     def setup(self) -> None:
+         """Called before :meth:`_run`. Override for one-time initialisation."""
+
+     def teardown(self) -> None:
+         """Called after :meth:`_run` completes (even on error). Override for cleanup."""
+
+     # ------------------------------------------------------------------
+     # Abstract
+     # ------------------------------------------------------------------
+
+     @abstractmethod
+     def _run(self) -> ScraperResult:
+         """Execute the scraping logic.
+
+         Must populate ``self.result`` and return it. Implementations should
+         call ``self.sleep()`` between HTTP requests to respect rate limits.
+         """
+         ...
+
+     # ------------------------------------------------------------------
+     # Public entry point
+     # ------------------------------------------------------------------
+
+     def run(
+         self,
+         *,
+         on_before: Callable[["BaseScraper"], None] | None = None,
+         on_after: Callable[["BaseScraper", ScraperResult], None] | None = None,
+     ) -> ScraperResult:
+         """Public entry point: setup, run scraper, teardown, finalize result.
+
+         Args:
+             on_before: Optional callback invoked after :meth:`setup` but
+                 before :meth:`_run`.
+             on_after: Optional callback invoked after :meth:`_run` with the
+                 result.
+
+         Returns:
+             The populated :class:`ScraperResult`.
+         """
+         self.setup()
+         if on_before:
+             on_before(self)
+         try:
+             self._run()
+         except Exception:
+             self.result.errors += 1
+             self.log.exception("Scraper %s failed", self.source)
+             raise
+         finally:
+             self.teardown()
+             self.result.finish()
+             if on_after:
+                 on_after(self, self.result)
+         return self.result
+
+     # ------------------------------------------------------------------
+     # Utilities
+     # ------------------------------------------------------------------
+
+     def sleep(self, range_override: tuple[float, float] | None = None) -> float:
+         """Rate-limiting sleep between requests.
+
+         Args:
+             range_override: Optional ``(min, max)`` seconds to override the
+                 default rate limit range.
+
+         Returns:
+             Actual seconds slept.
+         """
+         return self.rate_limiter.wait(override=range_override)
+
+     def make_client(self, **kwargs: Any) -> httpx.Client:
+         """Create an :class:`httpx.Client` with sensible defaults.
+
+         Defaults:
+         - ``timeout``: 30 seconds
+         - ``follow_redirects``: True
+
+         All keyword arguments are forwarded to ``httpx.Client``, overriding
+         the defaults.
+         """
+         defaults: dict[str, Any] = {"timeout": 30.0, "follow_redirects": True}
+         defaults.update(kwargs)
+         return httpx.Client(**defaults)
+
+     def make_async_client(self, **kwargs: Any) -> httpx.AsyncClient:
+         """Create an :class:`httpx.AsyncClient` with sensible defaults.
+
+         Same defaults as :meth:`make_client`.
+         """
+         defaults: dict[str, Any] = {"timeout": 30.0, "follow_redirects": True}
+         defaults.update(kwargs)
+         return httpx.AsyncClient(**defaults)
+
+     @staticmethod
+     def now_iso() -> str:
+         """Current UTC timestamp in ISO-8601 format."""
+         return datetime.now(timezone.utc).isoformat()
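As a reading aid, a hedged sketch of how the run() lifecycle and callbacks above are typically wired together. The endpoint is the placeholder URL from the module docstring, and the base_url config key is invented for this sketch.

from scraper_framework import BaseScraper, ScraperResult

class ItemsScraper(BaseScraper):
    source = "items"
    id_prefix = "it"
    rate_limit_range = (0.5, 1.5)  # shorter jitter window than the (2.0, 5.0) default

    def _run(self) -> ScraperResult:
        # base_url is a made-up config key, not something the framework defines
        base_url = self.config.get("base_url", "https://example.com/api/items")
        with self.make_client() as client:
            resp = client.get(base_url)
            resp.raise_for_status()
            self.result.total_fetched += len(resp.json())
            self.sleep()  # jittered pause before any follow-up request
        return self.result

scraper = ItemsScraper(config={"base_url": "https://example.com/api/items"})
result = scraper.run(
    on_before=lambda s: print(f"starting {s.source}"),
    on_after=lambda s, r: print(r.to_dict()),
)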
@@ -0,0 +1,135 @@
+ """Composable filter functions for scraper pipelines.
+
+ Filters are plain functions with the signature ``(text: str) -> bool`` so they
+ can be combined freely with :func:`all_match`, :func:`any_match`, and
+ :func:`none_match`.
+
+ Example::
+
+     from scraper_framework.filters import (
+         regex_filter, keyword_filter, all_match, none_match,
+     )
+
+     is_senior = keyword_filter({"senior", "staff", "lead"})
+     not_excluded = none_match(keyword_filter({"intern", "junior"}))
+     has_python = regex_filter(r"\\bpython\\b")
+
+     title = "Senior Python Engineer"
+     if is_senior(title) and not_excluded(title) and has_python(title):
+         print("Relevant!")
+ """
+
+ from __future__ import annotations
+
+ import re
+ from collections.abc import Callable
+
+ type TextFilter = Callable[[str], bool]
+
+
+ # ---------------------------------------------------------------------------
+ # Primitive filter builders
+ # ---------------------------------------------------------------------------
+
+
+ def keyword_filter(
+     keywords: set[str],
+     *,
+     case_sensitive: bool = False,
+ ) -> TextFilter:
+     """Return a filter that matches if *any* keyword appears in the text.
+
+     Keywords are matched as substrings. For word-boundary matching use
+     :func:`regex_filter` instead.
+
+     Args:
+         keywords: Set of keywords to look for.
+         case_sensitive: Whether matching is case-sensitive.
+     """
+     if not case_sensitive:
+         normalised = {k.lower() for k in keywords}
+
+         def _match(text: str) -> bool:
+             lowered = text.lower()
+             return any(kw in lowered for kw in normalised)
+     else:
+
+         def _match(text: str) -> bool:
+             return any(kw in text for kw in keywords)
+
+     return _match
+
+
+ def regex_filter(pattern: str, *, flags: int = re.IGNORECASE) -> TextFilter:
+     """Return a filter that matches if the compiled regex finds a match.
+
+     Args:
+         pattern: A regular expression string.
+         flags: Regex flags (default: ``re.IGNORECASE``).
+     """
+     compiled = re.compile(pattern, flags)
+
+     def _match(text: str) -> bool:
+         return compiled.search(text) is not None
+
+     return _match
+
+
+ def length_filter(*, min_length: int = 0, max_length: int | None = None) -> TextFilter:
+     """Return a filter that checks text length.
+
+     Args:
+         min_length: Minimum number of characters.
+         max_length: Maximum number of characters (``None`` for unlimited).
+     """
+
+     def _match(text: str) -> bool:
+         n = len(text)
+         if n < min_length:
+             return False
+         if max_length is not None and n > max_length:
+             return False
+         return True
+
+     return _match
+
+
+ # ---------------------------------------------------------------------------
+ # Combinators
+ # ---------------------------------------------------------------------------
+
+
+ def all_match(*filters: TextFilter) -> TextFilter:
+     """Return a filter that passes only if **all** sub-filters pass."""
+
+     def _match(text: str) -> bool:
+         return all(f(text) for f in filters)
+
+     return _match
+
+
+ def any_match(*filters: TextFilter) -> TextFilter:
+     """Return a filter that passes if **any** sub-filter passes."""
+
+     def _match(text: str) -> bool:
+         return any(f(text) for f in filters)
+
+     return _match
+
+
+ def none_match(*filters: TextFilter) -> TextFilter:
+     """Return a filter that passes only if **no** sub-filter passes."""
+
+     def _match(text: str) -> bool:
+         return not any(f(text) for f in filters)
+
+     return _match
+
+
+ def negate(f: TextFilter) -> TextFilter:
+     """Return a filter that inverts the result of *f*."""
+
+     def _match(text: str) -> bool:
+         return not f(text)
+
+     return _match
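For context, a small sketch of how these builders and combinators compose into one reusable predicate. The keyword sets, length bounds, and sample titles are arbitrary examples, not defaults shipped by the package.

from scraper_framework.filters import (
    all_match,
    keyword_filter,
    length_filter,
    negate,
    none_match,
    regex_filter,
)

# Reject junk titles, require seniority plus a matching technology,
# and exclude intern/junior postings.
relevant = all_match(
    length_filter(min_length=5, max_length=120),
    keyword_filter({"senior", "staff", "lead"}),
    regex_filter(r"\b(python|rust|go)\b"),
    none_match(keyword_filter({"intern", "junior"})),
)

for title in ["Senior Python Engineer", "Junior Go Developer", "Lead Rust Dev"]:
    print(title, relevant(title))

# negate() is the single-filter counterpart of none_match():
assert negate(keyword_filter({"intern"}))("Staff Engineer") is True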
@@ -0,0 +1,110 @@
+ """Rate limiting utilities for scrapers."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ import time
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+
+
+ @dataclass
+ class RateLimiter:
+     """Configurable rate limiter with jittered sleep.
+
+     Supports both synchronous and asynchronous usage. The sleep duration is
+     uniformly sampled from ``[min_seconds, max_seconds]`` to avoid
+     thundering-herd patterns.
+
+     Example::
+
+         limiter = RateLimiter(min_seconds=1.0, max_seconds=3.0)
+         for url in urls:
+             fetch(url)
+             limiter.wait()
+
+     Async usage::
+
+         limiter = RateLimiter(min_seconds=0.5, max_seconds=1.5)
+         for url in urls:
+             await fetch(url)
+             await limiter.async_wait()
+     """
+
+     min_seconds: float = 2.0
+     max_seconds: float = 5.0
+     _request_count: int = field(default=0, init=False, repr=False)
+     _last_request_at: datetime | None = field(default=None, init=False, repr=False)
+
+     def __post_init__(self) -> None:
+         if self.min_seconds < 0:
+             msg = f"min_seconds must be >= 0, got {self.min_seconds}"
+             raise ValueError(msg)
+         if self.max_seconds < self.min_seconds:
+             msg = (
+                 f"max_seconds ({self.max_seconds}) must be >= "
+                 f"min_seconds ({self.min_seconds})"
+             )
+             raise ValueError(msg)
+
+     # ------------------------------------------------------------------
+     # Public API
+     # ------------------------------------------------------------------
+
+     @property
+     def request_count(self) -> int:
+         """Number of waits performed."""
+         return self._request_count
+
+     @property
+     def last_request_at(self) -> datetime | None:
+         """Timestamp of the most recent wait."""
+         return self._last_request_at
+
+     def wait(self, override: tuple[float, float] | None = None) -> float:
+         """Block the current thread for a jittered duration.
+
+         Args:
+             override: Optional ``(min, max)`` tuple to use instead of the
+                 instance defaults.
+
+         Returns:
+             The actual number of seconds slept.
+         """
+         duration = self._pick_duration(override)
+         time.sleep(duration)
+         self._record()
+         return duration
+
+     async def async_wait(self, override: tuple[float, float] | None = None) -> float:
+         """Async version of :meth:`wait`.
+
+         Args:
+             override: Optional ``(min, max)`` tuple to use instead of the
+                 instance defaults.
+
+         Returns:
+             The actual number of seconds slept.
+         """
+         duration = self._pick_duration(override)
+         await asyncio.sleep(duration)
+         self._record()
+         return duration
+
+     def reset(self) -> None:
+         """Reset the internal counters."""
+         self._request_count = 0
+         self._last_request_at = None
+
+     # ------------------------------------------------------------------
+     # Internals
+     # ------------------------------------------------------------------
+
+     def _pick_duration(self, override: tuple[float, float] | None) -> float:
+         lo, hi = override or (self.min_seconds, self.max_seconds)
+         return random.uniform(lo, hi)
+
+     def _record(self) -> None:
+         self._request_count += 1
+         self._last_request_at = datetime.now(timezone.utc)
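A brief usage sketch of the synchronous and asynchronous paths above. The "fetching" prints are stand-ins for real requests, and the short ranges are chosen only so the sketch finishes quickly.

import asyncio

from scraper_framework.rate_limiter import RateLimiter

limiter = RateLimiter(min_seconds=0.1, max_seconds=0.3)

# Synchronous: block between (placeholder) requests.
for page in range(3):
    print(f"fetching page {page}")  # stand-in for a real HTTP call
    slept = limiter.wait()
    print(f"slept {slept:.2f}s, total waits: {limiter.request_count}")

async def crawl() -> None:
    # The async path shares the same counters and jitter behaviour.
    async_limiter = RateLimiter(min_seconds=0.05, max_seconds=0.1)
    for page in range(3):
        print(f"async fetching page {page}")
        await async_limiter.async_wait()
    print(async_limiter.last_request_at)

asyncio.run(crawl())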
@@ -0,0 +1,119 @@
+ """Generic scraper registry and factory.
+
+ The registry is a simple name-to-class mapping that supports both imperative
+ registration and a decorator-based approach.
+
+ Example::
+
+     from scraper_framework import BaseScraper, ScraperResult
+     from scraper_framework.registry import ScraperRegistry
+
+     registry = ScraperRegistry()
+
+     @registry.register("my-source")
+     class MyScraper(BaseScraper):
+         source = "my-source"
+         id_prefix = "ms"
+
+         def _run(self) -> ScraperResult:
+             ...
+
+     # Later
+     scraper = registry.get("my-source")
+     result = scraper.run()
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from scraper_framework.base import BaseScraper
+
+
+ class ScraperRegistry:
+     """Registry mapping scraper names to their classes.
+
+     Supports two registration styles:
+
+     1. **Imperative** -- ``registry.add("name", MyScraperClass)``
+     2. **Decorator** -- ``@registry.register("name")``
+
+     Retrieval always returns a *new instance* via :meth:`get`.
+     """
+
+     def __init__(self) -> None:
+         self._registry: dict[str, type[BaseScraper]] = {}
+
+     # ------------------------------------------------------------------
+     # Registration
+     # ------------------------------------------------------------------
+
+     def add(self, name: str, cls: type[BaseScraper]) -> None:
+         """Register a scraper class under *name*.
+
+         Args:
+             name: Unique identifier (e.g. ``"greenhouse"``).
+             cls: A concrete subclass of :class:`BaseScraper`.
+
+         Raises:
+             TypeError: If *cls* is not a subclass of ``BaseScraper``.
+             ValueError: If *name* is already registered.
+         """
+         if not (isinstance(cls, type) and issubclass(cls, BaseScraper)):
+             msg = f"Expected a BaseScraper subclass, got {cls!r}"
+             raise TypeError(msg)
+         if name in self._registry:
+             msg = f"Scraper '{name}' is already registered"
+             raise ValueError(msg)
+         self._registry[name] = cls
+
+     def register(self, name: str):
+         """Decorator that registers a scraper class.
+
+         Usage::
+
+             @registry.register("my-source")
+             class MyScraper(BaseScraper):
+                 ...
+         """
+
+         def decorator[T: type[BaseScraper]](cls: T) -> T:
+             self.add(name, cls)  # type: ignore[arg-type]
+             return cls
+
+         return decorator
+
+     # ------------------------------------------------------------------
+     # Retrieval
+     # ------------------------------------------------------------------
+
+     def get(self, name: str, *, config: dict[str, Any] | None = None) -> BaseScraper:
+         """Instantiate and return a scraper by name.
+
+         Args:
+             name: The registered scraper name.
+             config: Optional config dict passed to the scraper constructor.
+
+         Raises:
+             KeyError: If *name* is not registered.
+         """
+         cls = self._registry.get(name)
+         if cls is None:
+             available = ", ".join(sorted(self._registry)) or "(none)"
+             msg = f"Unknown scraper: '{name}'. Available: {available}"
+             raise KeyError(msg)
+         return cls(config=config)
+
+     def list_names(self) -> list[str]:
+         """Return a sorted list of all registered scraper names."""
+         return sorted(self._registry)
+
+     def __contains__(self, name: str) -> bool:
+         return name in self._registry
+
+     def __len__(self) -> int:
+         return len(self._registry)
+
+     def __repr__(self) -> str:
+         names = ", ".join(sorted(self._registry))
+         return f"ScraperRegistry([{names}])"
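To illustrate the two registration styles and the error paths above, a short hedged sketch; the scraper names and the limit config key are invented for this example.

from scraper_framework import BaseScraper, ScraperRegistry, ScraperResult

registry = ScraperRegistry()

@registry.register("boards")                 # decorator style
class BoardsScraper(BaseScraper):
    source = "boards"
    id_prefix = "bd"

    def _run(self) -> ScraperResult:
        self.result.total_fetched = int(self.config.get("limit", 0))
        return self.result

class FeedsScraper(BoardsScraper):
    source = "feeds"

registry.add("feeds", FeedsScraper)          # imperative style

print(registry.list_names())                 # ['boards', 'feeds']
print("boards" in registry, len(registry))   # True 2

scraper = registry.get("boards", config={"limit": 10})
print(scraper.run().total_fetched)           # 10

try:
    registry.get("missing")
except KeyError as exc:
    print(exc)                               # message lists the available names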
@@ -0,0 +1,58 @@
+ """Scraper result tracking."""
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+
+
+ @dataclass
+ class ScraperResult:
+     """Accumulated result from a single scraper run.
+
+     Tracks fetch counts, insert counts, errors, and timing automatically.
+     Call ``finish()`` or rely on ``BaseScraper.run()`` to close the timer.
+
+     Example::
+
+         result = ScraperResult(source="my-source")
+         result.total_fetched += 10
+         result.new_inserted += 3
+         result.finish()
+         print(result.to_dict())
+     """
+
+     source: str
+     total_fetched: int = 0
+     new_inserted: int = 0
+     errors: int = 0
+     started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     finished_at: datetime | None = None
+     metadata: dict[str, object] = field(default_factory=dict)
+
+     # ------------------------------------------------------------------
+     # Lifecycle
+     # ------------------------------------------------------------------
+
+     def finish(self) -> None:
+         """Mark the run as finished with the current UTC timestamp."""
+         self.finished_at = datetime.now(timezone.utc)
+
+     @property
+     def duration_seconds(self) -> float:
+         """Elapsed seconds between start and finish (or now if still running)."""
+         end = self.finished_at or datetime.now(timezone.utc)
+         return round((end - self.started_at).total_seconds(), 1)
+
+     # ------------------------------------------------------------------
+     # Serialisation
+     # ------------------------------------------------------------------
+
+     def to_dict(self) -> dict[str, object]:
+         """Return a JSON-safe dictionary of the result."""
+         return {
+             "source": self.source,
+             "total_fetched": self.total_fetched,
+             "new_inserted": self.new_inserted,
+             "errors": self.errors,
+             "duration_seconds": self.duration_seconds,
+             "metadata": self.metadata,
+         }
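A small sketch of how the counters and serialisation above behave in isolation; the field values are arbitrary.

import json

from scraper_framework.result import ScraperResult

result = ScraperResult(source="demo")
result.total_fetched += 25
result.new_inserted += 7
result.errors += 1
result.metadata["pages"] = 3          # free-form, JSON-safe extras

print(result.duration_seconds)        # still running: measured against "now"
result.finish()
print(result.duration_seconds)        # measured against finish time, rounded to 0.1s

print(json.dumps(result.to_dict()))   # started_at/finished_at are not included in to_dict()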
File without changes
@@ -0,0 +1,418 @@
+ """Tests for the scraper framework core components."""
+
+ from __future__ import annotations
+
+ from datetime import datetime, timezone
+ from unittest.mock import MagicMock, patch
+
+ import pytest
+
+ from scraper_framework import (
+     BaseScraper,
+     RateLimiter,
+     ScraperRegistry,
+     ScraperResult,
+     all_match,
+     any_match,
+     keyword_filter,
+     length_filter,
+     negate,
+     none_match,
+     regex_filter,
+ )
+
+
+ # =========================================================================
+ # Fixtures
+ # =========================================================================
+
+
+ class StubScraper(BaseScraper):
+     """Minimal concrete scraper for testing."""
+
+     source = "stub"
+     id_prefix = "st"
+     rate_limit_range = (0.0, 0.0)  # No delay in tests
+
+     def _run(self) -> ScraperResult:
+         self.result.total_fetched = 5
+         self.result.new_inserted = 3
+         return self.result
+
+
+ class FailingScraper(BaseScraper):
+     """Scraper that raises during _run."""
+
+     source = "failing"
+     id_prefix = "fl"
+     rate_limit_range = (0.0, 0.0)
+
+     def _run(self) -> ScraperResult:
+         msg = "connection refused"
+         raise ConnectionError(msg)
+
+
+ # =========================================================================
+ # ScraperResult
+ # =========================================================================
+
+
+ class TestScraperResult:
+     def test_defaults(self) -> None:
+         result = ScraperResult(source="test")
+         assert result.source == "test"
+         assert result.total_fetched == 0
+         assert result.new_inserted == 0
+         assert result.errors == 0
+         assert result.finished_at is None
+         assert isinstance(result.started_at, datetime)
+
+     def test_finish(self) -> None:
+         result = ScraperResult(source="test")
+         result.finish()
+         assert result.finished_at is not None
+         assert result.finished_at >= result.started_at
+
+     def test_duration_seconds_running(self) -> None:
+         result = ScraperResult(source="test")
+         duration = result.duration_seconds
+         assert duration >= 0.0
+
+     def test_duration_seconds_finished(self) -> None:
+         result = ScraperResult(source="test")
+         result.finish()
+         duration = result.duration_seconds
+         assert duration >= 0.0
+
+     def test_to_dict(self) -> None:
+         result = ScraperResult(source="test")
+         result.total_fetched = 10
+         result.new_inserted = 4
+         result.errors = 1
+         result.metadata["custom"] = "value"
+         result.finish()
+
+         d = result.to_dict()
+         assert d["source"] == "test"
+         assert d["total_fetched"] == 10
+         assert d["new_inserted"] == 4
+         assert d["errors"] == 1
+         assert isinstance(d["duration_seconds"], float)
+         assert d["metadata"] == {"custom": "value"}
+
+     def test_metadata_isolation(self) -> None:
+         r1 = ScraperResult(source="a")
+         r2 = ScraperResult(source="b")
+         r1.metadata["key"] = 1
+         assert "key" not in r2.metadata
+
+
+ # =========================================================================
+ # BaseScraper
+ # =========================================================================
+
+
+ class TestBaseScraper:
+     def test_run_lifecycle(self) -> None:
+         scraper = StubScraper()
+         result = scraper.run()
+         assert result.total_fetched == 5
+         assert result.new_inserted == 3
+         assert result.finished_at is not None
+
+     def test_run_with_config(self) -> None:
+         scraper = StubScraper(config={"key": "value"})
+         assert scraper.config == {"key": "value"}
+
+     def test_run_calls_setup_and_teardown(self) -> None:
+         scraper = StubScraper()
+         scraper.setup = MagicMock()  # type: ignore[method-assign]
+         scraper.teardown = MagicMock()  # type: ignore[method-assign]
+
+         scraper.run()
+
+         scraper.setup.assert_called_once()
+         scraper.teardown.assert_called_once()
+
+     def test_run_on_before_callback(self) -> None:
+         scraper = StubScraper()
+         callback = MagicMock()
+
+         scraper.run(on_before=callback)
+
+         callback.assert_called_once_with(scraper)
+
+     def test_run_on_after_callback(self) -> None:
+         scraper = StubScraper()
+         callback = MagicMock()
+
+         result = scraper.run(on_after=callback)
+
+         callback.assert_called_once_with(scraper, result)
+
+     def test_failing_scraper_increments_errors_and_raises(self) -> None:
+         scraper = FailingScraper()
+         with pytest.raises(ConnectionError, match="connection refused"):
+             scraper.run()
+         assert scraper.result.errors == 1
+         # teardown is still called and result is finished
+         assert scraper.result.finished_at is not None
+
+     def test_teardown_called_on_failure(self) -> None:
+         scraper = FailingScraper()
+         scraper.teardown = MagicMock()  # type: ignore[method-assign]
+
+         with pytest.raises(ConnectionError):
+             scraper.run()
+
+         scraper.teardown.assert_called_once()
+
+     def test_sleep_delegates_to_rate_limiter(self) -> None:
+         scraper = StubScraper()
+         with patch.object(scraper.rate_limiter, "wait", return_value=0.0) as mock_wait:
+             scraper.sleep()
+         mock_wait.assert_called_once_with(override=None)
+
+     def test_sleep_with_override(self) -> None:
+         scraper = StubScraper()
+         with patch.object(scraper.rate_limiter, "wait", return_value=0.0) as mock_wait:
+             scraper.sleep(range_override=(1.0, 2.0))
+         mock_wait.assert_called_once_with(override=(1.0, 2.0))
+
+     def test_make_client(self) -> None:
+         scraper = StubScraper()
+         client = scraper.make_client()
+         assert client.timeout.read == 30.0
+         assert client.follow_redirects is True
+         client.close()
+
+     def test_make_client_custom_timeout(self) -> None:
+         scraper = StubScraper()
+         client = scraper.make_client(timeout=60.0)
+         assert client.timeout.read == 60.0
+         client.close()
+
+     def test_make_async_client(self) -> None:
+         scraper = StubScraper()
+         client = scraper.make_async_client()
+         assert client.timeout.read == 30.0
+         assert client.follow_redirects is True
+
+     def test_now_iso(self) -> None:
+         ts = StubScraper.now_iso()
+         parsed = datetime.fromisoformat(ts)
+         assert parsed.tzinfo == timezone.utc
+
+     def test_logger_name(self) -> None:
+         scraper = StubScraper()
+         # structlog loggers don't expose .name; check the logger works instead
+         try:
+             assert scraper.log.name == "scraper.stub"
+         except AttributeError:
+             # structlog BoundLogger -- verify it's functional
+             assert scraper.log is not None
+
+
+ # =========================================================================
+ # RateLimiter
+ # =========================================================================
+
+
+ class TestRateLimiter:
+     def test_defaults(self) -> None:
+         limiter = RateLimiter()
+         assert limiter.min_seconds == 2.0
+         assert limiter.max_seconds == 5.0
+         assert limiter.request_count == 0
+         assert limiter.last_request_at is None
+
+     def test_wait_records_count(self) -> None:
+         limiter = RateLimiter(min_seconds=0.0, max_seconds=0.0)
+         limiter.wait()
+         limiter.wait()
+         assert limiter.request_count == 2
+         assert limiter.last_request_at is not None
+
+     def test_wait_with_override(self) -> None:
+         limiter = RateLimiter(min_seconds=10.0, max_seconds=20.0)
+         duration = limiter.wait(override=(0.0, 0.0))
+         assert duration == 0.0
+
+     def test_reset(self) -> None:
+         limiter = RateLimiter(min_seconds=0.0, max_seconds=0.0)
+         limiter.wait()
+         limiter.reset()
+         assert limiter.request_count == 0
+         assert limiter.last_request_at is None
+
+     def test_invalid_min_seconds(self) -> None:
+         with pytest.raises(ValueError, match="min_seconds must be >= 0"):
+             RateLimiter(min_seconds=-1.0)
+
+     def test_max_less_than_min(self) -> None:
+         with pytest.raises(ValueError, match="max_seconds"):
+             RateLimiter(min_seconds=5.0, max_seconds=1.0)
+
+     @pytest.mark.asyncio
+     async def test_async_wait(self) -> None:
+         limiter = RateLimiter(min_seconds=0.0, max_seconds=0.0)
+         duration = await limiter.async_wait()
+         assert duration == 0.0
+         assert limiter.request_count == 1
+
+
+ # =========================================================================
+ # ScraperRegistry
+ # =========================================================================
+
+
+ class TestScraperRegistry:
+     def test_add_and_get(self) -> None:
+         registry = ScraperRegistry()
+         registry.add("stub", StubScraper)
+         scraper = registry.get("stub")
+         assert isinstance(scraper, StubScraper)
+
+     def test_get_with_config(self) -> None:
+         registry = ScraperRegistry()
+         registry.add("stub", StubScraper)
+         scraper = registry.get("stub", config={"key": "val"})
+         assert scraper.config == {"key": "val"}
+
+     def test_register_decorator(self) -> None:
+         registry = ScraperRegistry()
+
+         @registry.register("decorated")
+         class DecoratedScraper(StubScraper):
+             source = "decorated"
+
+         assert "decorated" in registry
+         scraper = registry.get("decorated")
+         assert isinstance(scraper, DecoratedScraper)
+
+     def test_get_unknown_raises(self) -> None:
+         registry = ScraperRegistry()
+         with pytest.raises(KeyError, match="Unknown scraper: 'nope'"):
+             registry.get("nope")
+
+     def test_duplicate_add_raises(self) -> None:
+         registry = ScraperRegistry()
+         registry.add("stub", StubScraper)
+         with pytest.raises(ValueError, match="already registered"):
+             registry.add("stub", StubScraper)
+
+     def test_add_non_scraper_raises(self) -> None:
+         registry = ScraperRegistry()
+         with pytest.raises(TypeError, match="Expected a BaseScraper subclass"):
+             registry.add("bad", dict)  # type: ignore[arg-type]
+
+     def test_list_names(self) -> None:
+         registry = ScraperRegistry()
+         registry.add("beta", StubScraper)
+
+         class AnotherScraper(StubScraper):
+             source = "alpha"
+
+         registry.add("alpha", AnotherScraper)
+         assert registry.list_names() == ["alpha", "beta"]
+
+     def test_contains(self) -> None:
+         registry = ScraperRegistry()
+         registry.add("stub", StubScraper)
+         assert "stub" in registry
+         assert "missing" not in registry
+
+     def test_len(self) -> None:
+         registry = ScraperRegistry()
+         assert len(registry) == 0
+         registry.add("stub", StubScraper)
+         assert len(registry) == 1
+
+     def test_repr(self) -> None:
+         registry = ScraperRegistry()
+         registry.add("b", StubScraper)
+
+         class A(StubScraper):
+             source = "a"
+
+         registry.add("a", A)
+         assert repr(registry) == "ScraperRegistry([a, b])"
+
+
+ # =========================================================================
+ # Filters
+ # =========================================================================
+
+
+ class TestFilters:
+     def test_keyword_filter_case_insensitive(self) -> None:
+         f = keyword_filter({"senior", "staff"})
+         assert f("Senior Engineer") is True
+         assert f("STAFF developer") is True
+         assert f("Junior Developer") is False
+
+     def test_keyword_filter_case_sensitive(self) -> None:
+         f = keyword_filter({"Senior"}, case_sensitive=True)
+         assert f("Senior Engineer") is True
+         assert f("senior engineer") is False
+
+     def test_regex_filter(self) -> None:
+         f = regex_filter(r"\bpython\b")
+         assert f("Python Developer") is True
+         assert f("pythonic code") is False  # word boundary
+         assert f("Java Engineer") is False
+
+     def test_length_filter(self) -> None:
+         f = length_filter(min_length=3, max_length=10)
+         assert f("ab") is False
+         assert f("abc") is True
+         assert f("0123456789") is True
+         assert f("01234567890") is False
+
+     def test_length_filter_no_max(self) -> None:
+         f = length_filter(min_length=1)
+         assert f("") is False
+         assert f("x" * 10000) is True
+
+     def test_all_match(self) -> None:
+         f = all_match(
+             keyword_filter({"senior"}),
+             regex_filter(r"\bpython\b"),
+         )
+         assert f("Senior Python Developer") is True
+         assert f("Senior Java Developer") is False
+         assert f("Junior Python Developer") is False
+
+     def test_any_match(self) -> None:
+         f = any_match(
+             keyword_filter({"python"}),
+             keyword_filter({"rust"}),
+         )
+         assert f("Python Dev") is True
+         assert f("Rust Dev") is True
+         assert f("Java Dev") is False
+
+     def test_none_match(self) -> None:
+         f = none_match(keyword_filter({"intern", "junior"}))
+         assert f("Senior Engineer") is True
+         assert f("Junior Developer") is False
+         assert f("Intern position") is False
+
+     def test_negate(self) -> None:
+         f = negate(keyword_filter({"intern"}))
+         assert f("Senior Engineer") is True
+         assert f("Summer Intern") is False
+
+     def test_composable_pipeline(self) -> None:
+         """Full pipeline: senior + skills, no exclusions."""
+         is_senior = keyword_filter({"senior", "staff", "lead"})
+         not_excluded = none_match(keyword_filter({"intern", "junior"}))
+         has_tech = regex_filter(r"\b(python|typescript|react)\b")
+
+         pipeline = all_match(is_senior, not_excluded, has_tech)
+
+         assert pipeline("Senior Python Engineer") is True
+         assert pipeline("Lead React Developer") is True
+         assert pipeline("Junior Python Developer") is False
+         assert pipeline("Senior Marketing Manager") is False
+         assert pipeline("Staff Intern Program") is False