msaas-scraper-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: msaas-scraper-framework
3
+ Version: 0.1.0
4
+ Summary: Generic web scraper framework with registry, rate limiting, and composable filters
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: httpx>=0.27.0
7
+ Requires-Dist: msaas-api-core
8
+ Requires-Dist: msaas-errors
9
+ Provides-Extra: dev
10
+ Requires-Dist: httpx>=0.27.0; extra == 'dev'
11
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
12
+ Requires-Dist: pytest>=8.0; extra == 'dev'
13
+ Requires-Dist: ruff>=0.8; extra == 'dev'
@@ -0,0 +1,9 @@
1
+ scraper_framework/__init__.py,sha256=fss4om1vREfIvsDBcmdBqKWdiGbikInQ_xTEho68tEE,1305
2
+ scraper_framework/base.py,sha256=fBZjgd8px7blay4-NgQyyrYvTSqpBCZ3IBeyLdU8gJ8,5672
3
+ scraper_framework/filters.py,sha256=pynhfV0Tvc_2-hSNTEjSBg0gxqvh7HfwejJVDtEjbmA,3680
4
+ scraper_framework/rate_limiter.py,sha256=rS2Uc5tWVx6YZc1Y1FJfyF4MsYSRNwzAubkv_RFMSnY,3335
5
+ scraper_framework/registry.py,sha256=aSws_jA2q-uIH15C6yXyJlfge5k3CTjMUnEdNL9TzO0,3638
6
+ scraper_framework/result.py,sha256=arDqiYaHz83C3CU88yX44x9z342p-jNQScNOhmxQlrU,1935
7
+ msaas_scraper_framework-0.1.0.dist-info/METADATA,sha256=rLXQLT-0ELmAmf9MkRsKlt__RfsT8eP-3w2Lrn4L_9g,471
8
+ msaas_scraper_framework-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ msaas_scraper_framework-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,52 @@
1
+ """scraper-framework: Generic web scraper framework with registry, rate limiting, and filters.
2
+
3
+ Usage::
4
+
5
+ from scraper_framework import BaseScraper, ScraperResult, ScraperRegistry
6
+
7
+ registry = ScraperRegistry()
8
+
9
+ @registry.register("example")
10
+ class ExampleScraper(BaseScraper):
11
+ source = "example"
12
+ id_prefix = "ex"
13
+
14
+ def _run(self) -> ScraperResult:
15
+ with self.make_client() as client:
16
+ resp = client.get("https://example.com/api")
17
+ self.result.total_fetched += len(resp.json())
18
+ self.sleep()
19
+ return self.result
20
+
21
+ scraper = registry.get("example")
22
+ result = scraper.run()
23
+ print(result.to_dict())
24
+ """
25
+
26
+ from scraper_framework.base import BaseScraper
27
+ from scraper_framework.filters import (
28
+ all_match,
29
+ any_match,
30
+ keyword_filter,
31
+ length_filter,
32
+ negate,
33
+ none_match,
34
+ regex_filter,
35
+ )
36
+ from scraper_framework.rate_limiter import RateLimiter
37
+ from scraper_framework.registry import ScraperRegistry
38
+ from scraper_framework.result import ScraperResult
39
+
40
+ __all__ = [
41
+ "BaseScraper",
42
+ "RateLimiter",
43
+ "ScraperRegistry",
44
+ "ScraperResult",
45
+ "all_match",
46
+ "any_match",
47
+ "keyword_filter",
48
+ "length_filter",
49
+ "negate",
50
+ "none_match",
51
+ "regex_filter",
52
+ ]
@@ -0,0 +1,170 @@
1
+ """Generic base scraper class.
2
+
3
+ Provides the abstract skeleton that concrete scrapers implement. Handles
4
+ lifecycle (setup, run, teardown), HTTP client creation, rate limiting, and
5
+ result tracking.
6
+
7
+ Example subclass::
8
+
9
+ from scraper_framework import BaseScraper, ScraperResult
10
+
11
+ class MyScraper(BaseScraper):
12
+ source = "my-source"
13
+ id_prefix = "ms"
14
+
15
+ def _run(self) -> ScraperResult:
16
+ with self.make_client() as client:
17
+ resp = client.get("https://example.com/api/items")
18
+ items = resp.json()
19
+ self.result.total_fetched += len(items)
20
+ # ... process items ...
21
+ self.sleep()
22
+ return self.result
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import logging
28
+ from abc import ABC, abstractmethod
29
+ from collections.abc import Callable
30
+ from datetime import UTC, datetime
31
+ from typing import Any
32
+
33
+ import httpx
34
+
35
+ from scraper_framework.rate_limiter import RateLimiter
36
+ from scraper_framework.result import ScraperResult
37
+
38
+
39
+ class BaseScraper(ABC):
40
+ """Abstract base class for all scrapers.
41
+
42
+ Subclasses **must** set ``source`` and implement :meth:`_run`.
43
+
44
+ Class attributes:
45
+ source: Identifier for this scraper (e.g. ``"greenhouse"``).
46
+ id_prefix: Short prefix for generated record IDs (e.g. ``"gh"``).
47
+ rate_limit_range: Default ``(min, max)`` seconds between requests.
48
+ """
49
+
50
+ source: str = ""
51
+ id_prefix: str = ""
52
+ rate_limit_range: tuple[float, float] = (2.0, 5.0)
53
+
54
+ def __init__(self, *, config: dict[str, Any] | None = None) -> None:
55
+ self.config: dict[str, Any] = config or {}
56
+ self.result = ScraperResult(source=self.source)
57
+ self.rate_limiter = RateLimiter(
58
+ min_seconds=self.rate_limit_range[0],
59
+ max_seconds=self.rate_limit_range[1],
60
+ )
61
+ try:
62
+ import structlog
63
+
64
+ self.log = structlog.get_logger(f"scraper.{self.source}").bind(source=self.source)
65
+ except ImportError:
66
+ self.log = logging.getLogger(f"scraper.{self.source}")
67
+
68
+ # ------------------------------------------------------------------
69
+ # Lifecycle hooks (override in subclasses if needed)
70
+ # ------------------------------------------------------------------
71
+
72
+ def setup(self) -> None:
73
+ """Called before :meth:`_run`. Override for one-time initialisation."""
74
+
75
+ def teardown(self) -> None:
76
+ """Called after :meth:`_run` completes (even on error). Override for cleanup."""
77
+
78
+ # ------------------------------------------------------------------
79
+ # Abstract
80
+ # ------------------------------------------------------------------
81
+
82
+ @abstractmethod
83
+ def _run(self) -> ScraperResult:
84
+ """Execute the scraping logic.
85
+
86
+ Must populate ``self.result`` and return it. Implementations should
87
+ call ``self.sleep()`` between HTTP requests to respect rate limits.
88
+ """
89
+ ...
90
+
91
+ # ------------------------------------------------------------------
92
+ # Public entry point
93
+ # ------------------------------------------------------------------
94
+
95
+ def run(
96
+ self,
97
+ *,
98
+ on_before: Callable[[BaseScraper], None] | None = None,
99
+ on_after: Callable[[BaseScraper, ScraperResult], None] | None = None,
100
+ ) -> ScraperResult:
101
+ """Public entry point: setup, run scraper, teardown, finalize result.
102
+
103
+ Args:
104
+ on_before: Optional callback invoked after :meth:`setup` but
105
+ before :meth:`_run`.
106
+ on_after: Optional callback invoked after :meth:`_run` with the
107
+ result.
108
+
109
+ Returns:
110
+ The populated :class:`ScraperResult`.
111
+ """
112
+ self.setup()
113
+ if on_before:
114
+ on_before(self)
115
+ try:
116
+ self._run()
117
+ except Exception:
118
+ self.result.errors += 1
119
+ self.log.exception("Scraper %s failed", self.source)
120
+ raise
121
+ finally:
122
+ self.teardown()
123
+ self.result.finish()
124
+ if on_after:
125
+ on_after(self, self.result)
126
+ return self.result
127
+
128
+ # ------------------------------------------------------------------
129
+ # Utilities
130
+ # ------------------------------------------------------------------
131
+
132
+ def sleep(self, range_override: tuple[float, float] | None = None) -> float:
133
+ """Rate-limiting sleep between requests.
134
+
135
+ Args:
136
+ range_override: Optional ``(min, max)`` seconds to override the
137
+ default rate limit range.
138
+
139
+ Returns:
140
+ Actual seconds slept.
141
+ """
142
+ return self.rate_limiter.wait(override=range_override)
143
+
144
+ def make_client(self, **kwargs: Any) -> httpx.Client:
145
+ """Create an :class:`httpx.Client` with sensible defaults.
146
+
147
+ Defaults:
148
+ - ``timeout``: 30 seconds
149
+ - ``follow_redirects``: True
150
+
151
+ All keyword arguments are forwarded to ``httpx.Client``, overriding
152
+ the defaults.
153
+ """
154
+ defaults: dict[str, Any] = {"timeout": 30.0, "follow_redirects": True}
155
+ defaults.update(kwargs)
156
+ return httpx.Client(**defaults)
157
+
158
+ def make_async_client(self, **kwargs: Any) -> httpx.AsyncClient:
159
+ """Create an :class:`httpx.AsyncClient` with sensible defaults.
160
+
161
+ Same defaults as :meth:`make_client`.
162
+ """
163
+ defaults: dict[str, Any] = {"timeout": 30.0, "follow_redirects": True}
164
+ defaults.update(kwargs)
165
+ return httpx.AsyncClient(**defaults)
166
+
167
+ @staticmethod
168
+ def now_iso() -> str:
169
+ """Current UTC timestamp in ISO-8601 format."""
170
+ return datetime.now(UTC).isoformat()
@@ -0,0 +1,135 @@
1
+ """Composable filter functions for scraper pipelines.
2
+
3
+ Filters are plain functions with the signature ``(text: str) -> bool`` so they
4
+ can be combined freely with :func:`all_match`, :func:`any_match`, and
5
+ :func:`none_match`.
6
+
7
+ Example::
8
+
9
+ from scraper_framework.filters import (
10
+ regex_filter, keyword_filter, all_match, none_match,
11
+ )
12
+
13
+ is_senior = keyword_filter({"senior", "staff", "lead"})
14
+ not_excluded = none_match(keyword_filter({"intern", "junior"}))
15
+ has_python = regex_filter(r"\\bpython\\b")
16
+
17
+ title = "Senior Python Engineer"
18
+ if is_senior(title) and not_excluded(title) and has_python(title):
19
+ print("Relevant!")
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import re
25
+ from collections.abc import Callable
26
+
27
+ type TextFilter = Callable[[str], bool]
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Primitive filter builders
32
+ # ---------------------------------------------------------------------------
33
+
34
+
35
def keyword_filter(
    keywords: set[str],
    *,
    case_sensitive: bool = False,
) -> TextFilter:
    """Return a filter that matches if *any* keyword appears in the text.

    Keywords are matched as substrings. For word-boundary matching use
    :func:`regex_filter` instead.

    The keywords are copied when the filter is built, so mutating the
    caller's set afterwards does not change what the filter matches,
    whichever case mode is used.

    Args:
        keywords: Set of keywords to look for.
        case_sensitive: Whether matching is case-sensitive.

    Returns:
        A ``(text) -> bool`` filter. With no keywords it never matches.
    """
    if case_sensitive:
        exact = frozenset(keywords)

        def _match(text: str) -> bool:
            return any(kw in text for kw in exact)

    else:
        normalised = frozenset(k.lower() for k in keywords)

        def _match(text: str) -> bool:
            # Lower-case the text once per call rather than once per keyword.
            lowered = text.lower()
            return any(kw in lowered for kw in normalised)

    return _match
61
+
62
+
63
def regex_filter(pattern: str, *, flags: int = re.IGNORECASE) -> TextFilter:
    """Build a filter that passes when *pattern* is found anywhere in the text.

    The pattern is compiled once, when the filter is created.

    Args:
        pattern: A regular expression string.
        flags: Regex flags (default: ``re.IGNORECASE``).
    """
    search = re.compile(pattern, flags).search

    def _found(text: str) -> bool:
        hit = search(text)
        return hit is not None

    return _found
76
+
77
+
78
def length_filter(*, min_length: int = 0, max_length: int | None = None) -> TextFilter:
    """Build a filter that passes when the text length lies within bounds.

    Args:
        min_length: Minimum number of characters.
        max_length: Maximum number of characters (``None`` for unlimited).
    """

    def _within_bounds(text: str) -> bool:
        size = len(text)
        too_short = size < min_length
        too_long = max_length is not None and size > max_length
        return not (too_short or too_long)

    return _within_bounds
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # Combinators
99
+ # ---------------------------------------------------------------------------
100
+
101
+
102
def all_match(*filters: TextFilter) -> TextFilter:
    """Combine filters so the result passes only when **every** one passes.

    Evaluation stops at the first failing filter. With no filters the
    combined filter always passes.
    """

    def _every(text: str) -> bool:
        for check in filters:
            if not check(text):
                return False
        return True

    return _every
109
+
110
+
111
def any_match(*filters: TextFilter) -> TextFilter:
    """Combine filters so the result passes when **at least one** passes.

    Evaluation stops at the first passing filter. With no filters the
    combined filter never passes.
    """

    def _some(text: str) -> bool:
        for check in filters:
            if check(text):
                return True
        return False

    return _some
118
+
119
+
120
def none_match(*filters: TextFilter) -> TextFilter:
    """Combine filters so the result passes only when **none** of them pass.

    Evaluation stops at the first passing filter. With no filters the
    combined filter always passes.
    """

    def _no_hits(text: str) -> bool:
        for check in filters:
            if check(text):
                return False
        return True

    return _no_hits
127
+
128
+
129
def negate(f: TextFilter) -> TextFilter:
    """Wrap *f* in a filter that passes exactly when *f* does not."""

    def _inverted(text: str) -> bool:
        if f(text):
            return False
        return True

    return _inverted
@@ -0,0 +1,107 @@
1
+ """Rate limiting utilities for scrapers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import random
7
+ import time
8
+ from dataclasses import dataclass, field
9
+ from datetime import UTC, datetime
10
+
11
+
12
+ @dataclass
13
+ class RateLimiter:
14
+ """Configurable rate limiter with jittered sleep.
15
+
16
+ Supports both synchronous and asynchronous usage. The sleep duration is
17
+ uniformly sampled from ``[min_seconds, max_seconds]`` to avoid
18
+ thundering-herd patterns.
19
+
20
+ Example::
21
+
22
+ limiter = RateLimiter(min_seconds=1.0, max_seconds=3.0)
23
+ for url in urls:
24
+ fetch(url)
25
+ limiter.wait()
26
+
27
+ Async usage::
28
+
29
+ limiter = RateLimiter(min_seconds=0.5, max_seconds=1.5)
30
+ for url in urls:
31
+ await fetch(url)
32
+ await limiter.async_wait()
33
+ """
34
+
35
+ min_seconds: float = 2.0
36
+ max_seconds: float = 5.0
37
+ _request_count: int = field(default=0, init=False, repr=False)
38
+ _last_request_at: datetime | None = field(default=None, init=False, repr=False)
39
+
40
+ def __post_init__(self) -> None:
41
+ if self.min_seconds < 0:
42
+ msg = f"min_seconds must be >= 0, got {self.min_seconds}"
43
+ raise ValueError(msg)
44
+ if self.max_seconds < self.min_seconds:
45
+ msg = f"max_seconds ({self.max_seconds}) must be >= min_seconds ({self.min_seconds})"
46
+ raise ValueError(msg)
47
+
48
+ # ------------------------------------------------------------------
49
+ # Public API
50
+ # ------------------------------------------------------------------
51
+
52
+ @property
53
+ def request_count(self) -> int:
54
+ """Number of waits performed."""
55
+ return self._request_count
56
+
57
+ @property
58
+ def last_request_at(self) -> datetime | None:
59
+ """Timestamp of the most recent wait."""
60
+ return self._last_request_at
61
+
62
+ def wait(self, override: tuple[float, float] | None = None) -> float:
63
+ """Block the current thread for a jittered duration.
64
+
65
+ Args:
66
+ override: Optional ``(min, max)`` tuple to use instead of the
67
+ instance defaults.
68
+
69
+ Returns:
70
+ The actual number of seconds slept.
71
+ """
72
+ duration = self._pick_duration(override)
73
+ time.sleep(duration)
74
+ self._record()
75
+ return duration
76
+
77
+ async def async_wait(self, override: tuple[float, float] | None = None) -> float:
78
+ """Async version of :meth:`wait`.
79
+
80
+ Args:
81
+ override: Optional ``(min, max)`` tuple to use instead of the
82
+ instance defaults.
83
+
84
+ Returns:
85
+ The actual number of seconds slept.
86
+ """
87
+ duration = self._pick_duration(override)
88
+ await asyncio.sleep(duration)
89
+ self._record()
90
+ return duration
91
+
92
+ def reset(self) -> None:
93
+ """Reset the internal counters."""
94
+ self._request_count = 0
95
+ self._last_request_at = None
96
+
97
+ # ------------------------------------------------------------------
98
+ # Internals
99
+ # ------------------------------------------------------------------
100
+
101
+ def _pick_duration(self, override: tuple[float, float] | None) -> float:
102
+ lo, hi = override or (self.min_seconds, self.max_seconds)
103
+ return random.uniform(lo, hi)
104
+
105
+ def _record(self) -> None:
106
+ self._request_count += 1
107
+ self._last_request_at = datetime.now(UTC)
@@ -0,0 +1,119 @@
1
+ """Generic scraper registry and factory.
2
+
3
+ The registry is a simple name-to-class mapping that supports both imperative
4
+ registration and a decorator-based approach.
5
+
6
+ Example::
7
+
8
+ from scraper_framework import BaseScraper, ScraperResult
9
+ from scraper_framework.registry import ScraperRegistry
10
+
11
+ registry = ScraperRegistry()
12
+
13
+ @registry.register("my-source")
14
+ class MyScraper(BaseScraper):
15
+ source = "my-source"
16
+ id_prefix = "ms"
17
+
18
+ def _run(self) -> ScraperResult:
19
+ ...
20
+
21
+ # Later
22
+ scraper = registry.get("my-source")
23
+ result = scraper.run()
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from typing import Any
29
+
30
+ from scraper_framework.base import BaseScraper
31
+
32
+
33
+ class ScraperRegistry:
34
+ """Thread-safe registry mapping scraper names to their classes.
35
+
36
+ Supports two registration styles:
37
+
38
+ 1. **Imperative** -- ``registry.add("name", MyScraperClass)``
39
+ 2. **Decorator** -- ``@registry.register("name")``
40
+
41
+ Retrieval always returns a *new instance* via :meth:`get`.
42
+ """
43
+
44
+ def __init__(self) -> None:
45
+ self._registry: dict[str, type[BaseScraper]] = {}
46
+
47
+ # ------------------------------------------------------------------
48
+ # Registration
49
+ # ------------------------------------------------------------------
50
+
51
+ def add(self, name: str, cls: type[BaseScraper]) -> None:
52
+ """Register a scraper class under *name*.
53
+
54
+ Args:
55
+ name: Unique identifier (e.g. ``"greenhouse"``).
56
+ cls: A concrete subclass of :class:`BaseScraper`.
57
+
58
+ Raises:
59
+ TypeError: If *cls* is not a subclass of ``BaseScraper``.
60
+ ValueError: If *name* is already registered.
61
+ """
62
+ if not (isinstance(cls, type) and issubclass(cls, BaseScraper)):
63
+ msg = f"Expected a BaseScraper subclass, got {cls!r}"
64
+ raise TypeError(msg)
65
+ if name in self._registry:
66
+ msg = f"Scraper '{name}' is already registered"
67
+ raise ValueError(msg)
68
+ self._registry[name] = cls
69
+
70
+ def register(self, name: str):
71
+ """Decorator that registers a scraper class.
72
+
73
+ Usage::
74
+
75
+ @registry.register("my-source")
76
+ class MyScraper(BaseScraper):
77
+ ...
78
+ """
79
+
80
+ def decorator[T: type[BaseScraper]](cls: T) -> T:
81
+ self.add(name, cls) # type: ignore[arg-type]
82
+ return cls
83
+
84
+ return decorator
85
+
86
+ # ------------------------------------------------------------------
87
+ # Retrieval
88
+ # ------------------------------------------------------------------
89
+
90
+ def get(self, name: str, *, config: dict[str, Any] | None = None) -> BaseScraper:
91
+ """Instantiate and return a scraper by name.
92
+
93
+ Args:
94
+ name: The registered scraper name.
95
+ config: Optional config dict passed to the scraper constructor.
96
+
97
+ Raises:
98
+ KeyError: If *name* is not registered.
99
+ """
100
+ cls = self._registry.get(name)
101
+ if cls is None:
102
+ available = ", ".join(sorted(self._registry)) or "(none)"
103
+ msg = f"Unknown scraper: '{name}'. Available: {available}"
104
+ raise KeyError(msg)
105
+ return cls(config=config)
106
+
107
+ def list_names(self) -> list[str]:
108
+ """Return a sorted list of all registered scraper names."""
109
+ return sorted(self._registry)
110
+
111
+ def __contains__(self, name: str) -> bool:
112
+ return name in self._registry
113
+
114
+ def __len__(self) -> int:
115
+ return len(self._registry)
116
+
117
+ def __repr__(self) -> str:
118
+ names = ", ".join(sorted(self._registry))
119
+ return f"ScraperRegistry([{names}])"
@@ -0,0 +1,58 @@
1
+ """Scraper result tracking."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import UTC, datetime
5
+
6
+
7
+ @dataclass
8
+ class ScraperResult:
9
+ """Accumulated result from a single scraper run.
10
+
11
+ Tracks fetch counts, insert counts, errors, and timing automatically.
12
+ Call ``finish()`` or rely on ``BaseScraper.run()`` to close the timer.
13
+
14
+ Example::
15
+
16
+ result = ScraperResult(source="my-source")
17
+ result.total_fetched += 10
18
+ result.new_inserted += 3
19
+ result.finish()
20
+ print(result.to_dict())
21
+ """
22
+
23
+ source: str
24
+ total_fetched: int = 0
25
+ new_inserted: int = 0
26
+ errors: int = 0
27
+ started_at: datetime = field(default_factory=lambda: datetime.now(UTC))
28
+ finished_at: datetime | None = None
29
+ metadata: dict[str, object] = field(default_factory=dict)
30
+
31
+ # ------------------------------------------------------------------
32
+ # Lifecycle
33
+ # ------------------------------------------------------------------
34
+
35
+ def finish(self) -> None:
36
+ """Mark the run as finished with the current UTC timestamp."""
37
+ self.finished_at = datetime.now(UTC)
38
+
39
+ @property
40
+ def duration_seconds(self) -> float:
41
+ """Elapsed seconds between start and finish (or now if still running)."""
42
+ end = self.finished_at or datetime.now(UTC)
43
+ return round((end - self.started_at).total_seconds(), 1)
44
+
45
+ # ------------------------------------------------------------------
46
+ # Serialisation
47
+ # ------------------------------------------------------------------
48
+
49
+ def to_dict(self) -> dict[str, object]:
50
+ """Return a JSON-safe dictionary of the result."""
51
+ return {
52
+ "source": self.source,
53
+ "total_fetched": self.total_fetched,
54
+ "new_inserted": self.new_inserted,
55
+ "errors": self.errors,
56
+ "duration_seconds": self.duration_seconds,
57
+ "metadata": self.metadata,
58
+ }