wxpath 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,11 +2,12 @@ import asyncio
  import contextlib
  import inspect
  from collections import deque
- from typing import Any, AsyncGenerator
+ from typing import Any, AsyncGenerator, Iterator

  from lxml.html import HtmlElement

  from wxpath import patches # noqa: F401
+ from wxpath.core import parser
  from wxpath.core.models import (
      CrawlIntent,
      CrawlTask,
@@ -16,7 +17,7 @@ from wxpath.core.models import (
      ProcessIntent,
  )
  from wxpath.core.ops import get_operator
- from wxpath.core.parser import parse_wxpath_expr
+ from wxpath.core.parser import Binary, Segment, Segments
  from wxpath.core.runtime.helpers import parse_html
  from wxpath.hooks.registry import FetchContext, get_hooks
  from wxpath.http.client.crawler import Crawler
@@ -27,7 +28,21 @@ log = get_logger(__name__)


  class HookedEngineBase:
-     async def post_fetch_hooks(self, body, task):
+     """Common hook invocation helpers shared by engine variants."""
+
+     async def post_fetch_hooks(self, body: bytes | str, task: CrawlTask) -> bytes | str | None:
+         """Run registered `post_fetch` hooks over a fetched response body.
+
+         Hooks may be synchronous or asynchronous and can transform or drop the
+         response payload entirely.
+
+         Args:
+             body: Raw response body bytes from the crawler.
+             task: The `CrawlTask` that produced the response.
+
+         Returns:
+             The transformed body, or `None` if any hook chooses to drop it.
+         """
          for hook in get_hooks():
              hook_method = getattr(hook, "post_fetch", lambda _, b: b)
              if inspect.iscoroutinefunction(hook_method):
@@ -45,7 +60,18 @@ class HookedEngineBase:
                  break
          return body

-     async def post_parse_hooks(self, elem, task):
+     async def post_parse_hooks(
+         self, elem: HtmlElement | None, task: CrawlTask
+     ) -> HtmlElement | None:
+         """Run registered `post_parse` hooks on a parsed DOM element.
+
+         Args:
+             elem: Parsed `lxml` element to process.
+             task: The originating `CrawlTask`.
+
+         Returns:
+             The transformed element, or `None` if a hook drops the branch.
+         """
          for hook in get_hooks():
              hook_method = getattr(hook, "post_parse", lambda _, e: e)
              if inspect.iscoroutinefunction(hook_method):
@@ -73,7 +99,15 @@ class HookedEngineBase:
                  break
          return elem

-     async def post_extract_hooks(self, value):
+     async def post_extract_hooks(self, value: Any) -> Any | None:
+         """Run registered `post_extract` hooks on extracted values.
+
+         Args:
+             value: The extracted datum to post-process.
+
+         Returns:
+             The transformed value, or `None` if a hook drops it.
+         """
          for hook in get_hooks():
              hook_method = getattr(hook, "post_extract", lambda v: v)
              if inspect.iscoroutinefunction(hook_method):
@@ -87,35 +121,65 @@ class HookedEngineBase:


  class WXPathEngine(HookedEngineBase):
-     """
-     Main class for executing wxpath expressions.
+     """Main class for executing wxpath expressions.

-     The core pattern and directive for this engine is to build a queue of CrawlTasks,
-     which is crawled and processed FIFO. The traversal of the queue (and therefore
-     the web graph) is done concurrently and in BFS-ish order.
+     The core pattern is to build a queue of CrawlTasks that are crawled and
+     processed FIFO. Traversal of the queue (and therefore the web graph) is
+     done concurrently in BFS-ish order.

      Args:
-         crawler: Crawler instance
-         concurrency: number of concurrent fetches at the Crawler (request engine) level
-         per_host: number of concurrent fetches per host
+         crawler: Crawler instance to use for HTTP requests.
+         concurrency: Number of concurrent fetches at the Crawler level.
+         per_host: Number of concurrent fetches per host.
+         respect_robots: Whether to respect robots.txt directives.
+         allowed_response_codes: Set of allowed HTTP response codes. Defaults
+             to ``{200}``. Responses may still be filtered and dropped.
+         allow_redirects: Whether to follow HTTP redirects. Defaults to ``True``.
      """
      def __init__(
          self,
          crawler: Crawler | None = None,
          concurrency: int = 16,
-         per_host: int = 8
+         per_host: int = 8,
+         respect_robots: bool = True,
+         allowed_response_codes: set[int] = None,
+         allow_redirects: bool = True,
      ):
+         # NOTE: Will grow unbounded in large crawls. Consider a LRU cache, or bloom filter.
          self.seen_urls: set[str] = set()
-         self.crawler = crawler or Crawler(concurrency=concurrency, per_host=per_host)
-
-     async def run(self, expression: str, max_depth: int):
-         segments = parse_wxpath_expr(expression)
+         self.crawler = crawler or Crawler(
+             concurrency=concurrency,
+             per_host=per_host,
+             respect_robots=respect_robots
+         )
+         self.allowed_response_codes = allowed_response_codes or {200}
+         self.allow_redirects = allow_redirects
+         if allow_redirects:
+             self.allowed_response_codes |= {301, 302, 303, 307, 308}
+
+     async def run(self, expression: str, max_depth: int) -> AsyncGenerator[Any, None]:
+         """Execute a wxpath expression concurrently and yield results.
+
+         Builds and drives a BFS-like crawl pipeline that honors robots rules,
+         throttling, and hook callbacks while walking the web graph.
+
+         Args:
+             expression: WXPath expression string to evaluate.
+             max_depth: Maximum crawl depth to follow for url hops.
+
+         Yields:
+             Extracted values produced by the expression (HTML elements or
+             wxpath-specific value types).
+         """
+         segments = parser.parse(expression)

          queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
          inflight: dict[str, CrawlTask] = {}
          pending_tasks = 0

          def is_terminal():
+             # NOTE: consider adopting state machine pattern for determining
+             # the current state of the engine.
              return queue.empty() and pending_tasks <= 0

          async with self.crawler as crawler:
@@ -177,7 +241,7 @@ class WXPathEngine(HookedEngineBase):
                      continue

                  # NOTE: Consider allowing redirects
-                 if resp.status != 200 or not resp.body:
+                 if resp.status not in self.allowed_response_codes or not resp.body:
                      log.warning(f"Got non-200 response from {resp.request.url}")
                      if is_terminal():
                          break
@@ -226,20 +290,36 @@
      async def _process_pipeline(
          self,
          task: CrawlTask,
-         elem,
+         elem: Any,
          depth: int,
          max_depth: int,
          queue: asyncio.Queue[CrawlTask],
-     ):
-         mini_queue: deque[(HtmlElement, list[tuple[str, str]])] = deque([(elem, task.segments)])
+     ) -> AsyncGenerator[Any, None]:
+         """Process a queue of intents for a single crawl branch.
+
+         Traverses wxpath segments depth-first within a page while coordinating
+         newly discovered crawl intents back to the shared queue.
+
+         Args:
+             task: The originating crawl task for this branch.
+             elem: Current DOM element (or extracted value) being processed.
+             depth: Current traversal depth.
+             max_depth: Maximum permitted crawl depth.
+             queue: Shared crawl queue for enqueuing downstream URLs.
+
+         Yields:
+             object: Extracted values or processed elements as produced by operators.
+         """
+         mini_queue: deque[tuple[HtmlElement | Any, list[Binary | Segment] | Segments]] = deque(
+             [(elem, task.segments)]
+         )

          while mini_queue:
-             elem, segments = mini_queue.popleft()
-
-             op, _ = segments[0]
-             operator = get_operator(op)
+             elem, bin_or_segs = mini_queue.popleft()

-             intents = operator(elem, segments, depth)
+             binary_or_segment = bin_or_segs if isinstance(bin_or_segs, Binary) else bin_or_segs[0]
+             operator = get_operator(binary_or_segment)
+             intents = operator(elem, bin_or_segs, depth)

              if not intents:
                  return
@@ -253,6 +333,7 @@
                  # if intent.url not in self.seen_urls and next_depth <= max_depth:
                  if next_depth <= max_depth:
                      # self.seen_urls.add(intent.url)
+                     log.debug(f"Depth: {next_depth}; Enqueuing {intent.url}")
                      queue.put_nowait(
                          CrawlTask(
                              elem=None,
@@ -272,29 +353,33 @@

  def wxpath_async(path_expr: str,
                   max_depth: int,
-                  engine: WXPathEngine = None) -> AsyncGenerator[Any, None]:
+                  engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
      if engine is None:
          engine = WXPathEngine()
      return engine.run(path_expr, max_depth)


  ##### ASYNC IN SYNC #####
- def wxpath_async_blocking_iter(path_expr, max_depth=1, engine: WXPathEngine = None):
-     """
-     Evaluate a wxpath expression using concurrent breadth-first traversal.
-
-     Args:
-         path_expr (str): A wxpath expression.
-         max_depth (int, optional): Maximum crawl depth. Must be at least the
-             number of `url*` segments minus one. Defaults to `1`.
-
-     Yields:
-         lxml.html.HtmlElement | wxpath.models.WxStr | dict | Any: The same objects
-             produced by the sequential evaluator.
+ def wxpath_async_blocking_iter(
+     path_expr: str,
+     max_depth: int = 1,
+     engine: WXPathEngine | None = None,
+ ) -> Iterator[Any]:
+     """Evaluate a wxpath expression using concurrent breadth-first traversal.

      Warning:
          Spins up its own event loop therefore this function must **not** be
          invoked from within an active asyncio event loop.
+
+     Args:
+         path_expr: A wxpath expression.
+         max_depth: Maximum crawl depth. Must be at least the number of
+             ``url*`` segments minus one.
+         engine: Optional pre-configured WXPathEngine instance.
+
+     Yields:
+         object: Extracted objects (HtmlElement, WxStr, dict, or other values)
+             produced by the expression evaluator.
      """
      loop = asyncio.new_event_loop()
      asyncio.set_event_loop(loop)
@@ -311,5 +396,11 @@ def wxpath_async_blocking_iter(path_expr, max_depth=1, engine: WXPathEngine = No
          loop.close()


- def wxpath_async_blocking(path_expr, max_depth=1, engine: WXPathEngine = None):
-     return list(wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine))
+ def wxpath_async_blocking(
+     path_expr: str,
+     max_depth: int = 1,
+     engine: WXPathEngine | None = None,
+ ) -> list[Any]:
+     return list(
+         wxpath_async_blocking_iter(path_expr, max_depth=max_depth, engine=engine)
+     )
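
For orientation, here is a minimal sketch of how the 0.3.0 engine options added above might be exercised from synchronous code. The engine module's import path is not shown in this diff and the example expression is invented, so treat both as assumptions:

```python
# Hedged sketch only: the module path and the wxpath expression below are
# assumptions for illustration; the parameters mirror the diff above.
from wxpath.core.runtime.engine import (  # assumed location of WXPathEngine
    WXPathEngine,
    wxpath_async_blocking_iter,
)

engine = WXPathEngine(
    concurrency=8,
    per_host=4,
    respect_robots=True,           # new in 0.3.0: robots.txt gating via the Crawler
    allowed_response_codes={200},  # 3xx codes are added automatically when
    allow_redirects=True,          # allow_redirects is True (the default)
)

# Spins up its own event loop, so call it outside any active asyncio loop.
for value in wxpath_async_blocking_iter(
    "url('https://example.com')//a/@href",  # hypothetical expression
    max_depth=1,
    engine=engine,
):
    print(value)
```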
@@ -1,4 +1,3 @@
- import requests
  from lxml import etree, html

  from wxpath import patches
@@ -40,9 +39,3 @@ def detach_html_root(elem, base_url=None):
      new_root.base_url = base_url

      return new_root
-
-
- def fetch_html(url):
-     response = requests.get(url, timeout=10)
-     response.raise_for_status()
-     return response.content
wxpath/hooks/registry.py CHANGED
@@ -17,7 +17,6 @@ Write once:
  from __future__ import annotations

  import functools
- from collections import OrderedDict
  from collections.abc import Generator
  from dataclasses import dataclass, field
  from typing import Any, Iterable, List, Optional, Protocol
@@ -66,20 +65,24 @@ class Hook(Protocol):
  # --------------------------------------------------------------------------- #
  # Global registry helpers
  # --------------------------------------------------------------------------- #
- _global_hooks: OrderedDict[str, Hook] = OrderedDict()
+ _global_hooks: dict[str, Hook] = dict()


  def register(hook: Hook | type) -> Hook:
-     """
-     Decorator / helper to add a Hook to the global list.
-
-     Example
-     -------
-     >>> @register
-     ... class DebugHook:
-     ...     def post_fetch(self, ctx, html_bytes):
-     ...         print("Fetched", ctx.url)
-     ...         return html_bytes
+     """Decorator/helper to add a Hook to the global list.
+
+     Args:
+         hook: A Hook class or instance to register.
+
+     Returns:
+         The registered hook (instantiated if a class was provided).
+
+     Example:
+         >>> @register
+         ... class DebugHook:
+         ...     def post_fetch(self, ctx, html_bytes):
+         ...         print("Fetched", ctx.url)
+         ...         return html_bytes
      """

      hook_name = getattr(hook, '__name__', hook.__class__.__name__)
@@ -101,9 +104,13 @@ def iter_post_extract_hooks() -> Iterable[Hook]:


  def pipe_post_extract(gen_func):
-     """
-     Decorator: wrap a *generator function* so every yielded value
-     is piped through the registered post_extract hooks.
+     """Wrap a generator function to pipe yielded values through post_extract hooks.
+
+     Args:
+         gen_func: A generator function to wrap.
+
+     Returns:
+         A wrapped generator that filters values through registered hooks.
      """
      @functools.wraps(gen_func)
      def wrapper(*args, **kwargs) -> Generator:
@@ -118,8 +125,13 @@


  def pipe_post_extract_async(async_gen_func):
-     """
-     Async variant - wraps an *async* generator function.
+     """Wrap an async generator function to pipe yielded values through hooks.
+
+     Args:
+         async_gen_func: An async generator function to wrap.
+
+     Returns:
+         A wrapped async generator that filters values through registered hooks.
      """
      @functools.wraps(async_gen_func)
      async def wrapper(*args, **kwargs):
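
To make the rewritten hook docstrings above concrete, a small sketch of the registration and piping flow. The `DropEmpty` hook and `extract()` generator are hypothetical, and the sketch assumes the wrapper skips values a hook maps to `None`, matching the engine's `post_extract_hooks` behavior:

```python
from wxpath.hooks.registry import pipe_post_extract, register


@register
class DropEmpty:  # hypothetical hook for illustration
    def post_extract(self, value):
        # Returning None signals that the value should be dropped.
        return value or None


@pipe_post_extract
def extract():  # hypothetical generator; yields pass through post_extract hooks
    yield "kept"
    yield ""  # expected to be dropped by DropEmpty


print(list(extract()))  # expected: ["kept"]
```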
@@ -10,6 +10,7 @@ import aiohttp
  from wxpath.http.client.request import Request
  from wxpath.http.client.response import Response
  from wxpath.http.policy.retry import RetryPolicy
+ from wxpath.http.policy.robots import RobotsTxtPolicy
  from wxpath.http.policy.throttler import AbstractThrottler, AutoThrottler
  from wxpath.http.stats import CrawlerStats, build_trace_config
  from wxpath.util.logging import get_logger
@@ -22,6 +23,8 @@ HEADERS = {"User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"


  class Crawler:
+     """Concurrent HTTP crawler that manages throttling, retries, and robots."""
+
      def __init__(
          self,
          concurrency: int = 16,
@@ -35,11 +38,13 @@ class Crawler:
          auto_throttle_target_concurrency: float = None,
          auto_throttle_start_delay: float = 0.25,
          auto_throttle_max_delay: float = 10.0,
+         respect_robots: bool = True,
      ):
          self.concurrency = concurrency
          self._timeout = aiohttp.ClientTimeout(total=timeout)
          self._headers = HEADERS | (headers or {}) # merge headers
-         self._proxies = proxies or {}
+         self._proxies = proxies if (isinstance(proxies, defaultdict) or proxies) else {}
+         self.respect_robots = respect_robots

          self.retry_policy = retry_policy or RetryPolicy()
          self.throttler = throttler or AutoThrottler(
@@ -57,8 +62,10 @@ class Crawler:
          self._workers: list[asyncio.Task] = []
          self._closed = False
          self._stats = CrawlerStats()
+         self._robots_policy: RobotsTxtPolicy | None = None

-     def build_session(self):
+     def build_session(self) -> aiohttp.ClientSession:
+         """Construct an `aiohttp.ClientSession` with tracing and pooling."""
          trace_config = build_trace_config(self._stats)
          # Need to build the connector as late as possible as it requires the loop
          connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
@@ -69,26 +76,34 @@
              trace_configs=[trace_config]
          )

-     async def __aenter__(self):
+     async def __aenter__(self) -> "Crawler":
+         """Initialize HTTP session and start background workers."""
          if self._session is None:
              # self._session = aiohttp.ClientSession(timeout=self._timeout)
              self._session = self.build_session()

+         if self.respect_robots:
+             self._robots_policy = RobotsTxtPolicy(self._session)
+
          self._workers = [
              asyncio.create_task(self._worker(), name=f"crawler-worker-{i}")
              for i in range(self.concurrency)
          ]
          return self

-     async def __aexit__(self, *_):
+     async def __aexit__(self, *_) -> None:
+         """Tear down workers and close the HTTP session."""
          self._closed = True
          for w in self._workers:
              w.cancel()
+
          await asyncio.gather(*self._workers, return_exceptions=True)
+
          if self._session:
              await self._session.close()

-     def submit(self, req: Request):
+     def submit(self, req: Request) -> None:
+         """Queue a request for fetching or raise if crawler already closed."""
          if self._closed:
              raise RuntimeError("crawler is closed")
          self._pending.put_nowait(req)
@@ -96,18 +111,28 @@
      def __aiter__(self) -> AsyncIterator[Response]:
          return self._result_iter()

-     async def _result_iter(self):
+     async def _result_iter(self) -> AsyncIterator[Response]:
+         """Async iterator yielding responses as workers produce them."""
          # while not self._closed:
          while not (self._closed and self._results.empty()):
              resp = await self._results.get()
              self._results.task_done()
              yield resp

-     def _proxy_for(self, url: str):
+     def _proxy_for(self, url: str) -> str | None:
          host = urllib.parse.urlsplit(url).hostname
-         return self._proxies.get(host)
-
-     async def _worker(self):
+         try:
+             # bracket notation first, for defaultdicts
+             value = self._proxies[host]
+         except KeyError:
+             value = self._proxies.get(host)
+
+         if not value:
+             log.debug("proxy", extra={"host": host, "value": value})
+         return value
+
+     async def _worker(self) -> None:
+         """Worker loop that fetches pending requests and enqueues results."""
          while True:
              req = await self._pending.get()
              try:
@@ -133,8 +158,17 @@
                  self._pending.task_done()

      async def _fetch_one(self, req: Request) -> Response | None:
+         """Fetch a single request, handling robots, throttling, and retries."""
          host = req.hostname

+         if self._robots_policy:
+             can_fetch = await self._robots_policy.can_fetch(
+                 req.url, self._headers.get("User-Agent")
+             )
+             if not can_fetch:
+                 log.debug("disallowed by robots.txt", extra={"url": req.url})
+                 return Response(req, 403, b"", error=RuntimeError("Disallowed by robots.txt"))
+
          # TODO: Move this filter to hooks
          if req.url.lower().endswith((".pdf", ".zip", ".exe")):
              req.max_retries = 0
@@ -181,7 +215,8 @@
              log.error("request failed", extra={"url": req.url}, exc_info=exc)
              return Response(req, 0, b"", error=exc)

-     async def _retry(self, req: Request):
+     async def _retry(self, req: Request) -> None:
+         """Reschedule a request according to the retry policy."""
          req.retries += 1
          delay = self.retry_policy.get_delay(req)

@@ -1,24 +1,26 @@
  import time
  from dataclasses import dataclass, field
- from typing import Any, Dict
+ from typing import Any


  @dataclass
  class Request:
+     """HTTP request envelope used by the crawler."""
      url: str
      method: str = "GET"
-     headers: Dict[str, str] = field(default_factory=dict)
+     headers: dict[str, str] = field(default_factory=dict)
      timeout: float = 15.0

      retries: int = 0
      max_retries: int | None = None
      dont_retry: bool = False

-     meta: Dict[str, Any] = field(default_factory=dict)
+     meta: dict[str, Any] = field(default_factory=dict)

      created_at: float = field(default_factory=time.monotonic)

      def copy_for_retry(self) -> "Request":
+         """Create a copy incrementing the retry counter for scheduling."""
          return Request(
              url=self.url,
              method=self.method,
@@ -26,6 +28,7 @@ class Request:
              timeout=self.timeout,
              retries=self.retries + 1,
              max_retries=self.max_retries,
+             dont_retry=self.dont_retry,
              meta=self.meta,
          )

@@ -10,5 +10,5 @@ class Response:
      request: Request
      status: int
      body: bytes
-     headers: dict | None = None
+     headers: dict[str, str] | None = None
      error: Optional[Exception] = field(default=None, kw_only=True)
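
Putting the Crawler, Request, and Response changes together, a minimal usage sketch that drives the crawler the same way the engine does above (submit requests, then iterate responses); the URL and worker count are placeholders:

```python
import asyncio

from wxpath.http.client.crawler import Crawler
from wxpath.http.client.request import Request


async def main() -> None:
    # respect_robots=True wires a RobotsTxtPolicy into __aenter__ (see above);
    # URLs disallowed by robots.txt come back as synthetic 403 Responses.
    async with Crawler(concurrency=4, respect_robots=True) as crawler:
        crawler.submit(Request(url="https://example.com/"))
        async for resp in crawler:
            print(resp.status, resp.request.url, len(resp.body))
            break  # this sketch stops after the first response


asyncio.run(main())
```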
@@ -0,0 +1,82 @@
+ import asyncio
+ import urllib.parse
+ import urllib.robotparser
+
+ import aiohttp
+
+ from wxpath.util.logging import get_logger
+
+ log = get_logger(__name__)
+
+
+ class RobotsTxtPolicy:
+     """Caches and evaluates robots.txt rules for crawler requests."""
+
+     def __init__(self,
+                  session: aiohttp.ClientSession,
+                  default_parser: type['RobotsParserBase'] | None = None):
+         self._session = session
+         self._parsers: dict[str, "RobotsParserBase"] = {}
+         self._lock = asyncio.Lock()
+         self._default_parser = default_parser or UrllibRobotParser
+
+     async def can_fetch(self, url: str, user_agent: str | None) -> bool:
+         """Return whether the crawler is allowed to fetch `url`."""
+         host = urllib.parse.urlsplit(url).hostname
+         if not host:
+             return False
+
+         # Due to multiple aiohttp workers running concurrently, we need to lock
+         async with self._lock:
+             if host not in self._parsers:
+                 self._parsers[host] = await self._fetch_robots_txt(host)
+
+         return self._parsers[host].can_fetch(url, user_agent)
+
+     async def _fetch_robots_txt(self, host: str) -> "RobotsParserBase":
+         """Retrieve and parse the robots.txt for `host`, failing open on errors."""
+         url = f"http://{host}/robots.txt"
+         try:
+             async with self._session.get(url) as response:
+                 if response.status == 200:
+                     text = await response.text()
+                     # Pass the text as-is to the parser, let it handle the format
+                     if self._default_parser == UrllibRobotParser:
+                         return self._default_parser(text.splitlines())
+                     else:
+                         return self._default_parser(text)
+                 else:
+                     # Empty robots.txt - allow all
+                     if self._default_parser == UrllibRobotParser:
+                         return self._default_parser([])
+                     else:
+                         return self._default_parser("")
+         except Exception:
+             # If robots.txt is unavailable, allow all requests (fail open)
+             log.debug(f"Failed to fetch robots.txt from {host}, allowing all requests")
+             if self._default_parser == UrllibRobotParser:
+                 return self._default_parser([])
+             else:
+                 return self._default_parser("")
+
+
+ class RobotsParserBase:
+     """Base type for robots.txt parsers used by the policy."""
+
+
+ class UrllibRobotParser(RobotsParserBase):
+     """Adapter around `urllib.robotparser.RobotFileParser`."""
+
+     def __init__(self, text):
+         self._parser = urllib.robotparser.RobotFileParser()
+         # urllib.robotparser.RobotFileParser.parse() expects a list of lines
+         if isinstance(text, str):
+             lines = text.splitlines() if text else []
+         else:
+             lines = text if text else []
+         self._parser.parse(lines)
+
+     def can_fetch(self, url, user_agent):
+         """Return whether the URL is allowed for the given user agent."""
+         return self._parser.can_fetch(user_agent, url)
+
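
The `UrllibRobotParser` adapter can be checked in isolation; below is a short standalone example with made-up rules and user agent (the module path follows the crawler's `from wxpath.http.policy.robots import ...` import above):

```python
from wxpath.http.policy.robots import UrllibRobotParser

SAMPLE_ROBOTS = """\
User-agent: *
Disallow: /private/
"""

# The adapter accepts either a str or a pre-split list of lines.
parser = UrllibRobotParser(SAMPLE_ROBOTS)

print(parser.can_fetch("https://example.com/private/page", "wxpath-bot"))  # False
print(parser.can_fetch("https://example.com/public/page", "wxpath-bot"))   # True
```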