wxpath 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxpath/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ from . import settings
  from .core.runtime.engine import wxpath_async, wxpath_async_blocking, wxpath_async_blocking_iter
  from .util.logging import configure_logging
 
@@ -6,4 +7,5 @@ __all__ = [
  'wxpath_async_blocking',
  'wxpath_async_blocking_iter',
  'configure_logging',
+ 'settings',
  ]
wxpath/cli.py CHANGED
@@ -47,6 +47,11 @@ def main():
  help="Respect robots.txt",
  default=True
  )
+ arg_parser.add_argument(
+ "--insecure",
+ action="store_true",
+ help="Disable SSL certificate verification (use for sites with broken chains)",
+ )
  arg_parser.add_argument(
  "--cache",
  action="store_true",
@@ -112,6 +117,7 @@ def main():
  concurrency=args.concurrency,
  per_host=args.concurrency_per_host,
  respect_robots=args.respect_robots,
+ verify_ssl=not args.insecure,
  headers=custom_headers
  )
  engine = WXPathEngine(crawler=crawler)
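The new --insecure flag is only a CLI front end for the Crawler's verify_ssl argument. A minimal sketch of the equivalent library-level wiring, assuming the import paths and constructor arguments shown elsewhere in this diff (illustrative, not part of the package):

    from wxpath.core.runtime.engine import WXPathEngine
    from wxpath.http.client.crawler import Crawler

    # --insecure on the CLI becomes verify_ssl=False here; the value is later
    # forwarded to aiohttp.TCPConnector(ssl=...) when the session is built.
    crawler = Crawler(verify_ssl=False)
    engine = WXPathEngine(crawler=crawler)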
wxpath/core/models.py CHANGED
@@ -61,6 +61,7 @@ class InfiniteCrawlIntent(ProcessIntent):
 
  @dataclass(slots=True)
  class ExtractIntent(ProcessIntent):
+ """TODO: May be redundant with ProcessIntent?"""
  pass
 
 
wxpath/core/ops.py CHANGED
@@ -19,6 +19,7 @@ from wxpath.core.parser import (
  Binary,
  Call,
  ContextItem,
+ Depth,
  Segment,
  Segments,
  String,
@@ -78,7 +79,10 @@ def get_operator(
 
 
  @register('url', (String,))
+ @register('url', (String, Depth))
  @register('url', (String, Xpath))
+ @register('url', (String, Depth, Xpath))
+ @register('url', (String, Xpath, Depth))
  def _handle_url_str_lit(curr_elem: html.HtmlElement,
  curr_segments: list[Url | Xpath],
  curr_depth: int, **kwargs) -> Iterable[Intent]:
@@ -87,9 +91,12 @@ def _handle_url_str_lit(curr_elem: html.HtmlElement,
 
  next_segments = curr_segments[1:]
 
- if len(url_call.args) == 2:
+ # NOTE: Expects parser to produce UrlCrawl node in expressions
+ # that look like `url('...', follow=//a/@href)`
+ if isinstance(url_call, UrlCrawl):
+ xpath_arg = [arg for arg in url_call.args if isinstance(arg, Xpath)][0]
  _segments = [
- UrlCrawl('///url', [url_call.args[1], url_call.args[0].value])
+ UrlCrawl('///url', [xpath_arg, url_call.args[0].value])
  ] + next_segments
 
  yield CrawlIntent(url=url_call.args[0].value, next_segments=_segments)
@@ -112,16 +119,6 @@ def _handle_xpath(curr_elem: html.HtmlElement,
  raise ValueError("Element must be provided when path_expr does not start with 'url()'.")
  base_url = getattr(curr_elem, 'base_url', None)
  log.debug("base url", extra={"depth": curr_depth, "op": 'xpath', "base_url": base_url})
-
- _backlink_str = f"string('{curr_elem.get('backlink')}')"
- # We use the root tree's depth and not curr_depth because curr_depth accounts for a +1
- # increment after each url*() hop
- _depth_str = f"number({curr_elem.getroottree().getroot().get('depth')})"
- expr = expr.replace('wx:backlink()', _backlink_str)
- expr = expr.replace('wx:backlink(.)', _backlink_str)
- expr = expr.replace('wx:depth()', _depth_str)
- expr = expr.replace('wx:depth(.)', _depth_str)
-
  elems = curr_elem.xpath3(expr)
 
  next_segments = curr_segments[1:]
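With the extra registrations above, a url() call dispatches to the same handler whether or not it carries a depth argument. Roughly, and only as an illustration of the registered signatures (the example URL is hypothetical):

    # url('https://example.org')                             -> (String,)
    # url('https://example.org', depth=2)                    -> (String, Depth)
    # url('https://example.org', follow=//a/@href)           -> (String, Xpath)
    # url('https://example.org', follow=//a/@href, depth=2)  -> (String, Xpath, Depth)
    # url('https://example.org', depth=2, follow=//a/@href)  -> (String, Depth, Xpath)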
wxpath/core/parser.py CHANGED
@@ -13,7 +13,8 @@ except ImportError:
 
 
  TOKEN_SPEC = [
- ("NUMBER", r"\d+(\.\d+)?"),
+ ("NUMBER", r"\d+\.\d+"),
+ ("INTEGER", r"\d+"),
  ("STRING", r"'([^'\\]|\\.)*'|\"([^\"\\]|\\.)*\""), # TODO: Rename to URL Literal
  ("WXPATH", r"/{0,3}\s*url"), # Must come before NAME to match 'url' as WXPATH
  # ("///URL", r"/{3}\s*url"),
@@ -22,6 +23,7 @@ TOKEN_SPEC = [
  ("URL", r"\s*url"), # Must come before NAME to match 'url' as WXPATH
  # ("NAME", r"[a-zA-Z_][a-zA-Z0-9_]*"),
  ("FOLLOW", r",?\s{,}follow="),
+ ("DEPTH", r",?\s{,}depth="),
  ("OP", r"\|\||<=|>=|!=|=|<|>|\+|-|\*|/|!"), # Added || for string concat
  ("LPAREN", r"\("),
  ("RPAREN", r"\)"),
@@ -63,6 +65,14 @@ def tokenize(src: str):
  class Number:
  value: float
 
+ @dataclass
+ class Integer:
+ value: int
+
+ @dataclass
+ class Depth(Integer):
+ pass
+
  @dataclass
  class String:
  value: str
@@ -273,6 +283,10 @@ class Parser:
  if tok.type == "NUMBER":
  self.advance()
  return Number(float(tok.value))
+
+ if tok.type == "INTEGER":
+ self.advance()
+ return Integer(int(tok.value))
 
  if tok.type == "STRING":
  self.advance()
@@ -358,18 +372,18 @@ class Parser:
  self.advance()
 
  return result
-
 
  def capture_url_arg_content(self) -> list[Call | Xpath | ContextItem]:
  """Capture content inside a url() call, handling nested wxpath expressions.
 
  Supports patterns like::
 
- url('...') -> [String]
- url('...' follow=//a/@href) -> [String, Xpath]
- url(//a/@href) -> [Xpath]
- url( url('..')//a/@href ) -> [Call, Xpath]
- url( url( url('..')//a )//b ) -> [Call, Xpath]
+ url('...') -> [String]
+ url('...' follow=//a/@href) -> [String, Xpath]
+ url('...' follow=//a/@href depth=2) -> [String, Xpath, Integer]
+ url(//a/@href depth=2) -> [Xpath, Integer]
+ url( url('..')//a/@href ) -> [Call, Xpath]
+ url( url( url('..')//a )//b ) -> [Call, Xpath]
 
  Returns:
  A list of parsed elements: Xpath nodes for xpath content and Call
@@ -380,7 +394,10 @@ class Parser:
  paren_balance = 1 # We're already inside the opening paren of url()
  brace_balance = 0 # Track braces for map constructors
  reached_follow_token = False
+ reached_depth_token = False
  follow_xpath = ""
+ depth_number = ""
+
  while paren_balance > 0 and self.token.type != "EOF":
  if self.token.type == "WXPATH":
  # Found nested wxpath: save any accumulated xpath content first
@@ -396,13 +413,22 @@ class Parser:
 
  elif self.token.type == "FOLLOW":
  reached_follow_token = True
+ reached_depth_token = False
+ self.advance()
+
+ elif self.token.type == "DEPTH":
+ reached_depth_token = True
+ reached_follow_token = False
  self.advance()
 
  elif self.token.type == "LPAREN":
  # Opening paren that's NOT part of a url() call
  # (it's part of an xpath function like contains(), starts-with(), etc.)
  paren_balance += 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()
 
  elif self.token.type == "RPAREN":
@@ -410,26 +436,37 @@ class Parser:
  if paren_balance == 0:
  # This is the closing paren of the outer url()
  break
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()
 
  elif self.token.type == "LBRACE":
  # Opening brace for map constructors
  brace_balance += 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()
 
  elif self.token.type == "RBRACE":
  brace_balance -= 1
- current_xpath += self.token.value
+ if not reached_follow_token:
+ current_xpath += self.token.value
+ else:
+ follow_xpath += self.token.value
  self.advance()
 
  else:
  # Accumulate all other tokens as xpath content
- if not reached_follow_token:
- current_xpath += self.token.value
- else:
+ if reached_follow_token:
  follow_xpath += self.token.value
+ elif reached_depth_token:
+ depth_number += self.token.value
+ else:
+ current_xpath += self.token.value
 
  self.advance()
 
@@ -447,6 +484,9 @@ class Parser:
  if follow_xpath.strip():
  elements.append(Xpath(follow_xpath.strip()))
 
+ if depth_number.strip():
+ elements.append(Depth(int(depth_number.strip())))
+
  return elements
 
  def parse_call(self, func_name: str) -> Call | Segments:
@@ -462,13 +502,16 @@ class Parser:
  self.advance()
  # Handle follow=...
  if self.token.type == "FOLLOW":
- self.advance()
  follow_arg = self.capture_url_arg_content()
  args.extend(follow_arg)
+ if self.token.type == "DEPTH":
+ depth_arg = self.capture_url_arg_content()
+ args.extend(depth_arg)
  elif self.token.type == "WXPATH":
  # Nested wxpath: url( url('...')//a/@href ) or url( /url(...) )
- # Use capture_url_arg_content to handle nested wxpath and xpath
- args = self.capture_url_arg_content()
+ # NOTE: We used to use capture_url_arg_content to handle nested wxpath and xpath
+ # args = self.capture_url_arg_content()
+ args = self.nud()
  else:
  # Simple xpath argument: url(//a/@href)
  # Could still contain nested wxpath, so use capture_url_arg_content
@@ -489,8 +532,18 @@ class Parser:
 
  return _specify_call_types(func_name, args)
 
-
  def _specify_call_types(func_name: str, args: list) -> Call | Segments:
+ """
+ Specify the type of a call based on the function name and arguments.
+ TODO: Provide example wxpath expressions for each call type.
+
+ Args:
+ func_name: The name of the function.
+ args: The arguments of the function.
+
+ Returns:
+ Call | Segments: The type of the call.
+ """
  if func_name == "url":
  if len(args) == 1:
  if isinstance(args[0], String):
@@ -500,17 +553,33 @@ def _specify_call_types(func_name: str, args: list) -> Call | Segments:
  else:
  raise ValueError(f"Unknown argument type: {type(args[0])}")
  elif len(args) == 2:
- if isinstance(args[0], String) and isinstance(args[1], Xpath):
+ arg0, arg1 = args
+ if isinstance(arg0, String) and isinstance(arg1, Xpath):
+ # Example: url('...', follow=//a/@href)
  return UrlCrawl(func_name, args)
- elif isinstance(args[0], UrlLiteral) and isinstance(args[1], Xpath):
+ elif isinstance(arg0, String) and isinstance(arg1, Integer):
+ # Example: url('...', depth=2)
+ return UrlLiteral(func_name, args)
+ elif isinstance(arg0, UrlLiteral) and isinstance(arg1, Xpath):
  args.append(UrlQuery('url', [ContextItem()]))
  return Segments(args)
- elif isinstance(args[0], (Segments, list)) and isinstance(args[1], Xpath):
- segs = args[0]
- segs.append(args[1])
+ elif isinstance(arg0, (Segments, list)) and isinstance(arg1, Xpath):
+ segs = arg0
+ segs.append(arg1)
  return Segments(segs)
  else:
  raise ValueError(f"Unknown arguments: {args}")
+ elif len(args) == 3:
+ arg0, arg1, arg2 = args
+ if (isinstance(arg0, String) and (
+ (isinstance(arg1, Xpath) and isinstance(arg2, Integer)) or
+ (isinstance(arg1, Integer) and isinstance(arg2, Xpath))
+ )):
+ # Example: url('...', follow=//a/@href, depth=2)
+ # Example: url('...', depth=2, follow=//a/@href)
+ return UrlCrawl(func_name, args)
+ else:
+ raise ValueError(f"Unknown arguments: {args}")
  else:
  raise ValueError(f"Unknown arguments: {args}")
  elif func_name == "/url" or func_name == "//url":
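Taken together, the parser changes make depth= a first-class url() argument that ends up as a Depth node on the parsed call. A hedged sketch of the entry point the engine uses below (the module import and the example expression are assumptions for illustration):

    from wxpath.core import parser

    # follow= is captured as an Xpath node and depth= as a Depth node on the
    # resulting UrlCrawl call; the engine later reads the Depth value to cap
    # the crawl via _get_max_depth.
    tree = parser.parse("url('https://example.org', follow=//a/@href, depth=2)")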
@@ -18,7 +18,7 @@ from wxpath.core.models import (
  ProcessIntent,
  )
  from wxpath.core.ops import get_operator
- from wxpath.core.parser import Binary, Segment, Segments
+ from wxpath.core.parser import Binary, Depth, Segment, Segments
  from wxpath.core.runtime.helpers import parse_html
  from wxpath.hooks.registry import FetchContext, get_hooks
  from wxpath.http.client.crawler import Crawler
@@ -158,17 +158,48 @@ class WXPathEngine(HookedEngineBase):
  if allow_redirects:
  self.allowed_response_codes |= {301, 302, 303, 307, 308}
 
+ def _get_max_depth(self, bin_or_segs: Binary | Segments, max_depth: int) -> int:
+ """Get the maximum crawl depth for a given expression. Will find a Depth
+ argument at the beginning of the expression and return its value. Otherwise, returns the
+ max_depth value provided.
+ TODO: There has to be a better way to do this.
+ """
+ if isinstance(bin_or_segs, Binary):
+ if hasattr(bin_or_segs.left, 'func') == 'url':
+ depth_arg = [arg for arg in bin_or_segs.left.args if isinstance(arg, Depth)][0]
+ return int(depth_arg.value)
+ elif hasattr(bin_or_segs.right, 'func') == 'url':
+ depth_arg = [arg for arg in bin_or_segs.right.args if isinstance(arg, Depth)][0]
+ return int(depth_arg.value)
+ elif isinstance(bin_or_segs, Segments):
+ depth_arg = [arg for arg in bin_or_segs[0].args if isinstance(arg, Depth)]
+ if depth_arg:
+ return int(depth_arg[0].value)
+ return max_depth
+
  async def run(
  self,
  expression: str,
  max_depth: int,
- progress: bool = False
+ progress: bool = False,
+ yield_errors: bool = False,
  ) -> AsyncGenerator[Any, None]:
  """Execute a wxpath expression concurrently and yield results.
 
  Builds and drives a BFS-like crawl pipeline that honors robots rules,
  throttling, and hook callbacks while walking the web graph.
 
+ NOTES ON max_depth:
+ If depth is provided in the expression, it will be used to limit the depth of the
+ crawl. If depth is provided in the expression and max_depth is provided as an argument
+ to `run`, the inline depth in the expression will take precedence.
+
+ Currently, max_depth control flow logic is detected and executed in the
+ engine. In the future, the operation handlers (ops.py) could be responsible for
+ detecting max_depth, and sending a terminal intent to the engine. It's also possible
+ that the depth terminals are relative to the current depth (i.e. `url(//xpath, depth=2)`
+ implies crawling only the next 2 levels). This is not yet supported.
+
 
  Args:
  expression: WXPath expression string to evaluate.
@@ -178,7 +209,9 @@ class WXPathEngine(HookedEngineBase):
  Extracted values produced by the expression (HTML elements or
  wxpath-specific value types).
  """
- segments = parser.parse(expression)
+ bin_or_segs = parser.parse(expression)
+
+ max_depth = self._get_max_depth(bin_or_segs, max_depth)
 
  queue: asyncio.Queue[CrawlTask] = asyncio.Queue()
  inflight: dict[str, CrawlTask] = {}
@@ -222,7 +255,7 @@ class WXPathEngine(HookedEngineBase):
  seed_task = CrawlTask(
  elem=None,
  url=None,
- segments=segments,
+ segments=bin_or_segs,
  depth=-1,
  backlink=None,
  )
@@ -248,12 +281,32 @@ class WXPathEngine(HookedEngineBase):
 
  if task is None:
  log.warning(f"Got unexpected response from {resp.request.url}")
+
+ if yield_errors:
+ yield {
+ "__type__": "error",
+ "url": resp.request.url,
+ "reason": "unexpected_response",
+ "status": resp.body,
+ "body": resp.body
+ }
+
  if is_terminal():
  break
  continue
 
  if resp.error:
  log.warning(f"Got error from {resp.request.url}: {resp.error}")
+
+ if yield_errors:
+ yield {
+ "__type__": "error",
+ "url": resp.request.url,
+ "reason": "network_error",
+ "exception": str(resp.error),
+ "status": resp.status,
+ "body": resp.body
+ }
  if is_terminal():
  break
  continue
@@ -261,6 +314,16 @@ class WXPathEngine(HookedEngineBase):
  # NOTE: Consider allowing redirects
  if resp.status not in self.allowed_response_codes or not resp.body:
  log.warning(f"Got non-200 response from {resp.request.url}")
+
+ if yield_errors:
+ yield {
+ "__type__": "error",
+ "url": resp.request.url,
+ "reason": "bad_status",
+ "status": resp.status,
+ "body": resp.body
+ }
+
  if is_terminal():
  break
  continue
@@ -276,6 +339,7 @@ class WXPathEngine(HookedEngineBase):
  base_url=task.url,
  backlink=task.backlink,
  depth=task.depth,
+ response=resp
  )
 
  elem = await self.post_parse_hooks(elem, task)
@@ -388,10 +452,12 @@ class WXPathEngine(HookedEngineBase):
  def wxpath_async(path_expr: str,
  max_depth: int,
  progress: bool = False,
- engine: WXPathEngine | None = None) -> AsyncGenerator[Any, None]:
+ engine: WXPathEngine | None = None,
+ yield_errors: bool = False
+ ) -> AsyncGenerator[Any, None]:
  if engine is None:
  engine = WXPathEngine()
- return engine.run(path_expr, max_depth, progress=progress)
+ return engine.run(path_expr, max_depth, progress=progress, yield_errors=yield_errors)
 
 
  ##### ASYNC IN SYNC #####
@@ -400,6 +466,7 @@ def wxpath_async_blocking_iter(
  max_depth: int = 1,
  progress: bool = False,
  engine: WXPathEngine | None = None,
+ yield_errors: bool = False
  ) -> Iterator[Any]:
  """Evaluate a wxpath expression using concurrent breadth-first traversal.
 
@@ -419,7 +486,8 @@ def wxpath_async_blocking_iter(
  """
  loop = asyncio.new_event_loop()
  asyncio.set_event_loop(loop)
- agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress, engine=engine)
+ agen = wxpath_async(path_expr, max_depth=max_depth, progress=progress,
+ engine=engine, yield_errors=yield_errors)
 
  try:
  while True:
@@ -437,8 +505,11 @@ def wxpath_async_blocking(
  max_depth: int = 1,
  progress: bool = False,
  engine: WXPathEngine | None = None,
+ yield_errors: bool = False
  ) -> list[Any]:
  return list(wxpath_async_blocking_iter(path_expr,
  max_depth=max_depth,
  progress=progress,
- engine=engine))
+ engine=engine,
+ yield_errors=yield_errors,
+ ))
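yield_errors threads through all three public entry points, so failed fetches can be surfaced as dicts instead of only being logged. A usage sketch under those signatures (the expression and URL are illustrative; note that an inline depth= overrides the max_depth argument per the run() docstring above):

    from wxpath import wxpath_async_blocking_iter

    expr = "url('https://example.org', follow=//a/@href, depth=2)//title/text()"
    for item in wxpath_async_blocking_iter(expr, max_depth=5, yield_errors=True):
        if isinstance(item, dict) and item.get("__type__") == "error":
            print("failed:", item["url"], item["reason"], item["status"])
        else:
            print(item)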
@@ -6,7 +6,7 @@ from wxpath.util.logging import get_logger
  log = get_logger(__name__)
 
 
- def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
+ def parse_html(content, base_url=None, response=None, **elem_kv_pairs) -> html.HtmlElement:
  elem = etree.HTML(content, parser=patches.html_parser_with_xpath3, base_url=base_url)
  if base_url:
  elem.getroottree().docinfo.URL = base_url # make base-uri() work
@@ -14,12 +14,15 @@ def parse_html(content, base_url=None, **elem_kv_pairs) -> html.HtmlElement:
  elem.set("{http://www.w3.org/XML/1998/namespace}base", base_url)
  elem.base_url = base_url # sets both attribute and doc-level URL
 
- # NOTE: some pages may have multiple root elements, i.e.
+ if response:
+ elem.response = response
+ elem.getroottree().getroot().response = response
+ # NOTE: some pages may have multiple root elements, i.e.
  # len(elem.itersiblings()) > 0 AND elem.getparent() is None.
  # This breaks elementpath. If elem has siblings, recreate the
  # root element and only the root element.
  if len(list(elem.itersiblings())) > 0:
- elem = detach_html_root(elem, base_url)
+ elem = detach_html_root(elem, base_url)
  for k, v in elem_kv_pairs.items():
  elem.set(k, str(v))
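Because parse_html now pins the Response onto the root element, downstream consumers can reach the original HTTP response from any parsed document. A sketch of how a post-parse hook might use it (the hook signature here is hypothetical, loosely mirroring the post_parse_hooks(elem, task) call in the engine):

    async def post_parse(elem, task):
        resp = getattr(elem.getroottree().getroot(), "response", None)
        if resp is not None:
            # e.g. inspect or filter pages by status / body size before extraction
            print(task.url, resp.status, len(resp.body))
        return elem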
@@ -5,5 +5,5 @@ from wxpath.http.client.response import Response
  __all__ = [
  "Crawler",
  "Request",
- "Response"
+ "Response",
  ]
@@ -1,7 +1,7 @@
  import aiohttp
 
  try:
- from aiohttp_client_cache import CachedSession, SQLiteBackend
+ from aiohttp_client_cache import CachedSession
  except ImportError:
  CachedSession = None
 
@@ -42,7 +42,7 @@ def get_async_session(
  if timeout is None:
  timeout = aiohttp.ClientTimeout(total=CRAWLER_SETTINGS.timeout)
 
- if CACHE_SETTINGS.enabled and CachedSession and SQLiteBackend:
+ if CACHE_SETTINGS.enabled and CachedSession:
  log.info("using aiohttp-client-cache")
  return CachedSession(
  cache=get_cache_backend(),
@@ -71,6 +71,7 @@ class Crawler:
  *,
  headers: dict | None = None,
  proxies: dict | None = None,
+ verify_ssl: bool | None = None,
  retry_policy: RetryPolicy | None = None,
  throttler: AbstractThrottler | None = None,
  auto_throttle_target_concurrency: float = None,
@@ -82,6 +83,9 @@ class Crawler:
 
  self.concurrency = concurrency if concurrency is not None else cfg.concurrency
  self.per_host = per_host if per_host is not None else cfg.per_host
+ self._verify_ssl = verify_ssl if verify_ssl is not None else getattr(
+ cfg, "verify_ssl", True
+ )
 
  timeout = timeout if timeout is not None else cfg.timeout
  self._timeout = aiohttp.ClientTimeout(total=timeout)
@@ -141,7 +145,11 @@ class Crawler:
  """Construct an `aiohttp.ClientSession` with tracing and pooling."""
  trace_config = build_trace_config(self._stats)
  # Need to build the connector as late as possible as it requires the loop
- connector = aiohttp.TCPConnector(limit=self.concurrency*2, ttl_dns_cache=300)
+ connector = aiohttp.TCPConnector(
+ limit=self.concurrency * 2,
+ ttl_dns_cache=300,
+ ssl=self._verify_ssl,
+ )
  return get_async_session(
  headers=self._headers,
  timeout=self._timeout,
@@ -274,22 +282,26 @@ class Crawler:
  else:
  log.info("[CACHE MISS]", extra={"req.url": req.url, "resp.url": resp.url})
 
+ _start = time.monotonic()
  body = await resp.read()
 
- latency = time.monotonic() - start
+ end = time.monotonic()
+ latency = end - _start
  self.throttler.record_latency(host, latency)
 
  if self.retry_policy.should_retry(req, response=resp):
  await self._retry(req)
  return None
 
- return Response(req, resp.status, body, dict(resp.headers))
+ return Response(req, resp.status, body, dict(resp.headers),
+ request_start=_start, response_end=end)
  except asyncio.CancelledError:
  # Normal during shutdown / timeout propagation
  log.debug("cancelled error", extra={"url": req.url})
  raise
  except Exception as exc:
- latency = time.monotonic() - start
+ end = time.monotonic()
+ latency = end - start
  self.throttler.record_latency(host, latency)
 
  if self.retry_policy.should_retry(req, exception=exc):
@@ -297,7 +309,7 @@ class Crawler:
  return None
 
  log.error("request failed", extra={"url": req.url}, exc_info=exc)
- return Response(req, 0, b"", error=exc)
+ return Response(req, 0, b"", error=exc, request_start=start, response_end=end)
 
  async def _retry(self, req: Request) -> None:
  """Reschedule a request according to the retry policy."""
@@ -9,7 +9,7 @@ class Request:
  url: str
  method: str = "GET"
  headers: dict[str, str] = field(default_factory=dict)
- timeout: float = 15.0
+ timeout: float | None = None
 
  retries: int = 0
  max_retries: int | None = None
@@ -1,4 +1,3 @@
- # wxpath/http/response.py
  from dataclasses import dataclass, field
  from typing import Optional
 
@@ -12,3 +11,10 @@ class Response:
  body: bytes
  headers: dict[str, str] | None = None
  error: Optional[Exception] = field(default=None, kw_only=True)
+
+ request_start: float | None = None
+ response_end: float | None = None
+
+ @property
+ def latency(self) -> float:
+ return self.response_end - self.request_start
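Response now records wall-clock timestamps taken around the body read in the Crawler, and latency is derived from them. A small sketch using the re-exported classes (the URL and timing values are made up):

    from wxpath.http.client import Request, Response

    req = Request(url="https://example.org")
    resp = Response(req, 200, b"<html></html>", {},
                    request_start=10.00, response_end=10.42)
    print(resp.latency)  # ~0.42 seconds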
@@ -19,13 +19,13 @@ class RetryPolicy:
 
  if request.max_retries is not None and request.retries >= request.max_retries:
  return False
-
+
  if request.retries >= self.max_retries:
  return False
 
  if response is not None and response.status in self.retry_statuses:
  return True
-
+
  if exception is not None:
  return True
 