lacuscore 1.9.3__tar.gz → 1.9.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.9.3
3
+ Version: 1.9.5
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
@@ -28,9 +28,9 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
28
28
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
29
29
  Requires-Dist: defang (>=0.5.3,<0.6.0)
30
30
  Requires-Dist: dnspython (>=2.6.1,<3.0.0)
31
- Requires-Dist: playwrightcapture[recaptcha] (>=1.24.6,<2.0.0)
31
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.24.11,<2.0.0)
32
32
  Requires-Dist: redis[hiredis] (>=5.0.4,<6.0.0)
33
- Requires-Dist: requests (>=2.31.0,<3.0.0)
33
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
34
34
  Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
35
35
  Project-URL: Documentation, https://lacuscore.readthedocs.io/en/latest/
36
36
  Project-URL: Repository, https://github.com/ail-project/LacusCore
@@ -43,9 +43,19 @@ from .helpers import (
43
43
 
44
44
  if sys.version_info < (3, 11):
45
45
  from async_timeout import timeout
46
+
47
+ def timeout_expired(timeout_cm, logger, error_message: str) -> None: # type: ignore[no-untyped-def]
48
+ if timeout_cm.expired:
49
+ logger.warning(f'Timeout expired: {error_message}')
50
+
46
51
  else:
47
52
  from asyncio import timeout
48
53
 
54
+ def timeout_expired(timeout_cm, logger, error_message: str) -> None: # type: ignore[no-untyped-def]
55
+ if timeout_cm.expired():
56
+ logger.warning(f'Timeout expired: {error_message}')
57
+
58
+
49
59
  BROWSER = Literal['chromium', 'firefox', 'webkit']
50
60
 
51
61
 
@@ -523,15 +533,12 @@ class LacusCore():
523
533
  browser_engine = 'webkit'
524
534
  try:
525
535
  logger.debug(f'Capturing {url}')
526
- # NOTE: starting with python 3.11, we can use asyncio.timeout
527
- # async with asyncio.timeout(self.max_capture_time):
528
- general_timeout = to_capture.get('general_timeout_in_sec')
529
536
  stats_pipeline.sadd(f'stats:{today}:captures', url)
530
537
  async with Capture(
531
538
  browser=browser_engine,
532
539
  device_name=to_capture.get('device_name'),
533
540
  proxy=proxy,
534
- general_timeout_in_sec=general_timeout,
541
+ general_timeout_in_sec=to_capture.get('general_timeout_in_sec'),
535
542
  loglevel=self.master_logger.getEffectiveLevel(),
536
543
  uuid=uuid) as capture:
537
544
  # required by Mypy: https://github.com/python/mypy/issues/3004
@@ -544,20 +551,30 @@ class LacusCore():
544
551
  capture.timezone_id = to_capture.get('timezone_id') # type: ignore[assignment]
545
552
  capture.locale = to_capture.get('locale') # type: ignore[assignment]
546
553
  capture.color_scheme = to_capture.get('color_scheme') # type: ignore[assignment]
554
+
555
+ # make sure the initialization doesn't take too long
556
+ init_timeout = max(self.max_capture_time / 10, 5)
547
557
  try:
548
- async with timeout(general_timeout):
558
+ async with timeout(init_timeout) as initialize_timeout:
549
559
  await capture.initialize_context()
550
560
  except (TimeoutError, asyncio.exceptions.TimeoutError):
551
- logger.warning(f'Initializing the context for {url} took longer than the allowed general timeout ({general_timeout}s)')
552
- raise RetryCapture(f'Initializing the context for {url} took longer than the allowed general timeout ({general_timeout}s)')
553
- async with timeout(self.max_capture_time):
554
- playwright_result = await capture.capture_page(
555
- url, referer=to_capture.get('referer'),
556
- depth=to_capture.get('depth', 0),
557
- rendered_hostname_only=to_capture.get('rendered_hostname_only', True),
558
- with_favicon=to_capture.get('with_favicon', False),
559
- allow_tracking=to_capture.get('allow_tracking', False),
560
- max_depth_capture_time=self.max_capture_time)
561
+ timeout_expired(initialize_timeout, logger, 'Initializing took too long.')
562
+ logger.warning(f'Initializing the context for {url} took longer than the allowed initialization timeout ({init_timeout}s)')
563
+ raise RetryCapture(f'Initializing the context for {url} took longer than the allowed initialization timeout ({init_timeout}s)')
564
+
565
+ try:
566
+ async with timeout(self.max_capture_time) as capture_timeout:
567
+ playwright_result = await capture.capture_page(
568
+ url, referer=to_capture.get('referer'),
569
+ depth=to_capture.get('depth', 0),
570
+ rendered_hostname_only=to_capture.get('rendered_hostname_only', True),
571
+ with_favicon=to_capture.get('with_favicon', False),
572
+ allow_tracking=to_capture.get('allow_tracking', False),
573
+ max_depth_capture_time=self.max_capture_time)
574
+ except (TimeoutError, asyncio.exceptions.TimeoutError):
575
+ timeout_expired(capture_timeout, logger, 'Capture took too long.')
576
+ logger.warning(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
577
+ raise RetryCapture(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
561
578
  result = cast(CaptureResponse, playwright_result)
562
579
  if 'error' in result and 'error_name' in result:
563
580
  # generate stats
@@ -575,10 +592,6 @@ class LacusCore():
575
592
  # The capture can be canceled if it has been running for way too long.
576
593
  # We can give it another shot.
577
594
  raise RetryCapture(f'The capture of {url} has been cancelled.')
578
- except (TimeoutError, asyncio.exceptions.TimeoutError):
579
- logger.warning(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
580
- result = {'error': f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)'}
581
- raise CaptureError(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
582
595
  except Exception as e:
583
596
  logger.exception(f'Something went poorly {url} - {e}')
584
597
  result = {'error': f'Something went poorly {url} - {e}'}
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lacuscore"
3
- version = "1.9.3"
3
+ version = "1.9.5"
4
4
  description = "Core of Lacus, usable as a module"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -29,12 +29,12 @@ classifiers = [
29
29
 
30
30
  [tool.poetry.dependencies]
31
31
  python = "^3.8"
32
- requests = "^2.31.0"
32
+ requests = "^2.32.3"
33
33
  Sphinx = [
34
34
  {version = "<7.2", python = "<3.9", optional = true},
35
35
  {version = "^7.2", python = ">=3.9", optional = true}
36
36
  ]
37
- playwrightcapture = {extras = ["recaptcha"], version = "^1.24.6"}
37
+ playwrightcapture = {extras = ["recaptcha"], version = "^1.24.11"}
38
38
  defang = "^0.5.3"
39
39
  ua-parser = "^0.18.0"
40
40
  redis = {version = "^5.0.4", extras = ["hiredis"]}
@@ -47,14 +47,14 @@ docs = ["Sphinx"]
47
47
  [tool.poetry.group.dev.dependencies]
48
48
  types-redis = {version = "^4.6.0.20240425"}
49
49
  mypy = "^1.10.0"
50
- types-requests = "^2.31.0.20240406"
51
- types-beautifulsoup4 = "^4.12.0.20240229"
50
+ types-requests = "^2.32.0.20240523"
51
+ types-beautifulsoup4 = "^4.12.0.20240511"
52
52
  ipython = [
53
53
  {version = "<8.13.0", python = "<3.9"},
54
54
  {version = "^8.18.0", python = ">=3.9"},
55
55
  {version = "^8.19.0", python = ">=3.10"}
56
56
  ]
57
- pytest = "^8.2.0"
57
+ pytest = "^8.2.1"
58
58
 
59
59
  [build-system]
60
60
  requires = ["poetry_core"]
File without changes
File without changes
File without changes