lacuscore 1.9.4__py3-none-any.whl → 1.9.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lacuscore/lacuscore.py CHANGED
@@ -531,20 +531,40 @@ class LacusCore():
531
531
  browser_engine = 'firefox'
532
532
  else:
533
533
  browser_engine = 'webkit'
534
+
535
+ cookies: list[dict[str, Any]] = []
536
+ if to_capture.get('cookies') and to_capture['cookies'] is not None:
537
+ # In order to properly pass the cookies to playwright,
538
+ # each of them must have a name, a value and either a domain + path or a URL
539
+ # Name and value are mandatory, and we cannot auto-fill them.
540
+ # If the cookie doesn't have a domain + path OR a URL, we fill the domain
541
+ # with the hostname of the URL we try to capture and the path with "/"
542
+ for cookie in to_capture['cookies']:
543
+ if len(cookie) == 1:
544
+ # we have a cookie in the format key: value
545
+ name, value = cookie.popitem()
546
+ cookie = {'name': name, 'value': value}
547
+ if 'name' not in cookie or 'value' not in cookie:
548
+ logger.warning(f'Invalid cookie: {cookie}')
549
+ continue
550
+ if 'domain' not in cookie and 'url' not in cookie:
551
+ cookie['domain'] = splitted_url.hostname
552
+ cookie['path'] = '/'
553
+ cookies.append(cookie)
554
+
534
555
  try:
535
556
  logger.debug(f'Capturing {url}')
536
- general_timeout = to_capture.get('general_timeout_in_sec')
537
557
  stats_pipeline.sadd(f'stats:{today}:captures', url)
538
558
  async with Capture(
539
559
  browser=browser_engine,
540
560
  device_name=to_capture.get('device_name'),
541
561
  proxy=proxy,
542
- general_timeout_in_sec=general_timeout,
562
+ general_timeout_in_sec=to_capture.get('general_timeout_in_sec'),
543
563
  loglevel=self.master_logger.getEffectiveLevel(),
544
564
  uuid=uuid) as capture:
545
565
  # required by Mypy: https://github.com/python/mypy/issues/3004
546
566
  capture.headers = to_capture.get('headers') # type: ignore[assignment]
547
- capture.cookies = to_capture.get('cookies') # type: ignore[assignment]
567
+ capture.cookies = cookies # type: ignore[assignment]
548
568
  capture.viewport = to_capture.get('viewport') # type: ignore[assignment]
549
569
  capture.user_agent = to_capture.get('user_agent') # type: ignore[assignment]
550
570
  capture.http_credentials = to_capture.get('http_credentials') # type: ignore[assignment]
@@ -552,27 +572,30 @@ class LacusCore():
552
572
  capture.timezone_id = to_capture.get('timezone_id') # type: ignore[assignment]
553
573
  capture.locale = to_capture.get('locale') # type: ignore[assignment]
554
574
  capture.color_scheme = to_capture.get('color_scheme') # type: ignore[assignment]
575
+
576
+ # make sure the initialization doesn't take too long
577
+ init_timeout = max(self.max_capture_time / 10, 5)
555
578
  try:
556
- # make sure the initialization doesn't take too long
557
- if general_timeout is None:
558
- general_timeout = 5
559
- init_timeout = max(general_timeout / 2, 5)
560
579
  async with timeout(init_timeout) as initialize_timeout:
561
580
  await capture.initialize_context()
562
-
563
581
  except (TimeoutError, asyncio.exceptions.TimeoutError):
564
582
  timeout_expired(initialize_timeout, logger, 'Initializing took too long.')
565
- logger.warning(f'Initializing the context for {url} took longer than the allowed general timeout ({general_timeout}s)')
566
- raise RetryCapture(f'Initializing the context for {url} took longer than the allowed general timeout ({general_timeout}s)')
567
-
568
- async with timeout(self.max_capture_time) as capture_timeout:
569
- playwright_result = await capture.capture_page(
570
- url, referer=to_capture.get('referer'),
571
- depth=to_capture.get('depth', 0),
572
- rendered_hostname_only=to_capture.get('rendered_hostname_only', True),
573
- with_favicon=to_capture.get('with_favicon', False),
574
- allow_tracking=to_capture.get('allow_tracking', False),
575
- max_depth_capture_time=self.max_capture_time)
583
+ logger.warning(f'Initializing the context for {url} took longer than the allowed initialization timeout ({init_timeout}s)')
584
+ raise RetryCapture(f'Initializing the context for {url} took longer than the allowed initialization timeout ({init_timeout}s)')
585
+
586
+ try:
587
+ async with timeout(self.max_capture_time) as capture_timeout:
588
+ playwright_result = await capture.capture_page(
589
+ url, referer=to_capture.get('referer'),
590
+ depth=to_capture.get('depth', 0),
591
+ rendered_hostname_only=to_capture.get('rendered_hostname_only', True),
592
+ with_favicon=to_capture.get('with_favicon', False),
593
+ allow_tracking=to_capture.get('allow_tracking', False),
594
+ max_depth_capture_time=self.max_capture_time)
595
+ except (TimeoutError, asyncio.exceptions.TimeoutError):
596
+ timeout_expired(capture_timeout, logger, 'Capture took too long.')
597
+ logger.warning(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
598
+ raise RetryCapture(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
576
599
  result = cast(CaptureResponse, playwright_result)
577
600
  if 'error' in result and 'error_name' in result:
578
601
  # generate stats
@@ -581,7 +604,7 @@ class LacusCore():
581
604
  except RetryCapture as e:
582
605
  raise e
583
606
  except PlaywrightCaptureException as e:
584
- logger.exception(f'Invalid parameters for the capture of {url} - {e}')
607
+ logger.warning(f'Invalid parameters for the capture of {url} - {e}')
585
608
  result = {'error': f'Invalid parameters for the capture of {url} - {e}'}
586
609
  raise CaptureError(f'Invalid parameters for the capture of {url} - {e}')
587
610
  except asyncio.CancelledError:
@@ -590,11 +613,6 @@ class LacusCore():
590
613
  # The capture can be canceled if it has been running for way too long.
591
614
  # We can give it another shot.
592
615
  raise RetryCapture(f'The capture of {url} has been cancelled.')
593
- except (TimeoutError, asyncio.exceptions.TimeoutError):
594
- timeout_expired(capture_timeout, logger, 'Capture took too long.')
595
- logger.warning(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
596
- result = {'error': f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)'}
597
- raise CaptureError(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
598
616
  except Exception as e:
599
617
  logger.exception(f'Something went poorly {url} - {e}')
600
618
  result = {'error': f'Something went poorly {url} - {e}'}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.9.4
3
+ Version: 1.9.6
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
@@ -28,9 +28,9 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
28
28
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
29
29
  Requires-Dist: defang (>=0.5.3,<0.6.0)
30
30
  Requires-Dist: dnspython (>=2.6.1,<3.0.0)
31
- Requires-Dist: playwrightcapture[recaptcha] (>=1.24.10,<2.0.0)
32
- Requires-Dist: redis[hiredis] (>=5.0.4,<6.0.0)
33
- Requires-Dist: requests (>=2.32.1,<3.0.0)
31
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.24.11,<2.0.0)
32
+ Requires-Dist: redis[hiredis] (>=5.0.6,<6.0.0)
33
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
34
34
  Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
35
35
  Project-URL: Documentation, https://lacuscore.readthedocs.io/en/latest/
36
36
  Project-URL: Repository, https://github.com/ail-project/LacusCore
@@ -1,10 +1,10 @@
1
1
  lacuscore/__init__.py,sha256=hM4lKoPNybDCUMWdXTVVI1gRk_riLvRZ7IwFbamZLzE,341
2
2
  lacuscore/helpers.py,sha256=lULN7HhY-4a4HG-ybIt4jO3wEGTxkm_jKNqsGpNZo4Y,2711
3
3
  lacuscore/lacus_monitoring.py,sha256=UOfE_1-_rhVeKJXQ_m9XxYkr7VwyQnA6iK-x_tcXJfo,2775
4
- lacuscore/lacuscore.py,sha256=zi6oOcwmZ0yENIfJy55baeGXw3Js4lg5fmQz0YFRNJk,43030
4
+ lacuscore/lacuscore.py,sha256=pk9E2w2zEH8l6VywXW4Ur_-XAMhFVDWTWIt2bIZ07Bo,44047
5
5
  lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  lacuscore/task_logger.py,sha256=8WbdJdKnGeFCxt9gtCNLI9vAQQZbsy2I5PRQpHP7XFU,1916
7
- lacuscore-1.9.4.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
- lacuscore-1.9.4.dist-info/METADATA,sha256=Lr1wGWp4zfC19UwS51sjTxznPywLBZ7-0e9rAF88cYs,2629
9
- lacuscore-1.9.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
10
- lacuscore-1.9.4.dist-info/RECORD,,
7
+ lacuscore-1.9.6.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
+ lacuscore-1.9.6.dist-info/METADATA,sha256=HGVQl2vRm95AmDm9d0z0OU89GX-8ZGUDsiSR8eotx3Y,2629
9
+ lacuscore-1.9.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
10
+ lacuscore-1.9.6.dist-info/RECORD,,