lacuscore 1.12.10__py3-none-any.whl → 1.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lacuscore/helpers.py CHANGED
@@ -15,6 +15,8 @@ from pydantic_core import from_json
15
15
 
16
16
  from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
17
17
 
18
+ from playwright._impl._api_structures import Cookie # , StorageState
19
+
18
20
 
19
21
  class LacusCoreException(Exception):
20
22
  pass
@@ -71,7 +73,10 @@ class CaptureResponseJson(TypedDict, total=False):
71
73
  status: int
72
74
  last_redirected_url: str | None
73
75
  har: dict[str, Any] | None
74
- cookies: list[dict[str, str]] | None
76
+ cookies: list[Cookie] | None
77
+ # NOTE: should be that, but StorageState doesn't define the indexeddb
78
+ # storage: StorageState | None
79
+ storage: dict[str, Any] | None
75
80
  error: str | None
76
81
  html: str | None
77
82
  png: str | None
@@ -93,7 +98,10 @@ class CaptureSettings(BaseModel):
93
98
  user_agent: str | None = None
94
99
  proxy: str | dict[str, str] | None = None
95
100
  general_timeout_in_sec: int | None = None
96
- cookies: list[dict[str, Any]] | None = None
101
+ cookies: list[Cookie] | None = None
102
+ # NOTE: should be that, but StorageState doesn't define the indexeddb
103
+ # storage: StorageState | None = None
104
+ storage: dict[str, Any] | None = None
97
105
  headers: dict[str, str] | None = None
98
106
  http_credentials: dict[str, str] | None = None
99
107
  geolocation: dict[str, float] | None = None
@@ -197,10 +205,51 @@ class CaptureSettings(BaseModel):
197
205
  if not cookies:
198
206
  return None
199
207
  if isinstance(cookies, str):
200
- # Cookies are invalid, ignoring.
201
- pass
202
- elif isinstance(cookies, list):
203
- return cookies
208
+ # might be a json dump, try to load it and ignore otherwise
209
+ try:
210
+ cookies = json.loads(cookies)
211
+ except json.JSONDecodeError as e:
212
+ print(e)
213
+ # Cookies are invalid, ignoring.
214
+ return None
215
+ if isinstance(cookies, dict):
216
+ # might be a single cookie in the format name: value, make it a list
217
+ cookies = [cookies]
218
+ if isinstance(cookies, list):
219
+ # make sure the cookies are in the right format
220
+ to_return = []
221
+ for cookie in cookies:
222
+ if isinstance(cookie, dict):
223
+ if 'name' in cookie and 'value' in cookie:
224
+ to_return.append(cookie)
225
+ elif len(cookie) == 1:
226
+ # {'name': 'value'} => {'name': 'name', 'value': 'value'}
227
+ name, value = cookie.popitem()
228
+ if name and value:
229
+ to_return.append({'name': name, 'value': value})
230
+ else:
231
+ # invalid cookie, ignoring
232
+ pass
233
+ return to_return
234
+ return None
235
+
236
+ @field_validator('storage', mode='before')
237
+ @classmethod
238
+ def load_storage_json(cls, storage: Any) -> dict[str, Any] | None:
239
+ """That's the storage as exported from Playwright:
240
+ https://playwright.dev/python/docs/api/class-browsercontext#browser-context-storage-state
241
+ """
242
+ if not storage:
243
+ return None
244
+ if isinstance(storage, str):
245
+ # might be a json dump, try to load it and ignore otherwise
246
+ try:
247
+ storage = json.loads(storage)
248
+ except json.JSONDecodeError:
249
+ # storage is invalid, ignoring.
250
+ return None
251
+ if isinstance(storage, dict) and 'cookies' in storage and 'origins' in storage:
252
+ return storage
204
253
  return None
205
254
 
206
255
  @field_validator('headers', mode='before')
lacuscore/lacuscore.py CHANGED
@@ -12,14 +12,13 @@ import re
12
12
  import sys
13
13
  import time
14
14
  import unicodedata
15
- import zlib
16
15
 
17
16
  from asyncio import Task
18
17
  from base64 import b64decode, b64encode
19
18
  from datetime import date, timedelta
20
19
  from ipaddress import ip_address, IPv4Address, IPv6Address
21
20
  from tempfile import NamedTemporaryFile
22
- from typing import Literal, Any, overload, cast
21
+ from typing import Literal, Any, overload, cast, TYPE_CHECKING
23
22
  from collections.abc import Iterator
24
23
  from uuid import uuid4
25
24
  from urllib.parse import urlsplit
@@ -56,6 +55,9 @@ else:
56
55
  if timeout_cm.expired():
57
56
  logger.warning(f'Timeout expired: {error_message}')
58
57
 
58
+ if TYPE_CHECKING:
59
+ from playwright._impl._api_structures import Cookie
60
+
59
61
 
60
62
  BROWSER = Literal['chromium', 'firefox', 'webkit']
61
63
 
@@ -131,6 +133,7 @@ class LacusCore():
131
133
  proxy: str | dict[str, str] | None=None,
132
134
  general_timeout_in_sec: int | None=None,
133
135
  cookies: list[dict[str, Any]] | None=None,
136
+ storage: dict[str, Any] | None=None,
134
137
  headers: dict[str, str] | None=None,
135
138
  http_credentials: dict[str, str] | None=None,
136
139
  geolocation: dict[str, float] | None=None,
@@ -162,6 +165,7 @@ class LacusCore():
162
165
  proxy: str | dict[str, str] | None=None,
163
166
  general_timeout_in_sec: int | None=None,
164
167
  cookies: list[dict[str, Any]] | None=None,
168
+ storage: dict[str, Any] | None=None,
165
169
  headers: dict[str, str] | None=None,
166
170
  http_credentials: dict[str, str] | None=None,
167
171
  geolocation: dict[str, float] | None=None,
@@ -195,6 +199,7 @@ class LacusCore():
195
199
  :param proxy: SOCKS5 proxy to use for capturing
196
200
  :param general_timeout_in_sec: The capture will raise a timeout if it takes more than that time
197
201
  :param cookies: A list of cookies
202
+ :param storage: A storage state from another capture
198
203
  :param headers: The headers to pass to the capture
199
204
  :param http_credentials: HTTP Credentials to pass to the capture
200
205
  :param geolocation: Geolocation of the browser to pass to the capture
@@ -223,7 +228,7 @@ class LacusCore():
223
228
  'browser': browser, 'device_name': device_name,
224
229
  'user_agent': user_agent, 'proxy': proxy,
225
230
  'general_timeout_in_sec': general_timeout_in_sec,
226
- 'cookies': cookies, 'headers': headers,
231
+ 'cookies': cookies, 'storage': storage, 'headers': headers,
227
232
  'http_credentials': http_credentials, 'geolocation': geolocation,
228
233
  'timezone_id': timezone_id, 'locale': locale,
229
234
  'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
@@ -232,7 +237,6 @@ class LacusCore():
232
237
  # Quietly force it to true if headed is not allowed.
233
238
  'headless': headless if self.headed_allowed else True,
234
239
  'max_retries': max_retries}
235
-
236
240
  try:
237
241
  to_enqueue = CaptureSettings(**settings)
238
242
  except ValidationError as e:
@@ -245,7 +249,6 @@ class LacusCore():
245
249
  if isinstance(existing_uuid, bytes):
246
250
  return existing_uuid.decode()
247
251
  return existing_uuid
248
-
249
252
  if uuid:
250
253
  # Make sure we do not already have a capture with that UUID
251
254
  if self.get_capture_status(uuid) == CaptureStatus.UNKNOWN:
@@ -468,7 +471,7 @@ class LacusCore():
468
471
  else:
469
472
  browser_engine = 'webkit'
470
473
 
471
- cookies: list[dict[str, Any]] = []
474
+ cookies: list[Cookie] = []
472
475
  if to_capture.cookies:
473
476
  # In order to properly pass the cookies to playwright,
474
477
  # each of them must have a name, a value and either a domain + path or a URL
@@ -477,18 +480,16 @@ class LacusCore():
477
480
  # with the hostname of the URL we try to capture and the path with "/"
478
481
  # NOTE: these changes can only be done here because we need the URL.
479
482
  for cookie in to_capture.cookies:
480
- if len(cookie) == 1:
481
- # we have a cookie in the format key: value
482
- name, value = cookie.popitem()
483
- cookie = {'name': name, 'value': value}
484
483
  if 'name' not in cookie or 'value' not in cookie:
485
484
  logger.warning(f'Invalid cookie: {cookie}')
486
485
  continue
487
486
  if 'domain' not in cookie and 'url' not in cookie:
487
+ if not splitted_url.hostname:
488
+ # If for any reason we cannot get the hostname there, ignore the cookie
489
+ continue
488
490
  cookie['domain'] = splitted_url.hostname
489
491
  cookie['path'] = '/'
490
492
  cookies.append(cookie)
491
-
492
493
  try:
493
494
  logger.debug(f'Capturing {url}')
494
495
  stats_pipeline.sadd(f'stats:{today}:captures', url)
@@ -503,6 +504,7 @@ class LacusCore():
503
504
  # required by Mypy: https://github.com/python/mypy/issues/3004
504
505
  capture.headers = to_capture.headers # type: ignore[assignment]
505
506
  capture.cookies = cookies # type: ignore[assignment]
507
+ capture.storage = to_capture.storage # type: ignore[assignment]
506
508
  capture.viewport = to_capture.viewport # type: ignore[assignment]
507
509
  capture.user_agent = to_capture.user_agent # type: ignore[assignment]
508
510
  capture.http_credentials = to_capture.http_credentials # type: ignore[assignment]
@@ -666,6 +668,8 @@ class LacusCore():
666
668
  hash_to_set['har'] = pickle.dumps(results['har'])
667
669
  if results.get('cookies'):
668
670
  hash_to_set['cookies'] = pickle.dumps(results['cookies'])
671
+ if results.get('storage'):
672
+ hash_to_set['storage'] = pickle.dumps(results['storage'])
669
673
  if results.get('potential_favicons'):
670
674
  hash_to_set['potential_favicons'] = pickle.dumps(results['potential_favicons'])
671
675
  if results.get('html') and results['html'] is not None:
@@ -685,7 +689,7 @@ class LacusCore():
685
689
  hash_to_set['children'] = pickle.dumps(children)
686
690
 
687
691
  for key in results.keys():
688
- if key in ['har', 'cookies', 'potential_favicons', 'html', 'children'] or not results.get(key):
692
+ if key in ['har', 'cookies', 'storage', 'potential_favicons', 'html', 'children'] or not results.get(key):
689
693
  continue
690
694
  # these entries can be stored directly
691
695
  hash_to_set[key] = results[key] # type: ignore[literal-required]
@@ -702,12 +706,6 @@ class LacusCore():
702
706
  if root_key is None:
703
707
  root_key = f'lacus:capture_results_hash:{capture_uuid}'
704
708
 
705
- if not self.redis.exists(root_key):
706
- if old_response := self.redis.get(f'lacus:capture_results:{capture_uuid}'):
707
- # TODO: remove in 1.8.* - old format used last in 1.6, and kept no more than 10H in redis
708
- return pickle.loads(zlib.decompress(old_response))
709
- return None
710
-
711
709
  # New format and capture done
712
710
 
713
711
  to_return: CaptureResponse = {}
@@ -716,6 +714,8 @@ class LacusCore():
716
714
  to_return['har'] = pickle.loads(value)
717
715
  elif key == b'cookies':
718
716
  to_return['cookies'] = pickle.loads(value)
717
+ elif key == b'storage':
718
+ to_return['storage'] = pickle.loads(value)
719
719
  elif key == b'potential_favicons':
720
720
  to_return['potential_favicons'] = pickle.loads(value)
721
721
  elif key == b'children':
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lacuscore
3
- Version: 1.12.10
3
+ Version: 1.13.1
4
4
  Summary: Core of Lacus, usable as a module
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
7
7
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9,<4.0
8
+ Requires-Python: >=3.9.2,<4.0
9
9
  Classifier: Development Status :: 5 - Production/Stable
10
10
  Classifier: Environment :: Console
11
11
  Classifier: Intended Audience :: Information Technology
@@ -14,7 +14,6 @@ Classifier: Intended Audience :: Telecommunications Industry
14
14
  Classifier: License :: OSI Approved :: BSD License
15
15
  Classifier: Operating System :: POSIX :: Linux
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
20
19
  Classifier: Programming Language :: Python :: 3.12
@@ -22,12 +21,12 @@ Classifier: Programming Language :: Python :: 3.13
22
21
  Classifier: Topic :: Internet
23
22
  Classifier: Topic :: Security
24
23
  Provides-Extra: docs
25
- Requires-Dist: Sphinx (>=8.1.3) ; (python_version >= "3.10") and (extra == "docs")
24
+ Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
26
25
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
27
26
  Requires-Dist: defang (>=0.5.3)
28
27
  Requires-Dist: dnspython (>=2.7.0)
29
28
  Requires-Dist: eval-type-backport (>=0.2.2) ; python_version < "3.10"
30
- Requires-Dist: playwrightcapture[recaptcha] (>=1.27.8)
29
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.28.1)
31
30
  Requires-Dist: pydantic (>=2.10.6)
32
31
  Requires-Dist: redis[hiredis] (>=5.2.1)
33
32
  Requires-Dist: requests (>=2.32.3)
@@ -0,0 +1,10 @@
1
+ lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
+ lacuscore/helpers.py,sha256=Nt3oMMDGgl3rDkDujuAaxWtb3cXeSGk2pdXg5lNEqhI,13188
3
+ lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
4
+ lacuscore/lacuscore.py,sha256=fzW-04_pFK_Bqv-66-ei4OLS3nxpHmNEXCn4_2pXhTU,42212
5
+ lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
7
+ lacuscore-1.13.1.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
+ lacuscore-1.13.1.dist-info/METADATA,sha256=IyAbfTlzx6MfiGGSm4xcQsz_l7zlAS4iKdhpc1xlOuA,2570
9
+ lacuscore-1.13.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
10
+ lacuscore-1.13.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.0.1
2
+ Generator: poetry-core 2.1.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,10 +0,0 @@
1
- lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
- lacuscore/helpers.py,sha256=GKgy8-kvGJLrOv431AtQRTtMSJ5GNtnwsj-K6WqF0EA,10993
3
- lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
4
- lacuscore/lacuscore.py,sha256=S-qxQJR4WmJHDSAay9tBCWHz2f4uM6z1Elmox7hln54,42049
5
- lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
7
- lacuscore-1.12.10.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
- lacuscore-1.12.10.dist-info/METADATA,sha256=Z2TqBsZDn5K0omvbs0Gv_4TvZBkEAH1RxVO7XfXznl4,2619
9
- lacuscore-1.12.10.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
10
- lacuscore-1.12.10.dist-info/RECORD,,