lacuscore 1.13.0__py3-none-any.whl → 1.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lacuscore/helpers.py CHANGED
@@ -15,6 +15,8 @@ from pydantic_core import from_json
15
15
 
16
16
  from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
17
17
 
18
+ from playwright._impl._api_structures import Cookie # , StorageState
19
+
18
20
 
19
21
  class LacusCoreException(Exception):
20
22
  pass
@@ -71,7 +73,10 @@ class CaptureResponseJson(TypedDict, total=False):
71
73
  status: int
72
74
  last_redirected_url: str | None
73
75
  har: dict[str, Any] | None
74
- cookies: list[dict[str, str]] | None
76
+ cookies: list[Cookie] | None
77
+ # NOTE: the type should be StorageState (see the commented-out line below), but StorageState doesn't define the indexeddb
78
+ # storage: StorageState | None
79
+ storage: dict[str, Any] | None
75
80
  error: str | None
76
81
  html: str | None
77
82
  png: str | None
@@ -93,7 +98,10 @@ class CaptureSettings(BaseModel):
93
98
  user_agent: str | None = None
94
99
  proxy: str | dict[str, str] | None = None
95
100
  general_timeout_in_sec: int | None = None
96
- cookies: list[dict[str, Any]] | None = None
101
+ cookies: list[Cookie] | None = None
102
+ # NOTE: the type should be StorageState (see the commented-out line below), but StorageState doesn't define the indexeddb
103
+ # storage: StorageState | None = None
104
+ storage: dict[str, Any] | None = None
97
105
  headers: dict[str, str] | None = None
98
106
  http_credentials: dict[str, str] | None = None
99
107
  geolocation: dict[str, float] | None = None
@@ -197,10 +205,51 @@ class CaptureSettings(BaseModel):
197
205
  if not cookies:
198
206
  return None
199
207
  if isinstance(cookies, str):
200
- # Cookies are invalid, ignoring.
201
- pass
202
- elif isinstance(cookies, list):
203
- return cookies
208
+ # might be a json dump, try to load it and ignore otherwise
209
+ try:
210
+ cookies = json.loads(cookies)
211
+ except json.JSONDecodeError as e:
212
+ print(e)
213
+ # Cookies are invalid, ignoring.
214
+ return None
215
+ if isinstance(cookies, dict):
216
+ # might be a single cookie in the format name: value, make it a list
217
+ cookies = [cookies]
218
+ if isinstance(cookies, list):
219
+ # make sure the cookies are in the right format
220
+ to_return = []
221
+ for cookie in cookies:
222
+ if isinstance(cookie, dict):
223
+ if 'name' in cookie and 'value' in cookie:
224
+ to_return.append(cookie)
225
+ elif len(cookie) == 1:
226
+ # {'name': 'value'} => {'name': 'name', 'value': 'value'}
227
+ name, value = cookie.popitem()
228
+ if name and value:
229
+ to_return.append({'name': name, 'value': value})
230
+ else:
231
+ # invalid cookie, ignoring
232
+ pass
233
+ return to_return
234
+ return None
235
+
236
+ @field_validator('storage', mode='before')
237
+ @classmethod
238
+ def load_storage_json(cls, storage: Any) -> dict[str, Any] | None:
239
+ """That's the storage as exported from Playwright:
240
+ https://playwright.dev/python/docs/api/class-browsercontext#browser-context-storage-state
241
+ """
242
+ if not storage:
243
+ return None
244
+ if isinstance(storage, str):
245
+ # might be a json dump, try to load it and ignore otherwise
246
+ try:
247
+ storage = json.loads(storage)
248
+ except json.JSONDecodeError:
249
+ # storage is invalid, ignoring.
250
+ return None
251
+ if isinstance(storage, dict) and 'cookies' in storage and 'origins' in storage:
252
+ return storage
204
253
  return None
205
254
 
206
255
  @field_validator('headers', mode='before')
lacuscore/lacuscore.py CHANGED
@@ -18,7 +18,7 @@ from base64 import b64decode, b64encode
18
18
  from datetime import date, timedelta
19
19
  from ipaddress import ip_address, IPv4Address, IPv6Address
20
20
  from tempfile import NamedTemporaryFile
21
- from typing import Literal, Any, overload, cast
21
+ from typing import Literal, Any, overload, cast, TYPE_CHECKING
22
22
  from collections.abc import Iterator
23
23
  from uuid import uuid4
24
24
  from urllib.parse import urlsplit
@@ -55,6 +55,9 @@ else:
55
55
  if timeout_cm.expired():
56
56
  logger.warning(f'Timeout expired: {error_message}')
57
57
 
58
+ if TYPE_CHECKING:
59
+ from playwright._impl._api_structures import Cookie
60
+
58
61
 
59
62
  BROWSER = Literal['chromium', 'firefox', 'webkit']
60
63
 
@@ -130,6 +133,7 @@ class LacusCore():
130
133
  proxy: str | dict[str, str] | None=None,
131
134
  general_timeout_in_sec: int | None=None,
132
135
  cookies: list[dict[str, Any]] | None=None,
136
+ storage: dict[str, Any] | None=None,
133
137
  headers: dict[str, str] | None=None,
134
138
  http_credentials: dict[str, str] | None=None,
135
139
  geolocation: dict[str, float] | None=None,
@@ -161,6 +165,7 @@ class LacusCore():
161
165
  proxy: str | dict[str, str] | None=None,
162
166
  general_timeout_in_sec: int | None=None,
163
167
  cookies: list[dict[str, Any]] | None=None,
168
+ storage: dict[str, Any] | None=None,
164
169
  headers: dict[str, str] | None=None,
165
170
  http_credentials: dict[str, str] | None=None,
166
171
  geolocation: dict[str, float] | None=None,
@@ -194,6 +199,7 @@ class LacusCore():
194
199
  :param proxy: SOCKS5 proxy to use for capturing
195
200
  :param general_timeout_in_sec: The capture will raise a timeout if it takes more than that time
196
201
  :param cookies: A list of cookies
202
+ :param storage: A storage state from another capture
197
203
  :param headers: The headers to pass to the capture
198
204
  :param http_credentials: HTTP Credentials to pass to the capture
199
205
  :param geolocation: Geolocation of the browser to pass to the capture
@@ -222,7 +228,7 @@ class LacusCore():
222
228
  'browser': browser, 'device_name': device_name,
223
229
  'user_agent': user_agent, 'proxy': proxy,
224
230
  'general_timeout_in_sec': general_timeout_in_sec,
225
- 'cookies': cookies, 'headers': headers,
231
+ 'cookies': cookies, 'storage': storage, 'headers': headers,
226
232
  'http_credentials': http_credentials, 'geolocation': geolocation,
227
233
  'timezone_id': timezone_id, 'locale': locale,
228
234
  'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
@@ -231,7 +237,6 @@ class LacusCore():
231
237
  # Quietly force it to true if headed is not allowed.
232
238
  'headless': headless if self.headed_allowed else True,
233
239
  'max_retries': max_retries}
234
-
235
240
  try:
236
241
  to_enqueue = CaptureSettings(**settings)
237
242
  except ValidationError as e:
@@ -244,7 +249,6 @@ class LacusCore():
244
249
  if isinstance(existing_uuid, bytes):
245
250
  return existing_uuid.decode()
246
251
  return existing_uuid
247
-
248
252
  if uuid:
249
253
  # Make sure we do not already have a capture with that UUID
250
254
  if self.get_capture_status(uuid) == CaptureStatus.UNKNOWN:
@@ -467,7 +471,7 @@ class LacusCore():
467
471
  else:
468
472
  browser_engine = 'webkit'
469
473
 
470
- cookies: list[dict[str, Any]] = []
474
+ cookies: list[Cookie] = []
471
475
  if to_capture.cookies:
472
476
  # In order to properly pass the cookies to playwright,
473
477
  # each of them must have a name, a value and either a domain + path or a URL
@@ -476,18 +480,16 @@ class LacusCore():
476
480
  # with the hostname of the URL we try to capture and the path with "/"
477
481
  # NOTE: these changes can only be done here because we need the URL.
478
482
  for cookie in to_capture.cookies:
479
- if len(cookie) == 1:
480
- # we have a cookie in the format key: value
481
- name, value = cookie.popitem()
482
- cookie = {'name': name, 'value': value}
483
483
  if 'name' not in cookie or 'value' not in cookie:
484
484
  logger.warning(f'Invalid cookie: {cookie}')
485
485
  continue
486
486
  if 'domain' not in cookie and 'url' not in cookie:
487
+ if not splitted_url.hostname:
488
+ # If for any reason we cannot get the hostname there, ignore the cookie
489
+ continue
487
490
  cookie['domain'] = splitted_url.hostname
488
491
  cookie['path'] = '/'
489
492
  cookies.append(cookie)
490
-
491
493
  try:
492
494
  logger.debug(f'Capturing {url}')
493
495
  stats_pipeline.sadd(f'stats:{today}:captures', url)
@@ -502,6 +504,7 @@ class LacusCore():
502
504
  # required by Mypy: https://github.com/python/mypy/issues/3004
503
505
  capture.headers = to_capture.headers # type: ignore[assignment]
504
506
  capture.cookies = cookies # type: ignore[assignment]
507
+ capture.storage = to_capture.storage # type: ignore[assignment]
505
508
  capture.viewport = to_capture.viewport # type: ignore[assignment]
506
509
  capture.user_agent = to_capture.user_agent # type: ignore[assignment]
507
510
  capture.http_credentials = to_capture.http_credentials # type: ignore[assignment]
@@ -665,6 +668,8 @@ class LacusCore():
665
668
  hash_to_set['har'] = pickle.dumps(results['har'])
666
669
  if results.get('cookies'):
667
670
  hash_to_set['cookies'] = pickle.dumps(results['cookies'])
671
+ if results.get('storage'):
672
+ hash_to_set['storage'] = pickle.dumps(results['storage'])
668
673
  if results.get('potential_favicons'):
669
674
  hash_to_set['potential_favicons'] = pickle.dumps(results['potential_favicons'])
670
675
  if results.get('html') and results['html'] is not None:
@@ -684,7 +689,7 @@ class LacusCore():
684
689
  hash_to_set['children'] = pickle.dumps(children)
685
690
 
686
691
  for key in results.keys():
687
- if key in ['har', 'cookies', 'potential_favicons', 'html', 'children'] or not results.get(key):
692
+ if key in ['har', 'cookies', 'storage', 'potential_favicons', 'html', 'children'] or not results.get(key):
688
693
  continue
689
694
  # these entries can be stored directly
690
695
  hash_to_set[key] = results[key] # type: ignore[literal-required]
@@ -709,6 +714,8 @@ class LacusCore():
709
714
  to_return['har'] = pickle.loads(value)
710
715
  elif key == b'cookies':
711
716
  to_return['cookies'] = pickle.loads(value)
717
+ elif key == b'storage':
718
+ to_return['storage'] = pickle.loads(value)
712
719
  elif key == b'potential_favicons':
713
720
  to_return['potential_favicons'] = pickle.loads(value)
714
721
  elif key == b'children':
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lacuscore
3
- Version: 1.13.0
3
+ Version: 1.13.1
4
4
  Summary: Core of Lacus, usable as a module
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -21,12 +21,12 @@ Classifier: Programming Language :: Python :: 3.13
21
21
  Classifier: Topic :: Internet
22
22
  Classifier: Topic :: Security
23
23
  Provides-Extra: docs
24
- Requires-Dist: Sphinx (>=8.1.3) ; (python_version >= "3.10") and (extra == "docs")
24
+ Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
25
25
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
26
26
  Requires-Dist: defang (>=0.5.3)
27
27
  Requires-Dist: dnspython (>=2.7.0)
28
28
  Requires-Dist: eval-type-backport (>=0.2.2) ; python_version < "3.10"
29
- Requires-Dist: playwrightcapture[recaptcha] (>=1.28.0)
29
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.28.1)
30
30
  Requires-Dist: pydantic (>=2.10.6)
31
31
  Requires-Dist: redis[hiredis] (>=5.2.1)
32
32
  Requires-Dist: requests (>=2.32.3)
@@ -0,0 +1,10 @@
1
+ lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
+ lacuscore/helpers.py,sha256=Nt3oMMDGgl3rDkDujuAaxWtb3cXeSGk2pdXg5lNEqhI,13188
3
+ lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
4
+ lacuscore/lacuscore.py,sha256=fzW-04_pFK_Bqv-66-ei4OLS3nxpHmNEXCn4_2pXhTU,42212
5
+ lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
7
+ lacuscore-1.13.1.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
+ lacuscore-1.13.1.dist-info/METADATA,sha256=IyAbfTlzx6MfiGGSm4xcQsz_l7zlAS4iKdhpc1xlOuA,2570
9
+ lacuscore-1.13.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
10
+ lacuscore-1.13.1.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
- lacuscore/helpers.py,sha256=GKgy8-kvGJLrOv431AtQRTtMSJ5GNtnwsj-K6WqF0EA,10993
3
- lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
4
- lacuscore/lacuscore.py,sha256=4yUGTdURX61Os3pvpSqtXP6u9FEhcIb4utaiSUMGIpo,41687
5
- lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
7
- lacuscore-1.13.0.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
- lacuscore-1.13.0.dist-info/METADATA,sha256=6X46qhNEVj4QflVHAjAp2U6AI97JdDIOuwaPISTtQfg,2570
9
- lacuscore-1.13.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
10
- lacuscore-1.13.0.dist-info/RECORD,,