lacuscore 1.12.10__py3-none-any.whl → 1.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lacuscore/helpers.py +55 -6
- lacuscore/lacuscore.py +18 -18
- {lacuscore-1.12.10.dist-info → lacuscore-1.13.1.dist-info}/METADATA +4 -5
- lacuscore-1.13.1.dist-info/RECORD +10 -0
- {lacuscore-1.12.10.dist-info → lacuscore-1.13.1.dist-info}/WHEEL +1 -1
- lacuscore-1.12.10.dist-info/RECORD +0 -10
- {lacuscore-1.12.10.dist-info → lacuscore-1.13.1.dist-info}/LICENSE +0 -0
lacuscore/helpers.py
CHANGED
@@ -15,6 +15,8 @@ from pydantic_core import from_json
|
|
15
15
|
|
16
16
|
from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
|
17
17
|
|
18
|
+
from playwright._impl._api_structures import Cookie # , StorageState
|
19
|
+
|
18
20
|
|
19
21
|
class LacusCoreException(Exception):
|
20
22
|
pass
|
@@ -71,7 +73,10 @@ class CaptureResponseJson(TypedDict, total=False):
|
|
71
73
|
status: int
|
72
74
|
last_redirected_url: str | None
|
73
75
|
har: dict[str, Any] | None
|
74
|
-
cookies: list[
|
76
|
+
cookies: list[Cookie] | None
|
77
|
+
# NOTE: should be that, but StorageState doesn't define the indexeddb
|
78
|
+
# storage: StorageState | None
|
79
|
+
storage: dict[str, Any] | None
|
75
80
|
error: str | None
|
76
81
|
html: str | None
|
77
82
|
png: str | None
|
@@ -93,7 +98,10 @@ class CaptureSettings(BaseModel):
|
|
93
98
|
user_agent: str | None = None
|
94
99
|
proxy: str | dict[str, str] | None = None
|
95
100
|
general_timeout_in_sec: int | None = None
|
96
|
-
cookies: list[
|
101
|
+
cookies: list[Cookie] | None = None
|
102
|
+
# NOTE: should be that, but StorageState doesn't define the indexeddb
|
103
|
+
# storage: StorageState | None = None
|
104
|
+
storage: dict[str, Any] | None = None
|
97
105
|
headers: dict[str, str] | None = None
|
98
106
|
http_credentials: dict[str, str] | None = None
|
99
107
|
geolocation: dict[str, float] | None = None
|
@@ -197,10 +205,51 @@ class CaptureSettings(BaseModel):
|
|
197
205
|
if not cookies:
|
198
206
|
return None
|
199
207
|
if isinstance(cookies, str):
|
200
|
-
#
|
201
|
-
|
202
|
-
|
203
|
-
|
208
|
+
# might be a json dump, try to load it and ignore otherwise
|
209
|
+
try:
|
210
|
+
cookies = json.loads(cookies)
|
211
|
+
except json.JSONDecodeError as e:
|
212
|
+
print(e)
|
213
|
+
# Cookies are invalid, ignoring.
|
214
|
+
return None
|
215
|
+
if isinstance(cookies, dict):
|
216
|
+
# might be a single cookie in the format name: value, make it a list
|
217
|
+
cookies = [cookies]
|
218
|
+
if isinstance(cookies, list):
|
219
|
+
# make sure the cookies are in the right format
|
220
|
+
to_return = []
|
221
|
+
for cookie in cookies:
|
222
|
+
if isinstance(cookie, dict):
|
223
|
+
if 'name' in cookie and 'value' in cookie:
|
224
|
+
to_return.append(cookie)
|
225
|
+
elif len(cookie) == 1:
|
226
|
+
# {'name': 'value'} => {'name': 'name', 'value': 'value'}
|
227
|
+
name, value = cookie.popitem()
|
228
|
+
if name and value:
|
229
|
+
to_return.append({'name': name, 'value': value})
|
230
|
+
else:
|
231
|
+
# invalid cookie, ignoring
|
232
|
+
pass
|
233
|
+
return to_return
|
234
|
+
return None
|
235
|
+
|
236
|
+
@field_validator('storage', mode='before')
|
237
|
+
@classmethod
|
238
|
+
def load_storage_json(cls, storage: Any) -> dict[str, Any] | None:
|
239
|
+
"""That's the storage as exported from Playwright:
|
240
|
+
https://playwright.dev/python/docs/api/class-browsercontext#browser-context-storage-state
|
241
|
+
"""
|
242
|
+
if not storage:
|
243
|
+
return None
|
244
|
+
if isinstance(storage, str):
|
245
|
+
# might be a json dump, try to load it and ignore otherwise
|
246
|
+
try:
|
247
|
+
storage = json.loads(storage)
|
248
|
+
except json.JSONDecodeError:
|
249
|
+
# storage is invalid, ignoring.
|
250
|
+
return None
|
251
|
+
if isinstance(storage, dict) and 'cookies' in storage and 'origins' in storage:
|
252
|
+
return storage
|
204
253
|
return None
|
205
254
|
|
206
255
|
@field_validator('headers', mode='before')
|
lacuscore/lacuscore.py
CHANGED
@@ -12,14 +12,13 @@ import re
|
|
12
12
|
import sys
|
13
13
|
import time
|
14
14
|
import unicodedata
|
15
|
-
import zlib
|
16
15
|
|
17
16
|
from asyncio import Task
|
18
17
|
from base64 import b64decode, b64encode
|
19
18
|
from datetime import date, timedelta
|
20
19
|
from ipaddress import ip_address, IPv4Address, IPv6Address
|
21
20
|
from tempfile import NamedTemporaryFile
|
22
|
-
from typing import Literal, Any, overload, cast
|
21
|
+
from typing import Literal, Any, overload, cast, TYPE_CHECKING
|
23
22
|
from collections.abc import Iterator
|
24
23
|
from uuid import uuid4
|
25
24
|
from urllib.parse import urlsplit
|
@@ -56,6 +55,9 @@ else:
|
|
56
55
|
if timeout_cm.expired():
|
57
56
|
logger.warning(f'Timeout expired: {error_message}')
|
58
57
|
|
58
|
+
if TYPE_CHECKING:
|
59
|
+
from playwright._impl._api_structures import Cookie
|
60
|
+
|
59
61
|
|
60
62
|
BROWSER = Literal['chromium', 'firefox', 'webkit']
|
61
63
|
|
@@ -131,6 +133,7 @@ class LacusCore():
|
|
131
133
|
proxy: str | dict[str, str] | None=None,
|
132
134
|
general_timeout_in_sec: int | None=None,
|
133
135
|
cookies: list[dict[str, Any]] | None=None,
|
136
|
+
storage: dict[str, Any] | None=None,
|
134
137
|
headers: dict[str, str] | None=None,
|
135
138
|
http_credentials: dict[str, str] | None=None,
|
136
139
|
geolocation: dict[str, float] | None=None,
|
@@ -162,6 +165,7 @@ class LacusCore():
|
|
162
165
|
proxy: str | dict[str, str] | None=None,
|
163
166
|
general_timeout_in_sec: int | None=None,
|
164
167
|
cookies: list[dict[str, Any]] | None=None,
|
168
|
+
storage: dict[str, Any] | None=None,
|
165
169
|
headers: dict[str, str] | None=None,
|
166
170
|
http_credentials: dict[str, str] | None=None,
|
167
171
|
geolocation: dict[str, float] | None=None,
|
@@ -195,6 +199,7 @@ class LacusCore():
|
|
195
199
|
:param proxy: SOCKS5 proxy to use for capturing
|
196
200
|
:param general_timeout_in_sec: The capture will raise a timeout if it takes more than that time
|
197
201
|
:param cookies: A list of cookies
|
202
|
+
:param storage: A storage state from another capture
|
198
203
|
:param headers: The headers to pass to the capture
|
199
204
|
:param http_credentials: HTTP Credentials to pass to the capture
|
200
205
|
:param geolocation: Geolocation of the browser to pass to the capture
|
@@ -223,7 +228,7 @@ class LacusCore():
|
|
223
228
|
'browser': browser, 'device_name': device_name,
|
224
229
|
'user_agent': user_agent, 'proxy': proxy,
|
225
230
|
'general_timeout_in_sec': general_timeout_in_sec,
|
226
|
-
'cookies': cookies, 'headers': headers,
|
231
|
+
'cookies': cookies, 'storage': storage, 'headers': headers,
|
227
232
|
'http_credentials': http_credentials, 'geolocation': geolocation,
|
228
233
|
'timezone_id': timezone_id, 'locale': locale,
|
229
234
|
'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
|
@@ -232,7 +237,6 @@ class LacusCore():
|
|
232
237
|
# Quietly force it to true if headed is not allowed.
|
233
238
|
'headless': headless if self.headed_allowed else True,
|
234
239
|
'max_retries': max_retries}
|
235
|
-
|
236
240
|
try:
|
237
241
|
to_enqueue = CaptureSettings(**settings)
|
238
242
|
except ValidationError as e:
|
@@ -245,7 +249,6 @@ class LacusCore():
|
|
245
249
|
if isinstance(existing_uuid, bytes):
|
246
250
|
return existing_uuid.decode()
|
247
251
|
return existing_uuid
|
248
|
-
|
249
252
|
if uuid:
|
250
253
|
# Make sure we do not already have a capture with that UUID
|
251
254
|
if self.get_capture_status(uuid) == CaptureStatus.UNKNOWN:
|
@@ -468,7 +471,7 @@ class LacusCore():
|
|
468
471
|
else:
|
469
472
|
browser_engine = 'webkit'
|
470
473
|
|
471
|
-
cookies: list[
|
474
|
+
cookies: list[Cookie] = []
|
472
475
|
if to_capture.cookies:
|
473
476
|
# In order to properly pass the cookies to playwright,
|
474
477
|
# each of them must have a name, a value and either a domain + path or a URL
|
@@ -477,18 +480,16 @@ class LacusCore():
|
|
477
480
|
# with the hostname of the URL we try to capture and the path with "/"
|
478
481
|
# NOTE: these changes can only be done here because we need the URL.
|
479
482
|
for cookie in to_capture.cookies:
|
480
|
-
if len(cookie) == 1:
|
481
|
-
# we have a cookie in the format key: value
|
482
|
-
name, value = cookie.popitem()
|
483
|
-
cookie = {'name': name, 'value': value}
|
484
483
|
if 'name' not in cookie or 'value' not in cookie:
|
485
484
|
logger.warning(f'Invalid cookie: {cookie}')
|
486
485
|
continue
|
487
486
|
if 'domain' not in cookie and 'url' not in cookie:
|
487
|
+
if not splitted_url.hostname:
|
488
|
+
# If for any reason we cannot get the hostname there, ignore the cookie
|
489
|
+
continue
|
488
490
|
cookie['domain'] = splitted_url.hostname
|
489
491
|
cookie['path'] = '/'
|
490
492
|
cookies.append(cookie)
|
491
|
-
|
492
493
|
try:
|
493
494
|
logger.debug(f'Capturing {url}')
|
494
495
|
stats_pipeline.sadd(f'stats:{today}:captures', url)
|
@@ -503,6 +504,7 @@ class LacusCore():
|
|
503
504
|
# required by Mypy: https://github.com/python/mypy/issues/3004
|
504
505
|
capture.headers = to_capture.headers # type: ignore[assignment]
|
505
506
|
capture.cookies = cookies # type: ignore[assignment]
|
507
|
+
capture.storage = to_capture.storage # type: ignore[assignment]
|
506
508
|
capture.viewport = to_capture.viewport # type: ignore[assignment]
|
507
509
|
capture.user_agent = to_capture.user_agent # type: ignore[assignment]
|
508
510
|
capture.http_credentials = to_capture.http_credentials # type: ignore[assignment]
|
@@ -666,6 +668,8 @@ class LacusCore():
|
|
666
668
|
hash_to_set['har'] = pickle.dumps(results['har'])
|
667
669
|
if results.get('cookies'):
|
668
670
|
hash_to_set['cookies'] = pickle.dumps(results['cookies'])
|
671
|
+
if results.get('storage'):
|
672
|
+
hash_to_set['storage'] = pickle.dumps(results['storage'])
|
669
673
|
if results.get('potential_favicons'):
|
670
674
|
hash_to_set['potential_favicons'] = pickle.dumps(results['potential_favicons'])
|
671
675
|
if results.get('html') and results['html'] is not None:
|
@@ -685,7 +689,7 @@ class LacusCore():
|
|
685
689
|
hash_to_set['children'] = pickle.dumps(children)
|
686
690
|
|
687
691
|
for key in results.keys():
|
688
|
-
if key in ['har', 'cookies', 'potential_favicons', 'html', 'children'] or not results.get(key):
|
692
|
+
if key in ['har', 'cookies', 'storage', 'potential_favicons', 'html', 'children'] or not results.get(key):
|
689
693
|
continue
|
690
694
|
# these entries can be stored directly
|
691
695
|
hash_to_set[key] = results[key] # type: ignore[literal-required]
|
@@ -702,12 +706,6 @@ class LacusCore():
|
|
702
706
|
if root_key is None:
|
703
707
|
root_key = f'lacus:capture_results_hash:{capture_uuid}'
|
704
708
|
|
705
|
-
if not self.redis.exists(root_key):
|
706
|
-
if old_response := self.redis.get(f'lacus:capture_results:{capture_uuid}'):
|
707
|
-
# TODO: remove in 1.8.* - old format used last in 1.6, and kept no more than 10H in redis
|
708
|
-
return pickle.loads(zlib.decompress(old_response))
|
709
|
-
return None
|
710
|
-
|
711
709
|
# New format and capture done
|
712
710
|
|
713
711
|
to_return: CaptureResponse = {}
|
@@ -716,6 +714,8 @@ class LacusCore():
|
|
716
714
|
to_return['har'] = pickle.loads(value)
|
717
715
|
elif key == b'cookies':
|
718
716
|
to_return['cookies'] = pickle.loads(value)
|
717
|
+
elif key == b'storage':
|
718
|
+
to_return['storage'] = pickle.loads(value)
|
719
719
|
elif key == b'potential_favicons':
|
720
720
|
to_return['potential_favicons'] = pickle.loads(value)
|
721
721
|
elif key == b'children':
|
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.13.1
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
7
7
|
Author-email: raphael.vinot@circl.lu
|
8
|
-
Requires-Python: >=3.9,<4.0
|
8
|
+
Requires-Python: >=3.9.2,<4.0
|
9
9
|
Classifier: Development Status :: 5 - Production/Stable
|
10
10
|
Classifier: Environment :: Console
|
11
11
|
Classifier: Intended Audience :: Information Technology
|
@@ -14,7 +14,6 @@ Classifier: Intended Audience :: Telecommunications Industry
|
|
14
14
|
Classifier: License :: OSI Approved :: BSD License
|
15
15
|
Classifier: Operating System :: POSIX :: Linux
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
17
|
-
Classifier: Programming Language :: Python :: 3.9
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
20
19
|
Classifier: Programming Language :: Python :: 3.12
|
@@ -22,12 +21,12 @@ Classifier: Programming Language :: Python :: 3.13
|
|
22
21
|
Classifier: Topic :: Internet
|
23
22
|
Classifier: Topic :: Security
|
24
23
|
Provides-Extra: docs
|
25
|
-
Requires-Dist: Sphinx (>=8.
|
24
|
+
Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
|
26
25
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
27
26
|
Requires-Dist: defang (>=0.5.3)
|
28
27
|
Requires-Dist: dnspython (>=2.7.0)
|
29
28
|
Requires-Dist: eval-type-backport (>=0.2.2) ; python_version < "3.10"
|
30
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.
|
29
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.28.1)
|
31
30
|
Requires-Dist: pydantic (>=2.10.6)
|
32
31
|
Requires-Dist: redis[hiredis] (>=5.2.1)
|
33
32
|
Requires-Dist: requests (>=2.32.3)
|
@@ -0,0 +1,10 @@
|
|
1
|
+
lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
|
2
|
+
lacuscore/helpers.py,sha256=Nt3oMMDGgl3rDkDujuAaxWtb3cXeSGk2pdXg5lNEqhI,13188
|
3
|
+
lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
|
4
|
+
lacuscore/lacuscore.py,sha256=fzW-04_pFK_Bqv-66-ei4OLS3nxpHmNEXCn4_2pXhTU,42212
|
5
|
+
lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
|
7
|
+
lacuscore-1.13.1.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
|
8
|
+
lacuscore-1.13.1.dist-info/METADATA,sha256=IyAbfTlzx6MfiGGSm4xcQsz_l7zlAS4iKdhpc1xlOuA,2570
|
9
|
+
lacuscore-1.13.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
10
|
+
lacuscore-1.13.1.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
|
2
|
-
lacuscore/helpers.py,sha256=GKgy8-kvGJLrOv431AtQRTtMSJ5GNtnwsj-K6WqF0EA,10993
|
3
|
-
lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
|
4
|
-
lacuscore/lacuscore.py,sha256=S-qxQJR4WmJHDSAay9tBCWHz2f4uM6z1Elmox7hln54,42049
|
5
|
-
lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
|
7
|
-
lacuscore-1.12.10.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
|
8
|
-
lacuscore-1.12.10.dist-info/METADATA,sha256=Z2TqBsZDn5K0omvbs0Gv_4TvZBkEAH1RxVO7XfXznl4,2619
|
9
|
-
lacuscore-1.12.10.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
10
|
-
lacuscore-1.12.10.dist-info/RECORD,,
|
File without changes
|