lacuscore 1.13.0__tar.gz → 1.13.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lacuscore-1.13.0 → lacuscore-1.13.2}/PKG-INFO +4 -3
- {lacuscore-1.13.0 → lacuscore-1.13.2}/lacuscore/helpers.py +63 -7
- {lacuscore-1.13.0 → lacuscore-1.13.2}/lacuscore/lacuscore.py +18 -11
- {lacuscore-1.13.0 → lacuscore-1.13.2}/pyproject.toml +7 -6
- {lacuscore-1.13.0 → lacuscore-1.13.2}/LICENSE +0 -0
- {lacuscore-1.13.0 → lacuscore-1.13.2}/README.md +0 -0
- {lacuscore-1.13.0 → lacuscore-1.13.2}/lacuscore/__init__.py +0 -0
- {lacuscore-1.13.0 → lacuscore-1.13.2}/lacuscore/lacus_monitoring.py +0 -0
- {lacuscore-1.13.0 → lacuscore-1.13.2}/lacuscore/py.typed +0 -0
- {lacuscore-1.13.0 → lacuscore-1.13.2}/lacuscore/task_logger.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.13.0
|
3
|
+
Version: 1.13.2
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
@@ -21,15 +21,16 @@ Classifier: Programming Language :: Python :: 3.13
|
|
21
21
|
Classifier: Topic :: Internet
|
22
22
|
Classifier: Topic :: Security
|
23
23
|
Provides-Extra: docs
|
24
|
-
Requires-Dist: Sphinx (>=8.
|
24
|
+
Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
|
25
25
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
26
26
|
Requires-Dist: defang (>=0.5.3)
|
27
27
|
Requires-Dist: dnspython (>=2.7.0)
|
28
28
|
Requires-Dist: eval-type-backport (>=0.2.2) ; python_version < "3.10"
|
29
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.28.
|
29
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.28.1)
|
30
30
|
Requires-Dist: pydantic (>=2.10.6)
|
31
31
|
Requires-Dist: redis[hiredis] (>=5.2.1)
|
32
32
|
Requires-Dist: requests (>=2.32.3)
|
33
|
+
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
|
33
34
|
Requires-Dist: ua-parser[regex] (>=1.0.1)
|
34
35
|
Project-URL: Documentation, https://lacuscore.readthedocs.io/en/latest/
|
35
36
|
Project-URL: Issues, https://github.com/ail-project/issues
|
@@ -3,10 +3,11 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
5
|
import json
|
6
|
+
import sys
|
6
7
|
|
7
8
|
from enum import IntEnum, unique
|
8
9
|
from logging import LoggerAdapter
|
9
|
-
from typing import Any,
|
10
|
+
from typing import Any, Literal
|
10
11
|
from collections.abc import MutableMapping, Mapping
|
11
12
|
|
12
13
|
from defang import refang
|
@@ -15,6 +16,14 @@ from pydantic_core import from_json
|
|
15
16
|
|
16
17
|
from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
|
17
18
|
|
19
|
+
from playwright._impl._api_structures import Cookie # , StorageState
|
20
|
+
|
21
|
+
|
22
|
+
if sys.version_info >= (3, 12):
|
23
|
+
from typing import TypedDict
|
24
|
+
else:
|
25
|
+
from typing_extensions import TypedDict
|
26
|
+
|
18
27
|
|
19
28
|
class LacusCoreException(Exception):
|
20
29
|
pass
|
@@ -71,7 +80,10 @@ class CaptureResponseJson(TypedDict, total=False):
|
|
71
80
|
status: int
|
72
81
|
last_redirected_url: str | None
|
73
82
|
har: dict[str, Any] | None
|
74
|
-
cookies: list[
|
83
|
+
cookies: list[Cookie] | None
|
84
|
+
# NOTE: should be that, but StorageState doesn't define the indexeddb
|
85
|
+
# storage: StorageState | None
|
86
|
+
storage: dict[str, Any] | None
|
75
87
|
error: str | None
|
76
88
|
html: str | None
|
77
89
|
png: str | None
|
@@ -93,7 +105,10 @@ class CaptureSettings(BaseModel):
|
|
93
105
|
user_agent: str | None = None
|
94
106
|
proxy: str | dict[str, str] | None = None
|
95
107
|
general_timeout_in_sec: int | None = None
|
96
|
-
cookies: list[
|
108
|
+
cookies: list[Cookie] | None = None
|
109
|
+
# NOTE: should be that, but StorageState doesn't define the indexeddb
|
110
|
+
# storage: StorageState | None = None
|
111
|
+
storage: dict[str, Any] | None = None
|
97
112
|
headers: dict[str, str] | None = None
|
98
113
|
http_credentials: dict[str, str] | None = None
|
99
114
|
geolocation: dict[str, float] | None = None
|
@@ -197,10 +212,51 @@ class CaptureSettings(BaseModel):
|
|
197
212
|
if not cookies:
|
198
213
|
return None
|
199
214
|
if isinstance(cookies, str):
|
200
|
-
#
|
201
|
-
|
202
|
-
|
203
|
-
|
215
|
+
# might be a json dump, try to load it and ignore otherwise
|
216
|
+
try:
|
217
|
+
cookies = json.loads(cookies)
|
218
|
+
except json.JSONDecodeError as e:
|
219
|
+
print(e)
|
220
|
+
# Cookies are invalid, ignoring.
|
221
|
+
return None
|
222
|
+
if isinstance(cookies, dict):
|
223
|
+
# might be a single cookie in the format name: value, make it a list
|
224
|
+
cookies = [cookies]
|
225
|
+
if isinstance(cookies, list):
|
226
|
+
# make sure the cookies are in the right format
|
227
|
+
to_return = []
|
228
|
+
for cookie in cookies:
|
229
|
+
if isinstance(cookie, dict):
|
230
|
+
if 'name' in cookie and 'value' in cookie:
|
231
|
+
to_return.append(cookie)
|
232
|
+
elif len(cookie) == 1:
|
233
|
+
# {'name': 'value'} => {'name': 'name', 'value': 'value'}
|
234
|
+
name, value = cookie.popitem()
|
235
|
+
if name and value:
|
236
|
+
to_return.append({'name': name, 'value': value})
|
237
|
+
else:
|
238
|
+
# invalid cookie, ignoring
|
239
|
+
pass
|
240
|
+
return to_return
|
241
|
+
return None
|
242
|
+
|
243
|
+
@field_validator('storage', mode='before')
|
244
|
+
@classmethod
|
245
|
+
def load_storage_json(cls, storage: Any) -> dict[str, Any] | None:
|
246
|
+
"""That's the storage as exported from Playwright:
|
247
|
+
https://playwright.dev/python/docs/api/class-browsercontext#browser-context-storage-state
|
248
|
+
"""
|
249
|
+
if not storage:
|
250
|
+
return None
|
251
|
+
if isinstance(storage, str):
|
252
|
+
# might be a json dump, try to load it and ignore otherwise
|
253
|
+
try:
|
254
|
+
storage = json.loads(storage)
|
255
|
+
except json.JSONDecodeError:
|
256
|
+
# storage is invalid, ignoring.
|
257
|
+
return None
|
258
|
+
if isinstance(storage, dict) and 'cookies' in storage and 'origins' in storage:
|
259
|
+
return storage
|
204
260
|
return None
|
205
261
|
|
206
262
|
@field_validator('headers', mode='before')
|
@@ -18,7 +18,7 @@ from base64 import b64decode, b64encode
|
|
18
18
|
from datetime import date, timedelta
|
19
19
|
from ipaddress import ip_address, IPv4Address, IPv6Address
|
20
20
|
from tempfile import NamedTemporaryFile
|
21
|
-
from typing import Literal, Any, overload, cast
|
21
|
+
from typing import Literal, Any, overload, cast, TYPE_CHECKING
|
22
22
|
from collections.abc import Iterator
|
23
23
|
from uuid import uuid4
|
24
24
|
from urllib.parse import urlsplit
|
@@ -55,6 +55,9 @@ else:
|
|
55
55
|
if timeout_cm.expired():
|
56
56
|
logger.warning(f'Timeout expired: {error_message}')
|
57
57
|
|
58
|
+
if TYPE_CHECKING:
|
59
|
+
from playwright._impl._api_structures import Cookie
|
60
|
+
|
58
61
|
|
59
62
|
BROWSER = Literal['chromium', 'firefox', 'webkit']
|
60
63
|
|
@@ -130,6 +133,7 @@ class LacusCore():
|
|
130
133
|
proxy: str | dict[str, str] | None=None,
|
131
134
|
general_timeout_in_sec: int | None=None,
|
132
135
|
cookies: list[dict[str, Any]] | None=None,
|
136
|
+
storage: dict[str, Any] | None=None,
|
133
137
|
headers: dict[str, str] | None=None,
|
134
138
|
http_credentials: dict[str, str] | None=None,
|
135
139
|
geolocation: dict[str, float] | None=None,
|
@@ -161,6 +165,7 @@ class LacusCore():
|
|
161
165
|
proxy: str | dict[str, str] | None=None,
|
162
166
|
general_timeout_in_sec: int | None=None,
|
163
167
|
cookies: list[dict[str, Any]] | None=None,
|
168
|
+
storage: dict[str, Any] | None=None,
|
164
169
|
headers: dict[str, str] | None=None,
|
165
170
|
http_credentials: dict[str, str] | None=None,
|
166
171
|
geolocation: dict[str, float] | None=None,
|
@@ -194,6 +199,7 @@ class LacusCore():
|
|
194
199
|
:param proxy: SOCKS5 proxy to use for capturing
|
195
200
|
:param general_timeout_in_sec: The capture will raise a timeout if it takes more than that time
|
196
201
|
:param cookies: A list of cookies
|
202
|
+
:param storage: A storage state from another capture
|
197
203
|
:param headers: The headers to pass to the capture
|
198
204
|
:param http_credentials: HTTP Credentials to pass to the capture
|
199
205
|
:param geolocation: Geolocation of the browser to pass to the capture
|
@@ -222,7 +228,7 @@ class LacusCore():
|
|
222
228
|
'browser': browser, 'device_name': device_name,
|
223
229
|
'user_agent': user_agent, 'proxy': proxy,
|
224
230
|
'general_timeout_in_sec': general_timeout_in_sec,
|
225
|
-
'cookies': cookies, 'headers': headers,
|
231
|
+
'cookies': cookies, 'storage': storage, 'headers': headers,
|
226
232
|
'http_credentials': http_credentials, 'geolocation': geolocation,
|
227
233
|
'timezone_id': timezone_id, 'locale': locale,
|
228
234
|
'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
|
@@ -231,7 +237,6 @@ class LacusCore():
|
|
231
237
|
# Quietly force it to true if headed is not allowed.
|
232
238
|
'headless': headless if self.headed_allowed else True,
|
233
239
|
'max_retries': max_retries}
|
234
|
-
|
235
240
|
try:
|
236
241
|
to_enqueue = CaptureSettings(**settings)
|
237
242
|
except ValidationError as e:
|
@@ -244,7 +249,6 @@ class LacusCore():
|
|
244
249
|
if isinstance(existing_uuid, bytes):
|
245
250
|
return existing_uuid.decode()
|
246
251
|
return existing_uuid
|
247
|
-
|
248
252
|
if uuid:
|
249
253
|
# Make sure we do not already have a capture with that UUID
|
250
254
|
if self.get_capture_status(uuid) == CaptureStatus.UNKNOWN:
|
@@ -467,7 +471,7 @@ class LacusCore():
|
|
467
471
|
else:
|
468
472
|
browser_engine = 'webkit'
|
469
473
|
|
470
|
-
cookies: list[
|
474
|
+
cookies: list[Cookie] = []
|
471
475
|
if to_capture.cookies:
|
472
476
|
# In order to properly pass the cookies to playwright,
|
473
477
|
# each of them must have a name, a value and either a domain + path or a URL
|
@@ -476,18 +480,16 @@ class LacusCore():
|
|
476
480
|
# with the hostname of the URL we try to capture and the path with "/"
|
477
481
|
# NOTE: these changes can only be done here because we need the URL.
|
478
482
|
for cookie in to_capture.cookies:
|
479
|
-
if len(cookie) == 1:
|
480
|
-
# we have a cookie in the format key: value
|
481
|
-
name, value = cookie.popitem()
|
482
|
-
cookie = {'name': name, 'value': value}
|
483
483
|
if 'name' not in cookie or 'value' not in cookie:
|
484
484
|
logger.warning(f'Invalid cookie: {cookie}')
|
485
485
|
continue
|
486
486
|
if 'domain' not in cookie and 'url' not in cookie:
|
487
|
+
if not splitted_url.hostname:
|
488
|
+
# If for any reason we cannot get the hostname there, ignore the cookie
|
489
|
+
continue
|
487
490
|
cookie['domain'] = splitted_url.hostname
|
488
491
|
cookie['path'] = '/'
|
489
492
|
cookies.append(cookie)
|
490
|
-
|
491
493
|
try:
|
492
494
|
logger.debug(f'Capturing {url}')
|
493
495
|
stats_pipeline.sadd(f'stats:{today}:captures', url)
|
@@ -502,6 +504,7 @@ class LacusCore():
|
|
502
504
|
# required by Mypy: https://github.com/python/mypy/issues/3004
|
503
505
|
capture.headers = to_capture.headers # type: ignore[assignment]
|
504
506
|
capture.cookies = cookies # type: ignore[assignment]
|
507
|
+
capture.storage = to_capture.storage # type: ignore[assignment]
|
505
508
|
capture.viewport = to_capture.viewport # type: ignore[assignment]
|
506
509
|
capture.user_agent = to_capture.user_agent # type: ignore[assignment]
|
507
510
|
capture.http_credentials = to_capture.http_credentials # type: ignore[assignment]
|
@@ -665,6 +668,8 @@ class LacusCore():
|
|
665
668
|
hash_to_set['har'] = pickle.dumps(results['har'])
|
666
669
|
if results.get('cookies'):
|
667
670
|
hash_to_set['cookies'] = pickle.dumps(results['cookies'])
|
671
|
+
if results.get('storage'):
|
672
|
+
hash_to_set['storage'] = pickle.dumps(results['storage'])
|
668
673
|
if results.get('potential_favicons'):
|
669
674
|
hash_to_set['potential_favicons'] = pickle.dumps(results['potential_favicons'])
|
670
675
|
if results.get('html') and results['html'] is not None:
|
@@ -684,7 +689,7 @@ class LacusCore():
|
|
684
689
|
hash_to_set['children'] = pickle.dumps(children)
|
685
690
|
|
686
691
|
for key in results.keys():
|
687
|
-
if key in ['har', 'cookies', 'potential_favicons', 'html', 'children'] or not results.get(key):
|
692
|
+
if key in ['har', 'cookies', 'storage', 'potential_favicons', 'html', 'children'] or not results.get(key):
|
688
693
|
continue
|
689
694
|
# these entries can be stored directly
|
690
695
|
hash_to_set[key] = results[key] # type: ignore[literal-required]
|
@@ -709,6 +714,8 @@ class LacusCore():
|
|
709
714
|
to_return['har'] = pickle.loads(value)
|
710
715
|
elif key == b'cookies':
|
711
716
|
to_return['cookies'] = pickle.loads(value)
|
717
|
+
elif key == b'storage':
|
718
|
+
to_return['storage'] = pickle.loads(value)
|
712
719
|
elif key == b'potential_favicons':
|
713
720
|
to_return['potential_favicons'] = pickle.loads(value)
|
714
721
|
elif key == b'children':
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "lacuscore"
|
3
|
-
version = "1.13.0"
|
3
|
+
version = "1.13.2"
|
4
4
|
description = "Core of Lacus, usable as a module"
|
5
5
|
authors = [
|
6
6
|
{name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
|
@@ -15,14 +15,15 @@ dynamic = [ "classifiers" ]
|
|
15
15
|
|
16
16
|
dependencies = [
|
17
17
|
"requests (>=2.32.3)",
|
18
|
-
"playwrightcapture[recaptcha] (>=1.28.
|
18
|
+
"playwrightcapture[recaptcha] (>=1.28.1)",
|
19
19
|
"defang (>=0.5.3)",
|
20
20
|
"ua-parser[regex] (>=1.0.1)",
|
21
21
|
"redis [hiredis] (>=5.2.1)",
|
22
22
|
"dnspython (>=2.7.0)",
|
23
23
|
"async-timeout (>=5.0.1) ; python_version < \"3.11\"",
|
24
24
|
"pydantic (>=2.10.6)",
|
25
|
-
"eval-type-backport (>=0.2.2) ; python_version < \"3.10\""
|
25
|
+
"eval-type-backport (>=0.2.2) ; python_version < \"3.10\"",
|
26
|
+
"typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
|
26
27
|
]
|
27
28
|
|
28
29
|
[project.urls]
|
@@ -43,14 +44,14 @@ classifiers = [
|
|
43
44
|
]
|
44
45
|
|
45
46
|
[project.optional-dependencies]
|
46
|
-
docs = ["Sphinx (>=8.
|
47
|
+
docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\""]
|
47
48
|
|
48
49
|
[tool.poetry.group.dev.dependencies]
|
49
50
|
mypy = "^1.15.0"
|
50
51
|
types-redis = {version = "^4.6.0.20241004"}
|
51
|
-
types-requests = "^2.32.0.
|
52
|
+
types-requests = "^2.32.0.20250306"
|
52
53
|
types-beautifulsoup4 = "^4.12.0.20250204"
|
53
|
-
pytest = "^8.3.
|
54
|
+
pytest = "^8.3.5"
|
54
55
|
|
55
56
|
[build-system]
|
56
57
|
requires = ["poetry-core>=2.0"]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|