lacuscore 1.10.0__py3-none-any.whl → 1.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lacuscore/__init__.py +7 -2
- lacuscore/helpers.py +205 -29
- lacuscore/lacuscore.py +69 -155
- {lacuscore-1.10.0.dist-info → lacuscore-1.10.2.dist-info}/METADATA +3 -2
- lacuscore-1.10.2.dist-info/RECORD +10 -0
- lacuscore-1.10.0.dist-info/RECORD +0 -10
- {lacuscore-1.10.0.dist-info → lacuscore-1.10.2.dist-info}/LICENSE +0 -0
- {lacuscore-1.10.0.dist-info → lacuscore-1.10.2.dist-info}/WHEEL +0 -0
lacuscore/__init__.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from .lacuscore import LacusCore
|
2
|
-
from .helpers import CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings # noqa
|
2
|
+
from .helpers import (CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings, # noqa
|
3
|
+
LacusCoreException, CaptureError, RetryCapture, CaptureSettingsError) # noqa
|
3
4
|
from .lacus_monitoring import LacusCoreMonitoring # noqa
|
4
5
|
|
5
6
|
__all__ = [
|
@@ -8,5 +9,9 @@ __all__ = [
|
|
8
9
|
'CaptureResponse',
|
9
10
|
'CaptureResponseJson',
|
10
11
|
'CaptureSettings',
|
11
|
-
'LacusCoreMonitoring'
|
12
|
+
'LacusCoreMonitoring',
|
13
|
+
'LacusCoreException',
|
14
|
+
'CaptureError',
|
15
|
+
'RetryCapture',
|
16
|
+
'CaptureSettingsError'
|
12
17
|
]
|
lacuscore/helpers.py
CHANGED
@@ -2,9 +2,15 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
+
import json
|
6
|
+
|
5
7
|
from enum import IntEnum, unique
|
6
8
|
from logging import LoggerAdapter
|
7
|
-
from typing import MutableMapping, Any, TypedDict
|
9
|
+
from typing import MutableMapping, Any, TypedDict, Literal, Mapping
|
10
|
+
|
11
|
+
from defang import refang # type: ignore[import-untyped]
|
12
|
+
from pydantic import BaseModel, field_validator, model_validator, ValidationError
|
13
|
+
from pydantic_core import from_json
|
8
14
|
|
9
15
|
from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
|
10
16
|
|
@@ -22,7 +28,11 @@ class RetryCapture(LacusCoreException):
|
|
22
28
|
|
23
29
|
|
24
30
|
class CaptureSettingsError(LacusCoreException):
|
25
|
-
|
31
|
+
'''Can handle Pydantic validation errors'''
|
32
|
+
|
33
|
+
def __init__(self, message: str, pydantic_validation_errors: ValidationError | None=None) -> None:
|
34
|
+
super().__init__(message)
|
35
|
+
self.pydantic_validation_errors = pydantic_validation_errors
|
26
36
|
|
27
37
|
|
28
38
|
class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
@@ -71,32 +81,198 @@ class CaptureResponseJson(TypedDict, total=False):
|
|
71
81
|
potential_favicons: list[str] | None
|
72
82
|
|
73
83
|
|
74
|
-
class CaptureSettings(
|
84
|
+
class CaptureSettings(BaseModel):
|
75
85
|
'''The capture settings that can be passed to Lacus.'''
|
76
86
|
|
77
|
-
url: str | None
|
78
|
-
document_name: str | None
|
79
|
-
document: str | None
|
80
|
-
browser:
|
81
|
-
device_name: str | None
|
82
|
-
user_agent: str | None
|
83
|
-
proxy: str | dict[str, str] | None
|
84
|
-
general_timeout_in_sec: int | None
|
85
|
-
cookies: list[dict[str, Any]] | None
|
86
|
-
headers:
|
87
|
-
http_credentials: dict[str, str] | None
|
88
|
-
geolocation: dict[str, float] | None
|
89
|
-
timezone_id: str | None
|
90
|
-
locale: str | None
|
91
|
-
color_scheme: str | None
|
92
|
-
viewport: dict[str, int] | None
|
93
|
-
referer: str | None
|
94
|
-
with_favicon: bool
|
95
|
-
allow_tracking: bool
|
96
|
-
force: bool
|
97
|
-
recapture_interval: int
|
98
|
-
priority: int
|
99
|
-
uuid: str | None
|
100
|
-
|
101
|
-
depth: int
|
102
|
-
rendered_hostname_only: bool # Note: only used if depth is > 0
|
87
|
+
url: str | None = None
|
88
|
+
document_name: str | None = None
|
89
|
+
document: str | None = None
|
90
|
+
browser: Literal['chromium', 'firefox', 'webkit'] | None = None
|
91
|
+
device_name: str | None = None
|
92
|
+
user_agent: str | None = None
|
93
|
+
proxy: str | dict[str, str] | None = None
|
94
|
+
general_timeout_in_sec: int | None = None
|
95
|
+
cookies: list[dict[str, Any]] | None = None
|
96
|
+
headers: dict[str, str] | None = None
|
97
|
+
http_credentials: dict[str, str] | None = None
|
98
|
+
geolocation: dict[str, float] | None = None
|
99
|
+
timezone_id: str | None = None
|
100
|
+
locale: str | None = None
|
101
|
+
color_scheme: str | None = None
|
102
|
+
viewport: dict[str, int] | None = None
|
103
|
+
referer: str | None = None
|
104
|
+
with_favicon: bool = False
|
105
|
+
allow_tracking: bool = False
|
106
|
+
force: bool = False
|
107
|
+
recapture_interval: int = 300
|
108
|
+
priority: int = 0
|
109
|
+
uuid: str | None = None
|
110
|
+
|
111
|
+
depth: int = 0
|
112
|
+
rendered_hostname_only: bool = True # Note: only used if depth is > 0
|
113
|
+
|
114
|
+
@model_validator(mode='after')
|
115
|
+
def check_capture_element(self) -> CaptureSettings:
|
116
|
+
if self.document_name and not self.document:
|
117
|
+
raise CaptureSettingsError('You must provide a document if you provide a document name')
|
118
|
+
if self.document and not self.document_name:
|
119
|
+
raise CaptureSettingsError('You must provide a document name if you provide a document')
|
120
|
+
|
121
|
+
if self.url and (self.document or self.document_name):
|
122
|
+
raise CaptureSettingsError('You cannot provide both a URL and a document to capture')
|
123
|
+
if not self.url and not (self.document and self.document_name):
|
124
|
+
raise CaptureSettingsError('You must provide either a URL or a document to capture')
|
125
|
+
return self
|
126
|
+
|
127
|
+
@field_validator('url', mode='after')
|
128
|
+
@classmethod
|
129
|
+
def load_url(cls, v: str | None) -> str | None:
|
130
|
+
if isinstance(v, str):
|
131
|
+
url = v.strip()
|
132
|
+
url = refang(url) # In case we get a defanged url at this stage.
|
133
|
+
if (not url.lower().startswith('data:')
|
134
|
+
and not url.lower().startswith('http:')
|
135
|
+
and not url.lower().startswith('https:')
|
136
|
+
and not url.lower().startswith('file:')):
|
137
|
+
url = f'http://{url}'
|
138
|
+
return url
|
139
|
+
return v
|
140
|
+
|
141
|
+
@field_validator('document_name', mode='after')
|
142
|
+
@classmethod
|
143
|
+
def load_document_name(cls, v: str | None) -> str | None:
|
144
|
+
if isinstance(v, str):
|
145
|
+
name = v.strip()
|
146
|
+
if '.' not in name:
|
147
|
+
# The browser will simply display the file as text if there is no extension.
|
148
|
+
# Just add HTML as a fallback, as it will be the most common one.
|
149
|
+
name = f'{name}.html'
|
150
|
+
return name
|
151
|
+
return v
|
152
|
+
|
153
|
+
@field_validator('proxy', mode='before')
|
154
|
+
@classmethod
|
155
|
+
def load_proxy_json(cls, v: Any) -> str | dict[str, str] | None:
|
156
|
+
if not v:
|
157
|
+
return None
|
158
|
+
if isinstance(v, str):
|
159
|
+
if v.startswith('{'):
|
160
|
+
return from_json(v)
|
161
|
+
# Just the proxy
|
162
|
+
return v
|
163
|
+
elif isinstance(v, dict):
|
164
|
+
return v
|
165
|
+
return None
|
166
|
+
|
167
|
+
@field_validator('cookies', mode='before')
|
168
|
+
@classmethod
|
169
|
+
def load_cookies_json(cls, v: Any) -> list[dict[str, Any]] | None:
|
170
|
+
if not v:
|
171
|
+
return None
|
172
|
+
if isinstance(v, str):
|
173
|
+
if v.startswith('['):
|
174
|
+
return from_json(v)
|
175
|
+
# Cookies are invalid, ignoring.
|
176
|
+
elif isinstance(v, list):
|
177
|
+
return v
|
178
|
+
return None
|
179
|
+
|
180
|
+
@field_validator('headers', mode='before')
|
181
|
+
@classmethod
|
182
|
+
def load_headers_json(cls, v: Any) -> dict[str, str] | None:
|
183
|
+
if not v:
|
184
|
+
return None
|
185
|
+
if isinstance(v, str):
|
186
|
+
if v[0] == '{':
|
187
|
+
return from_json(v)
|
188
|
+
else:
|
189
|
+
# make it a dict
|
190
|
+
new_headers = {}
|
191
|
+
for header_line in v.splitlines():
|
192
|
+
if header_line and ':' in header_line:
|
193
|
+
splitted = header_line.split(':', 1)
|
194
|
+
if splitted and len(splitted) == 2:
|
195
|
+
header, h_value = splitted
|
196
|
+
if header.strip() and h_value.strip():
|
197
|
+
new_headers[header.strip()] = h_value.strip()
|
198
|
+
return new_headers
|
199
|
+
elif isinstance(v, dict):
|
200
|
+
return v
|
201
|
+
return None
|
202
|
+
|
203
|
+
@field_validator('http_credentials', mode='before')
|
204
|
+
@classmethod
|
205
|
+
def load_http_creds_json(cls, v: Any) -> dict[str, str] | None:
|
206
|
+
if not v:
|
207
|
+
return None
|
208
|
+
if isinstance(v, str):
|
209
|
+
if v.startswith('{'):
|
210
|
+
return from_json(v)
|
211
|
+
elif isinstance(v, dict):
|
212
|
+
return v
|
213
|
+
return None
|
214
|
+
|
215
|
+
@field_validator('http_credentials', mode='after')
|
216
|
+
@classmethod
|
217
|
+
def check_http_creds(cls, v: dict[str, str] | None) -> dict[str, str] | None:
|
218
|
+
if not v:
|
219
|
+
return v
|
220
|
+
if 'username' in v and 'password' in v:
|
221
|
+
return v
|
222
|
+
raise CaptureSettingsError(f'HTTP credentials must have a username and a password: {v}')
|
223
|
+
|
224
|
+
@field_validator('geolocation', mode='before')
|
225
|
+
@classmethod
|
226
|
+
def load_geolocation_json(cls, v: Any) -> dict[str, float] | None:
|
227
|
+
if not v:
|
228
|
+
return None
|
229
|
+
if isinstance(v, str):
|
230
|
+
if v.startswith('{'):
|
231
|
+
return from_json(v)
|
232
|
+
elif isinstance(v, dict):
|
233
|
+
return v
|
234
|
+
return None
|
235
|
+
|
236
|
+
@field_validator('geolocation', mode='after')
|
237
|
+
@classmethod
|
238
|
+
def check_geolocation(cls, v: dict[str, float] | None) -> dict[str, float] | None:
|
239
|
+
if not v:
|
240
|
+
return v
|
241
|
+
if 'latitude' in v and 'longitude' in v:
|
242
|
+
return v
|
243
|
+
raise CaptureSettingsError(f'A geolocation must have a latitude and a longitude: {v}')
|
244
|
+
|
245
|
+
@field_validator('viewport', mode='before')
|
246
|
+
@classmethod
|
247
|
+
def load_viewport_json(cls, v: Any) -> dict[str, int] | None:
|
248
|
+
if not v:
|
249
|
+
return None
|
250
|
+
if isinstance(v, str):
|
251
|
+
if v.startswith('{'):
|
252
|
+
return from_json(v)
|
253
|
+
elif isinstance(v, dict):
|
254
|
+
return v
|
255
|
+
return None
|
256
|
+
|
257
|
+
@field_validator('viewport', mode='after')
|
258
|
+
@classmethod
|
259
|
+
def check_viewport(cls, v: dict[str, int] | None) -> dict[str, int] | None:
|
260
|
+
if not v:
|
261
|
+
return v
|
262
|
+
if 'width' in v and 'height' in v:
|
263
|
+
return v
|
264
|
+
raise CaptureSettingsError(f'A viewport must have a width and a height: {v}')
|
265
|
+
|
266
|
+
def redis_dump(self) -> Mapping[str | bytes, bytes | float | int | str]:
|
267
|
+
mapping_capture: dict[str | bytes, bytes | float | int | str] = {}
|
268
|
+
for key, value in dict(self).items():
|
269
|
+
if value is None:
|
270
|
+
continue
|
271
|
+
if isinstance(value, bool):
|
272
|
+
mapping_capture[key] = 1 if value else 0
|
273
|
+
elif isinstance(value, (list, dict)):
|
274
|
+
if value:
|
275
|
+
mapping_capture[key] = json.dumps(value)
|
276
|
+
elif isinstance(value, (bytes, float, int, str)) and value not in ['', b'']: # we're ok with 0 for example
|
277
|
+
mapping_capture[key] = value
|
278
|
+
return mapping_capture
|
lacuscore/lacuscore.py
CHANGED
@@ -4,7 +4,6 @@ from __future__ import annotations
|
|
4
4
|
|
5
5
|
import asyncio
|
6
6
|
import hashlib
|
7
|
-
import json
|
8
7
|
import logging
|
9
8
|
import os
|
10
9
|
import pickle
|
@@ -19,7 +18,6 @@ from asyncio import Task
|
|
19
18
|
from base64 import b64decode, b64encode
|
20
19
|
from datetime import date, timedelta
|
21
20
|
from ipaddress import ip_address, IPv4Address, IPv6Address
|
22
|
-
from pathlib import Path
|
23
21
|
from tempfile import NamedTemporaryFile
|
24
22
|
from typing import Literal, Any, overload, cast, Iterator
|
25
23
|
from uuid import uuid4
|
@@ -29,8 +27,8 @@ from dns import resolver
|
|
29
27
|
from dns.exception import DNSException
|
30
28
|
from dns.exception import Timeout as DNSTimeout
|
31
29
|
|
32
|
-
from
|
33
|
-
from
|
30
|
+
from playwrightcapture import Capture, PlaywrightCaptureException, InvalidPlaywrightParameter
|
31
|
+
from pydantic import ValidationError
|
34
32
|
from redis import Redis
|
35
33
|
from redis.exceptions import ConnectionError as RedisConnectionError
|
36
34
|
from redis.exceptions import DataError
|
@@ -114,7 +112,7 @@ class LacusCore():
|
|
114
112
|
return bool(self.redis.ping())
|
115
113
|
|
116
114
|
@overload
|
117
|
-
def enqueue(self, *, settings:
|
115
|
+
def enqueue(self, *, settings: dict[str, Any] | None=None) -> str:
|
118
116
|
...
|
119
117
|
|
120
118
|
@overload
|
@@ -127,7 +125,7 @@ class LacusCore():
|
|
127
125
|
proxy: str | dict[str, str] | None=None,
|
128
126
|
general_timeout_in_sec: int | None=None,
|
129
127
|
cookies: list[dict[str, Any]] | None=None,
|
130
|
-
headers:
|
128
|
+
headers: dict[str, str] | None=None,
|
131
129
|
http_credentials: dict[str, str] | None=None,
|
132
130
|
geolocation: dict[str, float] | None=None,
|
133
131
|
timezone_id: str | None=None,
|
@@ -146,7 +144,7 @@ class LacusCore():
|
|
146
144
|
...
|
147
145
|
|
148
146
|
def enqueue(self, *,
|
149
|
-
settings:
|
147
|
+
settings: dict[str, Any] | None=None,
|
150
148
|
url: str | None=None,
|
151
149
|
document_name: str | None=None, document: str | None=None,
|
152
150
|
depth: int=0,
|
@@ -155,7 +153,7 @@ class LacusCore():
|
|
155
153
|
proxy: str | dict[str, str] | None=None,
|
156
154
|
general_timeout_in_sec: int | None=None,
|
157
155
|
cookies: list[dict[str, Any]] | None=None,
|
158
|
-
headers:
|
156
|
+
headers: dict[str, str] | None=None,
|
159
157
|
http_credentials: dict[str, str] | None=None,
|
160
158
|
geolocation: dict[str, float] | None=None,
|
161
159
|
timezone_id: str | None=None,
|
@@ -177,7 +175,7 @@ class LacusCore():
|
|
177
175
|
|
178
176
|
:param url: URL to capture (incompatible with document and document_name)
|
179
177
|
:param document_name: Filename of the document to capture (required if document is used)
|
180
|
-
:param document: Document to capture itself (requires a document_name)
|
178
|
+
:param document: Document to capture itself (requires a document_name), must be base64 encoded
|
181
179
|
:param depth: [Dangerous] Depth of the capture. If > 0, the URLs of the rendered document will be extracted and captured. It can take a very long time.
|
182
180
|
:param browser: The browser to use for the capture
|
183
181
|
:param device_name: The name of the device, must be something Playwright knows
|
@@ -203,56 +201,24 @@ class LacusCore():
|
|
203
201
|
|
204
202
|
:return: UUID, reference to the capture for later use
|
205
203
|
"""
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
if browser:
|
225
|
-
to_enqueue['browser'] = browser
|
226
|
-
if device_name:
|
227
|
-
to_enqueue['device_name'] = device_name
|
228
|
-
if user_agent:
|
229
|
-
to_enqueue['user_agent'] = user_agent
|
230
|
-
if proxy:
|
231
|
-
to_enqueue['proxy'] = proxy
|
232
|
-
if general_timeout_in_sec is not None: # that would be a terrible idea, but this one could be 0
|
233
|
-
to_enqueue['general_timeout_in_sec'] = general_timeout_in_sec
|
234
|
-
if cookies:
|
235
|
-
to_enqueue['cookies'] = cookies
|
236
|
-
if headers:
|
237
|
-
to_enqueue['headers'] = headers
|
238
|
-
if http_credentials:
|
239
|
-
to_enqueue['http_credentials'] = http_credentials
|
240
|
-
if geolocation:
|
241
|
-
to_enqueue['geolocation'] = geolocation
|
242
|
-
if timezone_id:
|
243
|
-
to_enqueue['timezone_id'] = timezone_id
|
244
|
-
if locale:
|
245
|
-
to_enqueue['locale'] = locale
|
246
|
-
if color_scheme:
|
247
|
-
to_enqueue['color_scheme'] = color_scheme
|
248
|
-
if viewport:
|
249
|
-
to_enqueue['viewport'] = viewport
|
250
|
-
if referer:
|
251
|
-
to_enqueue['referer'] = referer
|
252
|
-
if with_favicon:
|
253
|
-
to_enqueue['with_favicon'] = with_favicon
|
254
|
-
if allow_tracking:
|
255
|
-
to_enqueue['allow_tracking'] = allow_tracking
|
204
|
+
if not settings:
|
205
|
+
settings = {'depth': depth, 'rendered_hostname_only': rendered_hostname_only,
|
206
|
+
'url': url, 'document_name': document_name, 'document': document,
|
207
|
+
'browser': browser, 'device_name': device_name,
|
208
|
+
'user_agent': user_agent, 'proxy': proxy,
|
209
|
+
'general_timeout_in_sec': general_timeout_in_sec,
|
210
|
+
'cookies': cookies, 'headers': headers,
|
211
|
+
'http_credentials': http_credentials, 'geolocation': geolocation,
|
212
|
+
'timezone_id': timezone_id, 'locale': locale,
|
213
|
+
'color_scheme': color_scheme, 'viewport': viewport,
|
214
|
+
'referer': referer, 'with_favicon': with_favicon,
|
215
|
+
'allow_tracking': allow_tracking}
|
216
|
+
|
217
|
+
try:
|
218
|
+
to_enqueue = CaptureSettings(**settings)
|
219
|
+
except ValidationError as e:
|
220
|
+
self.master_logger.warning(f'Unable to validate settings: {e}.')
|
221
|
+
raise CaptureSettingsError('Invalid settings', e)
|
256
222
|
|
257
223
|
hash_query = hashlib.sha512(pickle.dumps(to_enqueue)).hexdigest()
|
258
224
|
if not force:
|
@@ -271,21 +237,9 @@ class LacusCore():
|
|
271
237
|
else:
|
272
238
|
perma_uuid = str(uuid4())
|
273
239
|
|
274
|
-
mapping_capture: dict[str, bytes | float | int | str] = {}
|
275
|
-
for key, value in to_enqueue.items():
|
276
|
-
if value is None:
|
277
|
-
continue
|
278
|
-
if isinstance(value, bool):
|
279
|
-
mapping_capture[key] = 1 if value else 0
|
280
|
-
elif isinstance(value, (list, dict)):
|
281
|
-
if value:
|
282
|
-
mapping_capture[key] = json.dumps(value)
|
283
|
-
elif isinstance(value, (bytes, float, int, str)) and value not in ['', b'']: # we're ok with 0 for example
|
284
|
-
mapping_capture[key] = value
|
285
|
-
|
286
240
|
p = self.redis.pipeline()
|
287
241
|
p.set(f'lacus:query_hash:{hash_query}', perma_uuid, nx=True, ex=recapture_interval)
|
288
|
-
p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=
|
242
|
+
p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=to_enqueue.redis_dump())
|
289
243
|
p.zadd('lacus:to_capture', {perma_uuid: priority if priority is not None else 0})
|
290
244
|
try:
|
291
245
|
p.execute()
|
@@ -399,77 +353,33 @@ class LacusCore():
|
|
399
353
|
|
400
354
|
retry = False
|
401
355
|
try:
|
402
|
-
setting_keys = ['depth', 'rendered_hostname_only', 'url', 'document_name',
|
403
|
-
'document', 'browser', 'device_name', 'user_agent', 'proxy',
|
404
|
-
'general_timeout_in_sec', 'cookies', 'headers', 'http_credentials',
|
405
|
-
'viewport', 'referer', 'geolocation', 'timezone_id', 'locale',
|
406
|
-
'color_scheme', 'with_favicon', 'allow_tracking']
|
407
356
|
result: CaptureResponse = {}
|
408
|
-
|
409
|
-
document_as_bytes = b''
|
357
|
+
_to_capture: dict[bytes, Any] = {}
|
410
358
|
url: str = ''
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
'referer', 'timezone_id', 'locale', 'color_scheme']:
|
417
|
-
# string
|
418
|
-
to_capture[k] = v.decode() # type: ignore[literal-required]
|
419
|
-
elif k in ['cookies', 'http_credentials', 'viewport', 'geolocation']:
|
420
|
-
# dicts or list
|
421
|
-
to_capture[k] = json.loads(v) # type: ignore[literal-required]
|
422
|
-
elif k in ['proxy', 'headers']:
|
423
|
-
# can be dict or str
|
424
|
-
try:
|
425
|
-
to_capture[k] = json.loads(v) # type: ignore[literal-required]
|
426
|
-
except Exception:
|
427
|
-
to_capture[k] = v.decode() # type: ignore[literal-required]
|
428
|
-
elif k in ['general_timeout_in_sec', 'depth']:
|
429
|
-
# int
|
430
|
-
to_capture[k] = int(v) # type: ignore[literal-required]
|
431
|
-
elif k in ['rendered_hostname_only', 'with_favicon', 'allow_tracking']:
|
432
|
-
# bool
|
433
|
-
to_capture[k] = bool(int(v)) # type: ignore[literal-required]
|
434
|
-
elif k == 'document':
|
435
|
-
document_as_bytes = b64decode(v)
|
436
|
-
else:
|
437
|
-
raise CaptureSettingsError(f'Unexpected setting: {k}: {v}')
|
438
|
-
except CaptureSettingsError as e:
|
439
|
-
raise e
|
440
|
-
except Exception as e:
|
441
|
-
raise CaptureSettingsError(f'Error while preparing settings: {e}')
|
359
|
+
_to_capture = self.redis.hgetall(f'lacus:capture_settings:{uuid}')
|
360
|
+
|
361
|
+
if not _to_capture:
|
362
|
+
result = {'error': f'No capture settings for {uuid}'}
|
363
|
+
raise CaptureError(f'No capture settings for {uuid}')
|
442
364
|
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
365
|
+
try:
|
366
|
+
to_capture = CaptureSettings(**{k.decode(): v.decode() for k, v in _to_capture.items()})
|
367
|
+
except ValidationError as e:
|
368
|
+
logger.warning(f'Settings invalid: {e}')
|
369
|
+
raise CaptureSettingsError('Invalid settings', e)
|
447
370
|
|
448
|
-
if
|
371
|
+
if to_capture.document:
|
449
372
|
# we do not have a URL yet.
|
450
|
-
|
451
|
-
|
452
|
-
raise CaptureSettingsError('No document name provided, settings are invalid')
|
453
|
-
if not Path(name).suffix:
|
454
|
-
# The browser will simply display the file as text if there is no extension.
|
455
|
-
# Just add HTML as a fallback, as it will be the most common one.
|
456
|
-
name = f'{name}.html'
|
457
|
-
document_name = Path(name).name
|
458
|
-
tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
|
373
|
+
document_as_bytes = b64decode(to_capture.document)
|
374
|
+
tmp_f = NamedTemporaryFile(suffix=to_capture.document_name, delete=False)
|
459
375
|
with open(tmp_f.name, "wb") as f:
|
460
376
|
f.write(document_as_bytes)
|
461
377
|
url = f'file://{tmp_f.name}'
|
462
|
-
elif to_capture.
|
463
|
-
|
464
|
-
url = refang(url) # In case we get a defanged url at this stage.
|
465
|
-
if url.lower().startswith('file:') and self.only_global_lookups:
|
378
|
+
elif to_capture.url:
|
379
|
+
if to_capture.url.lower().startswith('file:') and self.only_global_lookups:
|
466
380
|
result = {'error': f'Not allowed to capture a file on disk: {url}'}
|
467
381
|
raise CaptureError(f'Not allowed to capture a file on disk: {url}')
|
468
|
-
|
469
|
-
and not url.lower().startswith('http:')
|
470
|
-
and not url.lower().startswith('https:')
|
471
|
-
and not url.lower().startswith('file:')):
|
472
|
-
url = f'http://{url}'
|
382
|
+
url = to_capture.url
|
473
383
|
else:
|
474
384
|
result = {'error': f'No valid URL to capture for {uuid} - {to_capture}'}
|
475
385
|
raise CaptureError(f'No valid URL to capture for {uuid} - {to_capture}')
|
@@ -479,7 +389,7 @@ class LacusCore():
|
|
479
389
|
except Exception as e:
|
480
390
|
result = {'error': f'Invalid URL: {url} - {e}'}
|
481
391
|
raise CaptureError(f'Invalid URL: {url} - {e}')
|
482
|
-
proxy = to_capture.
|
392
|
+
proxy = to_capture.proxy
|
483
393
|
if self.tor_proxy:
|
484
394
|
# check if onion or forced
|
485
395
|
if (proxy == 'force_tor' # if the proxy is set to "force_tor", we use the pre-configured tor proxy, regardless of the URL.
|
@@ -521,9 +431,12 @@ class LacusCore():
|
|
521
431
|
result = {'error': f'Unable to find hostname or IP in the query: "{url}".'}
|
522
432
|
raise CaptureError(f'Unable to find hostname or IP in the query: "{url}".')
|
523
433
|
|
434
|
+
# Set default as chromium
|
524
435
|
browser_engine: BROWSER = "chromium"
|
525
|
-
if to_capture.
|
526
|
-
|
436
|
+
if to_capture.browser:
|
437
|
+
browser_engine = to_capture.browser
|
438
|
+
elif to_capture.user_agent:
|
439
|
+
parsed_string = user_agent_parser.ParseUserAgent(to_capture.user_agent)
|
527
440
|
browser_family = parsed_string['family'].lower()
|
528
441
|
if browser_family.startswith('chrom'):
|
529
442
|
browser_engine = 'chromium'
|
@@ -533,13 +446,14 @@ class LacusCore():
|
|
533
446
|
browser_engine = 'webkit'
|
534
447
|
|
535
448
|
cookies: list[dict[str, Any]] = []
|
536
|
-
if to_capture.
|
449
|
+
if to_capture.cookies:
|
537
450
|
# In order to properly pass the cookies to playwright,
|
538
451
|
# each of them must have a name, a value and either a domain + path or a URL
|
539
452
|
# Name and value are mandatory, and we cannot auto-fill them.
|
540
453
|
# If the cookie doesn't have a domain + path OR a URL, we fill the domain
|
541
454
|
# with the hostname of the URL we try to capture and the path with "/"
|
542
|
-
|
455
|
+
# NOTE: these changes can only be done here because we need the URL.
|
456
|
+
for cookie in to_capture.cookies:
|
543
457
|
if len(cookie) == 1:
|
544
458
|
# we have a cookie in the format key: value
|
545
459
|
name, value = cookie.popitem()
|
@@ -557,21 +471,21 @@ class LacusCore():
|
|
557
471
|
stats_pipeline.sadd(f'stats:{today}:captures', url)
|
558
472
|
async with Capture(
|
559
473
|
browser=browser_engine,
|
560
|
-
device_name=to_capture.
|
474
|
+
device_name=to_capture.device_name,
|
561
475
|
proxy=proxy,
|
562
|
-
general_timeout_in_sec=to_capture.
|
476
|
+
general_timeout_in_sec=to_capture.general_timeout_in_sec,
|
563
477
|
loglevel=self.master_logger.getEffectiveLevel(),
|
564
478
|
uuid=uuid) as capture:
|
565
479
|
# required by Mypy: https://github.com/python/mypy/issues/3004
|
566
|
-
capture.headers = to_capture.
|
480
|
+
capture.headers = to_capture.headers # type: ignore[assignment]
|
567
481
|
capture.cookies = cookies # type: ignore[assignment]
|
568
|
-
capture.viewport = to_capture.
|
569
|
-
capture.user_agent = to_capture.
|
570
|
-
capture.http_credentials = to_capture.
|
571
|
-
capture.geolocation = to_capture.
|
572
|
-
capture.timezone_id = to_capture.
|
573
|
-
capture.locale = to_capture.
|
574
|
-
capture.color_scheme = to_capture.
|
482
|
+
capture.viewport = to_capture.viewport # type: ignore[assignment]
|
483
|
+
capture.user_agent = to_capture.user_agent # type: ignore[assignment]
|
484
|
+
capture.http_credentials = to_capture.http_credentials # type: ignore[assignment]
|
485
|
+
capture.geolocation = to_capture.geolocation # type: ignore[assignment]
|
486
|
+
capture.timezone_id = to_capture.timezone_id # type: ignore[assignment]
|
487
|
+
capture.locale = to_capture.locale # type: ignore[assignment]
|
488
|
+
capture.color_scheme = to_capture.color_scheme # type: ignore[assignment]
|
575
489
|
|
576
490
|
# make sure the initialization doesn't take too long
|
577
491
|
init_timeout = max(self.max_capture_time / 10, 5)
|
@@ -586,11 +500,11 @@ class LacusCore():
|
|
586
500
|
try:
|
587
501
|
async with timeout(self.max_capture_time) as capture_timeout:
|
588
502
|
playwright_result = await capture.capture_page(
|
589
|
-
url, referer=to_capture.
|
590
|
-
depth=to_capture.
|
591
|
-
rendered_hostname_only=to_capture.
|
592
|
-
with_favicon=to_capture.
|
593
|
-
allow_tracking=to_capture.
|
503
|
+
url, referer=to_capture.referer,
|
504
|
+
depth=to_capture.depth,
|
505
|
+
rendered_hostname_only=to_capture.rendered_hostname_only,
|
506
|
+
with_favicon=to_capture.with_favicon,
|
507
|
+
allow_tracking=to_capture.allow_tracking,
|
594
508
|
max_depth_capture_time=self.max_capture_time)
|
595
509
|
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
596
510
|
timeout_expired(capture_timeout, logger, 'Capture took too long.')
|
@@ -603,7 +517,7 @@ class LacusCore():
|
|
603
517
|
stats_pipeline.zincrby(f'stats:{today}:errors', 1, result['error_name'])
|
604
518
|
except RetryCapture as e:
|
605
519
|
raise e
|
606
|
-
except PlaywrightCaptureException as e:
|
520
|
+
except (PlaywrightCaptureException, InvalidPlaywrightParameter) as e:
|
607
521
|
logger.warning(f'Invalid parameters for the capture of {url} - {e}')
|
608
522
|
result = {'error': f'Invalid parameters for the capture of {url} - {e}'}
|
609
523
|
raise CaptureError(f'Invalid parameters for the capture of {url} - {e}')
|
@@ -669,7 +583,7 @@ class LacusCore():
|
|
669
583
|
# from the lacus:ongoing sorted set (it is definitely not ongoing anymore)
|
670
584
|
# and optionally re-added to lacus:to_capture if we want to retry it
|
671
585
|
|
672
|
-
if to_capture.
|
586
|
+
if to_capture.document:
|
673
587
|
os.unlink(tmp_f.name)
|
674
588
|
|
675
589
|
if retry:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.10.
|
3
|
+
Version: 1.10.2
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
Home-page: https://github.com/ail-project/LacusCore
|
6
6
|
License: BSD-3-Clause
|
@@ -28,7 +28,8 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
|
|
28
28
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
29
29
|
Requires-Dist: defang (>=0.5.3,<0.6.0)
|
30
30
|
Requires-Dist: dnspython (>=2.6.1,<3.0.0)
|
31
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.25.
|
31
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.25.6,<2.0.0)
|
32
|
+
Requires-Dist: pydantic (>=2.8.2,<3.0.0)
|
32
33
|
Requires-Dist: redis[hiredis] (>=5.0.7,<6.0.0)
|
33
34
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
34
35
|
Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
|
@@ -0,0 +1,10 @@
|
|
1
|
+
lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
|
2
|
+
lacuscore/helpers.py,sha256=y7LWHUMMOrB9-qWNoz_2zTP_yazf_opxXykhYjpNves,9640
|
3
|
+
lacuscore/lacus_monitoring.py,sha256=UOfE_1-_rhVeKJXQ_m9XxYkr7VwyQnA6iK-x_tcXJfo,2775
|
4
|
+
lacuscore/lacuscore.py,sha256=eQMJBf0ZYkCkACu3OWCdidpNEYn0wZ1piIVHP1kv_kU,39372
|
5
|
+
lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
lacuscore/task_logger.py,sha256=8WbdJdKnGeFCxt9gtCNLI9vAQQZbsy2I5PRQpHP7XFU,1916
|
7
|
+
lacuscore-1.10.2.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
|
8
|
+
lacuscore-1.10.2.dist-info/METADATA,sha256=xzU2dgvzO_avbhZm4cdy0DWoWR-R9IZwmLocuKBZjQg,2670
|
9
|
+
lacuscore-1.10.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
10
|
+
lacuscore-1.10.2.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
lacuscore/__init__.py,sha256=hM4lKoPNybDCUMWdXTVVI1gRk_riLvRZ7IwFbamZLzE,341
|
2
|
-
lacuscore/helpers.py,sha256=lULN7HhY-4a4HG-ybIt4jO3wEGTxkm_jKNqsGpNZo4Y,2711
|
3
|
-
lacuscore/lacus_monitoring.py,sha256=UOfE_1-_rhVeKJXQ_m9XxYkr7VwyQnA6iK-x_tcXJfo,2775
|
4
|
-
lacuscore/lacuscore.py,sha256=n0hjyuJpz5G4s5mRzTAtSCHaPtPqdMaTEO7o-2N88JI,44048
|
5
|
-
lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
lacuscore/task_logger.py,sha256=8WbdJdKnGeFCxt9gtCNLI9vAQQZbsy2I5PRQpHP7XFU,1916
|
7
|
-
lacuscore-1.10.0.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
|
8
|
-
lacuscore-1.10.0.dist-info/METADATA,sha256=PmC32J2hEc1hyXiX4wQOjAcnJDLlVIqvj-Yw8pJc13Q,2629
|
9
|
-
lacuscore-1.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
10
|
-
lacuscore-1.10.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|