lacuscore 1.10.0__py3-none-any.whl → 1.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lacuscore/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from .lacuscore import LacusCore
2
- from .helpers import CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings # noqa
2
+ from .helpers import (CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings, # noqa
3
+ LacusCoreException, CaptureError, RetryCapture, CaptureSettingsError) # noqa
3
4
  from .lacus_monitoring import LacusCoreMonitoring # noqa
4
5
 
5
6
  __all__ = [
@@ -8,5 +9,9 @@ __all__ = [
8
9
  'CaptureResponse',
9
10
  'CaptureResponseJson',
10
11
  'CaptureSettings',
11
- 'LacusCoreMonitoring'
12
+ 'LacusCoreMonitoring',
13
+ 'LacusCoreException',
14
+ 'CaptureError',
15
+ 'RetryCapture',
16
+ 'CaptureSettingsError'
12
17
  ]
lacuscore/helpers.py CHANGED
@@ -2,9 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import json
6
+
5
7
  from enum import IntEnum, unique
6
8
  from logging import LoggerAdapter
7
- from typing import MutableMapping, Any, TypedDict
9
+ from typing import MutableMapping, Any, TypedDict, Literal, Mapping
10
+
11
+ from defang import refang # type: ignore[import-untyped]
12
+ from pydantic import BaseModel, field_validator, model_validator, ValidationError
13
+ from pydantic_core import from_json
8
14
 
9
15
  from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
10
16
 
@@ -22,7 +28,11 @@ class RetryCapture(LacusCoreException):
22
28
 
23
29
 
24
30
  class CaptureSettingsError(LacusCoreException):
25
- pass
31
+ '''Can handle Pydantic validation errors'''
32
+
33
+ def __init__(self, message: str, pydantic_validation_errors: ValidationError | None=None) -> None:
34
+ super().__init__(message)
35
+ self.pydantic_validation_errors = pydantic_validation_errors
26
36
 
27
37
 
28
38
  class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
@@ -71,32 +81,198 @@ class CaptureResponseJson(TypedDict, total=False):
71
81
  potential_favicons: list[str] | None
72
82
 
73
83
 
74
- class CaptureSettings(TypedDict, total=False):
84
+ class CaptureSettings(BaseModel):
75
85
  '''The capture settings that can be passed to Lacus.'''
76
86
 
77
- url: str | None
78
- document_name: str | None
79
- document: str | None
80
- browser: str | None
81
- device_name: str | None
82
- user_agent: str | None
83
- proxy: str | dict[str, str] | None
84
- general_timeout_in_sec: int | None
85
- cookies: list[dict[str, Any]] | None
86
- headers: str | dict[str, str] | None
87
- http_credentials: dict[str, str] | None
88
- geolocation: dict[str, float] | None
89
- timezone_id: str | None
90
- locale: str | None
91
- color_scheme: str | None
92
- viewport: dict[str, int] | None
93
- referer: str | None
94
- with_favicon: bool
95
- allow_tracking: bool
96
- force: bool
97
- recapture_interval: int
98
- priority: int
99
- uuid: str | None
100
-
101
- depth: int
102
- rendered_hostname_only: bool # Note: only used if depth is > 0
87
+ url: str | None = None
88
+ document_name: str | None = None
89
+ document: str | None = None
90
+ browser: Literal['chromium', 'firefox', 'webkit'] | None = None
91
+ device_name: str | None = None
92
+ user_agent: str | None = None
93
+ proxy: str | dict[str, str] | None = None
94
+ general_timeout_in_sec: int | None = None
95
+ cookies: list[dict[str, Any]] | None = None
96
+ headers: dict[str, str] | None = None
97
+ http_credentials: dict[str, str] | None = None
98
+ geolocation: dict[str, float] | None = None
99
+ timezone_id: str | None = None
100
+ locale: str | None = None
101
+ color_scheme: str | None = None
102
+ viewport: dict[str, int] | None = None
103
+ referer: str | None = None
104
+ with_favicon: bool = False
105
+ allow_tracking: bool = False
106
+ force: bool = False
107
+ recapture_interval: int = 300
108
+ priority: int = 0
109
+ uuid: str | None = None
110
+
111
+ depth: int = 0
112
+ rendered_hostname_only: bool = True # Note: only used if depth is > 0
113
+
114
+ @model_validator(mode='after')
115
+ def check_capture_element(self) -> CaptureSettings:
116
+ if self.document_name and not self.document:
117
+ raise CaptureSettingsError('You must provide a document if you provide a document name')
118
+ if self.document and not self.document_name:
119
+ raise CaptureSettingsError('You must provide a document name if you provide a document')
120
+
121
+ if self.url and (self.document or self.document_name):
122
+ raise CaptureSettingsError('You cannot provide both a URL and a document to capture')
123
+ if not self.url and not (self.document and self.document_name):
124
+ raise CaptureSettingsError('You must provide either a URL or a document to capture')
125
+ return self
126
+
127
+ @field_validator('url', mode='after')
128
+ @classmethod
129
+ def load_url(cls, v: str | None) -> str | None:
130
+ if isinstance(v, str):
131
+ url = v.strip()
132
+ url = refang(url) # In case we get a defanged url at this stage.
133
+ if (not url.lower().startswith('data:')
134
+ and not url.lower().startswith('http:')
135
+ and not url.lower().startswith('https:')
136
+ and not url.lower().startswith('file:')):
137
+ url = f'http://{url}'
138
+ return url
139
+ return v
140
+
141
+ @field_validator('document_name', mode='after')
142
+ @classmethod
143
+ def load_document_name(cls, v: str | None) -> str | None:
144
+ if isinstance(v, str):
145
+ name = v.strip()
146
+ if '.' not in name:
147
+ # The browser will simply display the file as text if there is no extension.
148
+ # Just add HTML as a fallback, as it will be the most common one.
149
+ name = f'{name}.html'
150
+ return name
151
+ return v
152
+
153
+ @field_validator('proxy', mode='before')
154
+ @classmethod
155
+ def load_proxy_json(cls, v: Any) -> str | dict[str, str] | None:
156
+ if not v:
157
+ return None
158
+ if isinstance(v, str):
159
+ if v.startswith('{'):
160
+ return from_json(v)
161
+ # Just the proxy
162
+ return v
163
+ elif isinstance(v, dict):
164
+ return v
165
+ return None
166
+
167
+ @field_validator('cookies', mode='before')
168
+ @classmethod
169
+ def load_cookies_json(cls, v: Any) -> list[dict[str, Any]] | None:
170
+ if not v:
171
+ return None
172
+ if isinstance(v, str):
173
+ if v.startswith('['):
174
+ return from_json(v)
175
+ # Cookies are invalid, ignoring.
176
+ elif isinstance(v, list):
177
+ return v
178
+ return None
179
+
180
+ @field_validator('headers', mode='before')
181
+ @classmethod
182
+ def load_headers_json(cls, v: Any) -> dict[str, str] | None:
183
+ if not v:
184
+ return None
185
+ if isinstance(v, str):
186
+ if v[0] == '{':
187
+ return from_json(v)
188
+ else:
189
+ # make it a dict
190
+ new_headers = {}
191
+ for header_line in v.splitlines():
192
+ if header_line and ':' in header_line:
193
+ splitted = header_line.split(':', 1)
194
+ if splitted and len(splitted) == 2:
195
+ header, h_value = splitted
196
+ if header.strip() and h_value.strip():
197
+ new_headers[header.strip()] = h_value.strip()
198
+ return new_headers
199
+ elif isinstance(v, dict):
200
+ return v
201
+ return None
202
+
203
+ @field_validator('http_credentials', mode='before')
204
+ @classmethod
205
+ def load_http_creds_json(cls, v: Any) -> dict[str, str] | None:
206
+ if not v:
207
+ return None
208
+ if isinstance(v, str):
209
+ if v.startswith('{'):
210
+ return from_json(v)
211
+ elif isinstance(v, dict):
212
+ return v
213
+ return None
214
+
215
+ @field_validator('http_credentials', mode='after')
216
+ @classmethod
217
+ def check_http_creds(cls, v: dict[str, str] | None) -> dict[str, str] | None:
218
+ if not v:
219
+ return v
220
+ if 'username' in v and 'password' in v:
221
+ return v
222
+ raise CaptureSettingsError(f'HTTP credentials must have a username and a password: {v}')
223
+
224
+ @field_validator('geolocation', mode='before')
225
+ @classmethod
226
+ def load_geolocation_json(cls, v: Any) -> dict[str, float] | None:
227
+ if not v:
228
+ return None
229
+ if isinstance(v, str):
230
+ if v.startswith('{'):
231
+ return from_json(v)
232
+ elif isinstance(v, dict):
233
+ return v
234
+ return None
235
+
236
+ @field_validator('geolocation', mode='after')
237
+ @classmethod
238
+ def check_geolocation(cls, v: dict[str, float] | None) -> dict[str, float] | None:
239
+ if not v:
240
+ return v
241
+ if 'latitude' in v and 'longitude' in v:
242
+ return v
243
+ raise CaptureSettingsError(f'A geolocation must have a latitude and a longitude: {v}')
244
+
245
+ @field_validator('viewport', mode='before')
246
+ @classmethod
247
+ def load_viewport_json(cls, v: Any) -> dict[str, int] | None:
248
+ if not v:
249
+ return None
250
+ if isinstance(v, str):
251
+ if v.startswith('{'):
252
+ return from_json(v)
253
+ elif isinstance(v, dict):
254
+ return v
255
+ return None
256
+
257
+ @field_validator('viewport', mode='after')
258
+ @classmethod
259
+ def check_viewport(cls, v: dict[str, int] | None) -> dict[str, int] | None:
260
+ if not v:
261
+ return v
262
+ if 'width' in v and 'height' in v:
263
+ return v
264
+ raise CaptureSettingsError(f'A viewport must have a width and a height: {v}')
265
+
266
+ def redis_dump(self) -> Mapping[str | bytes, bytes | float | int | str]:
267
+ mapping_capture: dict[str | bytes, bytes | float | int | str] = {}
268
+ for key, value in dict(self).items():
269
+ if value is None:
270
+ continue
271
+ if isinstance(value, bool):
272
+ mapping_capture[key] = 1 if value else 0
273
+ elif isinstance(value, (list, dict)):
274
+ if value:
275
+ mapping_capture[key] = json.dumps(value)
276
+ elif isinstance(value, (bytes, float, int, str)) and value not in ['', b'']: # we're ok with 0 for example
277
+ mapping_capture[key] = value
278
+ return mapping_capture
lacuscore/lacuscore.py CHANGED
@@ -4,7 +4,6 @@ from __future__ import annotations
4
4
 
5
5
  import asyncio
6
6
  import hashlib
7
- import json
8
7
  import logging
9
8
  import os
10
9
  import pickle
@@ -19,7 +18,6 @@ from asyncio import Task
19
18
  from base64 import b64decode, b64encode
20
19
  from datetime import date, timedelta
21
20
  from ipaddress import ip_address, IPv4Address, IPv6Address
22
- from pathlib import Path
23
21
  from tempfile import NamedTemporaryFile
24
22
  from typing import Literal, Any, overload, cast, Iterator
25
23
  from uuid import uuid4
@@ -29,8 +27,8 @@ from dns import resolver
29
27
  from dns.exception import DNSException
30
28
  from dns.exception import Timeout as DNSTimeout
31
29
 
32
- from defang import refang # type: ignore[import-untyped]
33
- from playwrightcapture import Capture, PlaywrightCaptureException
30
+ from playwrightcapture import Capture, PlaywrightCaptureException, InvalidPlaywrightParameter
31
+ from pydantic import ValidationError
34
32
  from redis import Redis
35
33
  from redis.exceptions import ConnectionError as RedisConnectionError
36
34
  from redis.exceptions import DataError
@@ -114,7 +112,7 @@ class LacusCore():
114
112
  return bool(self.redis.ping())
115
113
 
116
114
  @overload
117
- def enqueue(self, *, settings: CaptureSettings | None=None) -> str:
115
+ def enqueue(self, *, settings: dict[str, Any] | None=None) -> str:
118
116
  ...
119
117
 
120
118
  @overload
@@ -127,7 +125,7 @@ class LacusCore():
127
125
  proxy: str | dict[str, str] | None=None,
128
126
  general_timeout_in_sec: int | None=None,
129
127
  cookies: list[dict[str, Any]] | None=None,
130
- headers: str | dict[str, str] | None=None,
128
+ headers: dict[str, str] | None=None,
131
129
  http_credentials: dict[str, str] | None=None,
132
130
  geolocation: dict[str, float] | None=None,
133
131
  timezone_id: str | None=None,
@@ -146,7 +144,7 @@ class LacusCore():
146
144
  ...
147
145
 
148
146
  def enqueue(self, *,
149
- settings: CaptureSettings | None=None,
147
+ settings: dict[str, Any] | None=None,
150
148
  url: str | None=None,
151
149
  document_name: str | None=None, document: str | None=None,
152
150
  depth: int=0,
@@ -155,7 +153,7 @@ class LacusCore():
155
153
  proxy: str | dict[str, str] | None=None,
156
154
  general_timeout_in_sec: int | None=None,
157
155
  cookies: list[dict[str, Any]] | None=None,
158
- headers: str | dict[str, str] | None=None,
156
+ headers: dict[str, str] | None=None,
159
157
  http_credentials: dict[str, str] | None=None,
160
158
  geolocation: dict[str, float] | None=None,
161
159
  timezone_id: str | None=None,
@@ -177,7 +175,7 @@ class LacusCore():
177
175
 
178
176
  :param url: URL to capture (incompatible with document and document_name)
179
177
  :param document_name: Filename of the document to capture (required if document is used)
180
- :param document: Document to capture itself (requires a document_name)
178
+ :param document: Document to capture itself (requires a document_name), must be base64 encoded
181
179
  :param depth: [Dangerous] Depth of the capture. If > 0, the URLs of the rendered document will be extracted and captured. It can take a very long time.
182
180
  :param browser: The browser to use for the capture
183
181
  :param device_name: The name of the device, must be something Playwright knows
@@ -203,56 +201,24 @@ class LacusCore():
203
201
 
204
202
  :return: UUID, reference to the capture for later use
205
203
  """
206
- to_enqueue: CaptureSettings
207
- if settings:
208
- if 'url' in settings and settings['url'] is not None:
209
- settings['url'] = settings['url'].strip()
210
- if settings.get('force') is not None:
211
- force = settings.pop('force', False)
212
- if settings.get('recapture_interval') is not None:
213
- recapture_interval = settings.pop('recapture_interval', 300)
214
- if settings.get('priority') is not None:
215
- priority = settings.pop('priority', 0)
216
- to_enqueue = settings
217
- else:
218
- to_enqueue = {'depth': depth, 'rendered_hostname_only': rendered_hostname_only}
219
- if url:
220
- to_enqueue['url'] = url.strip()
221
- elif document_name and document:
222
- to_enqueue['document_name'] = _secure_filename(document_name)
223
- to_enqueue['document'] = document
224
- if browser:
225
- to_enqueue['browser'] = browser
226
- if device_name:
227
- to_enqueue['device_name'] = device_name
228
- if user_agent:
229
- to_enqueue['user_agent'] = user_agent
230
- if proxy:
231
- to_enqueue['proxy'] = proxy
232
- if general_timeout_in_sec is not None: # that would be a terrible idea, but this one could be 0
233
- to_enqueue['general_timeout_in_sec'] = general_timeout_in_sec
234
- if cookies:
235
- to_enqueue['cookies'] = cookies
236
- if headers:
237
- to_enqueue['headers'] = headers
238
- if http_credentials:
239
- to_enqueue['http_credentials'] = http_credentials
240
- if geolocation:
241
- to_enqueue['geolocation'] = geolocation
242
- if timezone_id:
243
- to_enqueue['timezone_id'] = timezone_id
244
- if locale:
245
- to_enqueue['locale'] = locale
246
- if color_scheme:
247
- to_enqueue['color_scheme'] = color_scheme
248
- if viewport:
249
- to_enqueue['viewport'] = viewport
250
- if referer:
251
- to_enqueue['referer'] = referer
252
- if with_favicon:
253
- to_enqueue['with_favicon'] = with_favicon
254
- if allow_tracking:
255
- to_enqueue['allow_tracking'] = allow_tracking
204
+ if not settings:
205
+ settings = {'depth': depth, 'rendered_hostname_only': rendered_hostname_only,
206
+ 'url': url, 'document_name': document_name, 'document': document,
207
+ 'browser': browser, 'device_name': device_name,
208
+ 'user_agent': user_agent, 'proxy': proxy,
209
+ 'general_timeout_in_sec': general_timeout_in_sec,
210
+ 'cookies': cookies, 'headers': headers,
211
+ 'http_credentials': http_credentials, 'geolocation': geolocation,
212
+ 'timezone_id': timezone_id, 'locale': locale,
213
+ 'color_scheme': color_scheme, 'viewport': viewport,
214
+ 'referer': referer, 'with_favicon': with_favicon,
215
+ 'allow_tracking': allow_tracking}
216
+
217
+ try:
218
+ to_enqueue = CaptureSettings(**settings)
219
+ except ValidationError as e:
220
+ self.master_logger.warning(f'Unable to validate settings: {e}.')
221
+ raise CaptureSettingsError('Invalid settings', e)
256
222
 
257
223
  hash_query = hashlib.sha512(pickle.dumps(to_enqueue)).hexdigest()
258
224
  if not force:
@@ -271,21 +237,9 @@ class LacusCore():
271
237
  else:
272
238
  perma_uuid = str(uuid4())
273
239
 
274
- mapping_capture: dict[str, bytes | float | int | str] = {}
275
- for key, value in to_enqueue.items():
276
- if value is None:
277
- continue
278
- if isinstance(value, bool):
279
- mapping_capture[key] = 1 if value else 0
280
- elif isinstance(value, (list, dict)):
281
- if value:
282
- mapping_capture[key] = json.dumps(value)
283
- elif isinstance(value, (bytes, float, int, str)) and value not in ['', b'']: # we're ok with 0 for example
284
- mapping_capture[key] = value
285
-
286
240
  p = self.redis.pipeline()
287
241
  p.set(f'lacus:query_hash:{hash_query}', perma_uuid, nx=True, ex=recapture_interval)
288
- p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=mapping_capture) # type: ignore[arg-type]
242
+ p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=to_enqueue.redis_dump())
289
243
  p.zadd('lacus:to_capture', {perma_uuid: priority if priority is not None else 0})
290
244
  try:
291
245
  p.execute()
@@ -399,77 +353,33 @@ class LacusCore():
399
353
 
400
354
  retry = False
401
355
  try:
402
- setting_keys = ['depth', 'rendered_hostname_only', 'url', 'document_name',
403
- 'document', 'browser', 'device_name', 'user_agent', 'proxy',
404
- 'general_timeout_in_sec', 'cookies', 'headers', 'http_credentials',
405
- 'viewport', 'referer', 'geolocation', 'timezone_id', 'locale',
406
- 'color_scheme', 'with_favicon', 'allow_tracking']
407
356
  result: CaptureResponse = {}
408
- to_capture: CaptureSettings = {}
409
- document_as_bytes = b''
357
+ _to_capture: dict[bytes, Any] = {}
410
358
  url: str = ''
411
- try:
412
- for k, v in zip(setting_keys, self.redis.hmget(f'lacus:capture_settings:{uuid}', setting_keys)):
413
- if v is None:
414
- continue
415
- if k in ['url', 'document_name', 'browser', 'device_name', 'user_agent',
416
- 'referer', 'timezone_id', 'locale', 'color_scheme']:
417
- # string
418
- to_capture[k] = v.decode() # type: ignore[literal-required]
419
- elif k in ['cookies', 'http_credentials', 'viewport', 'geolocation']:
420
- # dicts or list
421
- to_capture[k] = json.loads(v) # type: ignore[literal-required]
422
- elif k in ['proxy', 'headers']:
423
- # can be dict or str
424
- try:
425
- to_capture[k] = json.loads(v) # type: ignore[literal-required]
426
- except Exception:
427
- to_capture[k] = v.decode() # type: ignore[literal-required]
428
- elif k in ['general_timeout_in_sec', 'depth']:
429
- # int
430
- to_capture[k] = int(v) # type: ignore[literal-required]
431
- elif k in ['rendered_hostname_only', 'with_favicon', 'allow_tracking']:
432
- # bool
433
- to_capture[k] = bool(int(v)) # type: ignore[literal-required]
434
- elif k == 'document':
435
- document_as_bytes = b64decode(v)
436
- else:
437
- raise CaptureSettingsError(f'Unexpected setting: {k}: {v}')
438
- except CaptureSettingsError as e:
439
- raise e
440
- except Exception as e:
441
- raise CaptureSettingsError(f'Error while preparing settings: {e}')
359
+ _to_capture = self.redis.hgetall(f'lacus:capture_settings:{uuid}')
360
+
361
+ if not _to_capture:
362
+ result = {'error': f'No capture settings for {uuid}'}
363
+ raise CaptureError(f'No capture settings for {uuid}')
442
364
 
443
- if not to_capture:
444
- all_entries = self.redis.hgetall(f'lacus:capture_settings:{uuid}')
445
- result = {'error': f'No capture settings for {uuid} - {all_entries}'}
446
- raise CaptureError(f'No capture settings for {uuid} - {all_entries}')
365
+ try:
366
+ to_capture = CaptureSettings(**{k.decode(): v.decode() for k, v in _to_capture.items()})
367
+ except ValidationError as e:
368
+ logger.warning(f'Settings invalid: {e}')
369
+ raise CaptureSettingsError('Invalid settings', e)
447
370
 
448
- if document_as_bytes:
371
+ if to_capture.document:
449
372
  # we do not have a URL yet.
450
- name = to_capture.pop('document_name', None)
451
- if not name:
452
- raise CaptureSettingsError('No document name provided, settings are invalid')
453
- if not Path(name).suffix:
454
- # The browser will simply display the file as text if there is no extension.
455
- # Just add HTML as a fallback, as it will be the most common one.
456
- name = f'{name}.html'
457
- document_name = Path(name).name
458
- tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
373
+ document_as_bytes = b64decode(to_capture.document)
374
+ tmp_f = NamedTemporaryFile(suffix=to_capture.document_name, delete=False)
459
375
  with open(tmp_f.name, "wb") as f:
460
376
  f.write(document_as_bytes)
461
377
  url = f'file://{tmp_f.name}'
462
- elif to_capture.get('url') and to_capture['url'] is not None:
463
- url = to_capture['url'].strip()
464
- url = refang(url) # In case we get a defanged url at this stage.
465
- if url.lower().startswith('file:') and self.only_global_lookups:
378
+ elif to_capture.url:
379
+ if to_capture.url.lower().startswith('file:') and self.only_global_lookups:
466
380
  result = {'error': f'Not allowed to capture a file on disk: {url}'}
467
381
  raise CaptureError(f'Not allowed to capture a file on disk: {url}')
468
- if (not url.lower().startswith('data:')
469
- and not url.lower().startswith('http:')
470
- and not url.lower().startswith('https:')
471
- and not url.lower().startswith('file:')):
472
- url = f'http://{url}'
382
+ url = to_capture.url
473
383
  else:
474
384
  result = {'error': f'No valid URL to capture for {uuid} - {to_capture}'}
475
385
  raise CaptureError(f'No valid URL to capture for {uuid} - {to_capture}')
@@ -479,7 +389,7 @@ class LacusCore():
479
389
  except Exception as e:
480
390
  result = {'error': f'Invalid URL: {url} - {e}'}
481
391
  raise CaptureError(f'Invalid URL: {url} - {e}')
482
- proxy = to_capture.get('proxy')
392
+ proxy = to_capture.proxy
483
393
  if self.tor_proxy:
484
394
  # check if onion or forced
485
395
  if (proxy == 'force_tor' # if the proxy is set to "force_tor", we use the pre-configured tor proxy, regardless of the URL.
@@ -521,9 +431,12 @@ class LacusCore():
521
431
  result = {'error': f'Unable to find hostname or IP in the query: "{url}".'}
522
432
  raise CaptureError(f'Unable to find hostname or IP in the query: "{url}".')
523
433
 
434
+ # Set default as chromium
524
435
  browser_engine: BROWSER = "chromium"
525
- if to_capture.get('user_agent'):
526
- parsed_string = user_agent_parser.ParseUserAgent(to_capture.get('user_agent'))
436
+ if to_capture.browser:
437
+ browser_engine = to_capture.browser
438
+ elif to_capture.user_agent:
439
+ parsed_string = user_agent_parser.ParseUserAgent(to_capture.user_agent)
527
440
  browser_family = parsed_string['family'].lower()
528
441
  if browser_family.startswith('chrom'):
529
442
  browser_engine = 'chromium'
@@ -533,13 +446,14 @@ class LacusCore():
533
446
  browser_engine = 'webkit'
534
447
 
535
448
  cookies: list[dict[str, Any]] = []
536
- if to_capture.get('cookies') and to_capture['cookies'] is not None:
449
+ if to_capture.cookies:
537
450
  # In order to properly pass the cookies to playwright,
538
451
  # each of then must have a name, a value and either a domain + path or a URL
539
452
  # Name and value are mandatory, and we cannot auto-fill them.
540
453
  # If the cookie doesn't have a domain + path OR a URL, we fill the domain
541
454
  # with the hostname of the URL we try to capture and the path with "/"
542
- for cookie in to_capture['cookies']:
455
+ # NOTE: these changes can only be done here because we need the URL.
456
+ for cookie in to_capture.cookies:
543
457
  if len(cookie) == 1:
544
458
  # we have a cookie in the format key: value
545
459
  name, value = cookie.popitem()
@@ -557,21 +471,21 @@ class LacusCore():
557
471
  stats_pipeline.sadd(f'stats:{today}:captures', url)
558
472
  async with Capture(
559
473
  browser=browser_engine,
560
- device_name=to_capture.get('device_name'),
474
+ device_name=to_capture.device_name,
561
475
  proxy=proxy,
562
- general_timeout_in_sec=to_capture.get('general_timeout_in_sec'),
476
+ general_timeout_in_sec=to_capture.general_timeout_in_sec,
563
477
  loglevel=self.master_logger.getEffectiveLevel(),
564
478
  uuid=uuid) as capture:
565
479
  # required by Mypy: https://github.com/python/mypy/issues/3004
566
- capture.headers = to_capture.get('headers') # type: ignore[assignment]
480
+ capture.headers = to_capture.headers # type: ignore[assignment]
567
481
  capture.cookies = cookies # type: ignore[assignment]
568
- capture.viewport = to_capture.get('viewport') # type: ignore[assignment]
569
- capture.user_agent = to_capture.get('user_agent') # type: ignore[assignment]
570
- capture.http_credentials = to_capture.get('http_credentials') # type: ignore[assignment]
571
- capture.geolocation = to_capture.get('geolocation') # type: ignore[assignment]
572
- capture.timezone_id = to_capture.get('timezone_id') # type: ignore[assignment]
573
- capture.locale = to_capture.get('locale') # type: ignore[assignment]
574
- capture.color_scheme = to_capture.get('color_scheme') # type: ignore[assignment]
482
+ capture.viewport = to_capture.viewport # type: ignore[assignment]
483
+ capture.user_agent = to_capture.user_agent # type: ignore[assignment]
484
+ capture.http_credentials = to_capture.http_credentials # type: ignore[assignment]
485
+ capture.geolocation = to_capture.geolocation # type: ignore[assignment]
486
+ capture.timezone_id = to_capture.timezone_id # type: ignore[assignment]
487
+ capture.locale = to_capture.locale # type: ignore[assignment]
488
+ capture.color_scheme = to_capture.color_scheme # type: ignore[assignment]
575
489
 
576
490
  # make sure the initialization doesn't take too long
577
491
  init_timeout = max(self.max_capture_time / 10, 5)
@@ -586,11 +500,11 @@ class LacusCore():
586
500
  try:
587
501
  async with timeout(self.max_capture_time) as capture_timeout:
588
502
  playwright_result = await capture.capture_page(
589
- url, referer=to_capture.get('referer'),
590
- depth=to_capture.get('depth', 0),
591
- rendered_hostname_only=to_capture.get('rendered_hostname_only', True),
592
- with_favicon=to_capture.get('with_favicon', False),
593
- allow_tracking=to_capture.get('allow_tracking', False),
503
+ url, referer=to_capture.referer,
504
+ depth=to_capture.depth,
505
+ rendered_hostname_only=to_capture.rendered_hostname_only,
506
+ with_favicon=to_capture.with_favicon,
507
+ allow_tracking=to_capture.allow_tracking,
594
508
  max_depth_capture_time=self.max_capture_time)
595
509
  except (TimeoutError, asyncio.exceptions.TimeoutError):
596
510
  timeout_expired(capture_timeout, logger, 'Capture took too long.')
@@ -603,7 +517,7 @@ class LacusCore():
603
517
  stats_pipeline.zincrby(f'stats:{today}:errors', 1, result['error_name'])
604
518
  except RetryCapture as e:
605
519
  raise e
606
- except PlaywrightCaptureException as e:
520
+ except (PlaywrightCaptureException, InvalidPlaywrightParameter) as e:
607
521
  logger.warning(f'Invalid parameters for the capture of {url} - {e}')
608
522
  result = {'error': f'Invalid parameters for the capture of {url} - {e}'}
609
523
  raise CaptureError(f'Invalid parameters for the capture of {url} - {e}')
@@ -669,7 +583,7 @@ class LacusCore():
669
583
  # from the lacus:ongoing sorted set (it is definitely not ongoing anymore)
670
584
  # and optionally re-added to lacus:to_capture if we want to retry it
671
585
 
672
- if to_capture.get('document'):
586
+ if to_capture.document:
673
587
  os.unlink(tmp_f.name)
674
588
 
675
589
  if retry:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.10.0
3
+ Version: 1.10.2
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
@@ -28,7 +28,8 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
28
28
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
29
29
  Requires-Dist: defang (>=0.5.3,<0.6.0)
30
30
  Requires-Dist: dnspython (>=2.6.1,<3.0.0)
31
- Requires-Dist: playwrightcapture[recaptcha] (>=1.25.0,<2.0.0)
31
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.25.6,<2.0.0)
32
+ Requires-Dist: pydantic (>=2.8.2,<3.0.0)
32
33
  Requires-Dist: redis[hiredis] (>=5.0.7,<6.0.0)
33
34
  Requires-Dist: requests (>=2.32.3,<3.0.0)
34
35
  Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
@@ -0,0 +1,10 @@
1
+ lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
+ lacuscore/helpers.py,sha256=y7LWHUMMOrB9-qWNoz_2zTP_yazf_opxXykhYjpNves,9640
3
+ lacuscore/lacus_monitoring.py,sha256=UOfE_1-_rhVeKJXQ_m9XxYkr7VwyQnA6iK-x_tcXJfo,2775
4
+ lacuscore/lacuscore.py,sha256=eQMJBf0ZYkCkACu3OWCdidpNEYn0wZ1piIVHP1kv_kU,39372
5
+ lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ lacuscore/task_logger.py,sha256=8WbdJdKnGeFCxt9gtCNLI9vAQQZbsy2I5PRQpHP7XFU,1916
7
+ lacuscore-1.10.2.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
+ lacuscore-1.10.2.dist-info/METADATA,sha256=xzU2dgvzO_avbhZm4cdy0DWoWR-R9IZwmLocuKBZjQg,2670
9
+ lacuscore-1.10.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
10
+ lacuscore-1.10.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- lacuscore/__init__.py,sha256=hM4lKoPNybDCUMWdXTVVI1gRk_riLvRZ7IwFbamZLzE,341
2
- lacuscore/helpers.py,sha256=lULN7HhY-4a4HG-ybIt4jO3wEGTxkm_jKNqsGpNZo4Y,2711
3
- lacuscore/lacus_monitoring.py,sha256=UOfE_1-_rhVeKJXQ_m9XxYkr7VwyQnA6iK-x_tcXJfo,2775
4
- lacuscore/lacuscore.py,sha256=n0hjyuJpz5G4s5mRzTAtSCHaPtPqdMaTEO7o-2N88JI,44048
5
- lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- lacuscore/task_logger.py,sha256=8WbdJdKnGeFCxt9gtCNLI9vAQQZbsy2I5PRQpHP7XFU,1916
7
- lacuscore-1.10.0.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
- lacuscore-1.10.0.dist-info/METADATA,sha256=PmC32J2hEc1hyXiX4wQOjAcnJDLlVIqvj-Yw8pJc13Q,2629
9
- lacuscore-1.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
10
- lacuscore-1.10.0.dist-info/RECORD,,