crawlee 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_request.py +1 -1
- crawlee/_types.py +20 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +8 -2
- crawlee/crawlers/_basic/_basic_crawler.py +62 -46
- crawlee/crawlers/_playwright/_playwright_crawler.py +6 -3
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
- {crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/METADATA +1 -1
- {crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/RECORD +13 -13
- {crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/WHEEL +1 -1
- {crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/licenses/LICENSE +0 -0
crawlee/_request.py
CHANGED
crawlee/_types.py
CHANGED

@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -643,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
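
The new BasicCrawlingContext.create_modified_copy helper copies every dataclass field and overrides only the helpers that were explicitly passed in. A minimal standalone sketch of that copy-with-overrides pattern; the Context class below is a toy stand-in rather than Crawlee's real BasicCrawlingContext:

    import dataclasses


    @dataclasses.dataclass(frozen=True)
    class Context:
        url: str
        push_data: object
        add_requests: object

        def create_modified_copy(self, push_data: object | None = None, add_requests: object | None = None) -> 'Context':
            # Copy all existing fields, then override only the helpers that were provided.
            original = {f.name: getattr(self, f.name) for f in dataclasses.fields(self)}
            overrides = {k: v for k, v in {'push_data': push_data, 'add_requests': add_requests}.items() if v}
            return self.__class__(**{**original, **overrides})


    ctx = Context(url='https://example.com', push_data='buffered', add_requests='buffered')
    print(ctx.create_modified_copy(push_data='direct-to-storage'))
    # Context(url='https://example.com', push_data='direct-to-storage', add_requests='buffered')

Crawlee 1.1.1 uses this to hand error handlers a context whose helpers write straight to the storages (see the _basic_crawler.py changes below).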

crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED

@@ -167,9 +167,15 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
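
Honoring a <base href="..."> element changes which absolute URLs the extracted relative links resolve to. A small illustration with urllib.parse.urljoin; the page URL and base href are made up:

    from urllib.parse import urljoin

    page_url = 'https://example.com/articles/page-1'
    base_href = 'https://example.com/'  # value of <base href="..."> in the document head

    # Resolved against the request URL, the relative link stays under /articles/.
    print(urljoin(page_url, 'page-2'))   # https://example.com/articles/page-2

    # Resolved against the declared <base>, it points to the site root instead.
    print(urljoin(base_href, 'page-2'))  # https://example.com/page-2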

crawlee/crawlers/_basic/_basic_crawler.py
CHANGED

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -14,7 +15,7 @@ from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -96,6 +97,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +524,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
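
Because error_handler and failed_request_handler now wrap the user's callback with _wrap_handler_with_error_context, helpers such as push_data called from those callbacks write directly to storage instead of into a result buffer that is discarded for a failed request. A sketch of how this surfaces to user code, assuming crawlee 1.1.1 is installed; the URL and handler bodies are only illustrative:

    import asyncio

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


    async def main() -> None:
        crawler = BasicCrawler(max_request_retries=1)

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            raise RuntimeError('Simulated handler failure')

        @crawler.failed_request_handler
        async def failed(context: BasicCrawlingContext, error: Exception) -> None:
            # With 1.1.1 this record reaches the dataset even though the request failed,
            # because the context helper now talks to the storage directly.
            await context.push_data({'url': context.request.url, 'error': str(error)})

        await crawler.run(['https://example.com'])


    if __name__ == '__main__':
        asyncio.run(main())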

@@ -618,7 +640,7 @@
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -628,7 +650,7 @@
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
@@ -1043,8 +1065,8 @@
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1256,52 +1278,46 @@
         else:
             yield Request.from_url(url)
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
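
For the 'same-domain' enqueue strategy, two URLs now compare equal when they share the registrable domain reported by tldextract's top_domain_under_public_suffix. A small sketch of that comparison, assuming a tldextract version that exposes the attribute; the URLs are examples:

    import tldextract

    extractor = tldextract.TLDExtract()

    origin = extractor.extract_str('https://blog.example.co.uk/post')
    target = extractor.extract_str('https://shop.example.co.uk/item')

    # Both hostnames share the registrable domain "example.co.uk", so a link from
    # origin to target would be allowed under the 'same-domain' strategy.
    print(origin.top_domain_under_public_suffix)  # example.co.uk
    print(origin.top_domain_under_public_suffix == target.top_domain_under_public_suffix)  # True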

crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED

@@ -369,9 +369,12 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
-        )
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
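
In the browser the same information comes from document.baseURI, which already reflects any <base> element. A standalone Playwright sketch, assuming Playwright and its Chromium build are installed; the inline HTML is an example:

    import asyncio
    from urllib.parse import urljoin

    from playwright.async_api import async_playwright


    async def main() -> None:
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch()
            page = await browser.new_page()
            await page.set_content('<base href="https://cdn.example.com/assets/"><a href="app.js">app</a>')

            # document.baseURI honours the <base> element, not the page URL.
            base_url = await page.evaluate('document.baseURI')
            print(urljoin(base_url, 'app.js'))  # https://cdn.example.com/assets/app.js

            await browser.close()


    asyncio.run(main())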

crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED

@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):
 
         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:

crawlee/storage_clients/_file_system/_key_value_store_client.py
CHANGED

@@ -133,7 +133,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +162,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +239,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath)
+                file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None

crawlee/storage_clients/_file_system/_request_queue_client.py
CHANGED

@@ -197,7 +197,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -232,7 +232,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -775,7 +775,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
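
All of the file-system storage client changes apply the same fix: metadata and request files are now opened with an explicit text mode and UTF-8 encoding, so reads no longer depend on the platform's locale default (for example cp1252 on some Windows setups). A minimal sketch of the pattern, with a made-up path:

    import asyncio
    import json


    async def read_metadata(path: str) -> dict:
        # Passing mode and encoding explicitly keeps JSON metadata readable
        # regardless of the interpreter's locale-dependent default encoding.
        file = await asyncio.to_thread(open, path, 'r', encoding='utf-8')
        try:
            return json.load(file)
        finally:
            file.close()


    # Example usage, assuming ./metadata.json exists:
    # print(asyncio.run(read_metadata('metadata.json')))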

{crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/RECORD
CHANGED

@@ -3,9 +3,9 @@ crawlee/_browserforge_workaround.py,sha256=FYQaqpqfZGYkx-A8evF9nsHnj4KK4IMtjNq3L
 crawlee/_cli.py,sha256=czuEsGD8QYEiq5gtMcBxrL08hQ5OJQQkMVhAr1pvDaQ,10353
 crawlee/_consts.py,sha256=RQ96gx7V-WPH91cVsMUz76X5UZUNDNhCudtlyGkxFVk,133
 crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
-crawlee/_request.py,sha256=
+crawlee/_request.py,sha256=fnUofyFMV3HJwfcLjYr2BCZ5K9mEwl6vZd8Pr309wCE,16458
 crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
-crawlee/_types.py,sha256=
+crawlee/_types.py,sha256=93yoGr_KqMDIkq3__3QYpIAJmEzZvDoilHAF7_X4J4A,29933
 crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
 crawlee/errors.py,sha256=RhFNA_uT615nVBHf9TylpX5YWwtDuHUUEV8LPT4CYa4,3878
 crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386
@@ -53,7 +53,7 @@ crawlee/crawlers/__init__.py,sha256=9VmFahav3rjE-2Bxa5PAhBgkYXP0k5SSAEpdG2xMZ7c,
 crawlee/crawlers/_types.py,sha256=xbGTJQirgz5wUbfr12afMR4q-_5AWP7ngF2e8K5P8l0,355
 crawlee/crawlers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/crawlers/_abstract_http/__init__.py,sha256=QCjn8x7jpo8FwEeSRw10TVj_0La2v9mLEiQWdk2RoTw,273
-crawlee/crawlers/_abstract_http/_abstract_http_crawler.py,sha256=
+crawlee/crawlers/_abstract_http/_abstract_http_crawler.py,sha256=wFErk_Mg02vlgX_jV5GohmZs2qVZt3wy1yBKrU0lbAc,11829
 crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5GFqkUxscwKEFpWrBYRsLKP1cfBwE,3521
 crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
 crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
 crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
 crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
 crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
-crawlee/crawlers/_basic/_basic_crawler.py,sha256
+crawlee/crawlers/_basic/_basic_crawler.py,sha256=9N8qPh-k2b6_oywL5taMV-kfMlcs_Q7NgwzISQjzX2I,73869
 crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
 crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
 crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
@@ -85,7 +85,7 @@ crawlee/crawlers/_parsel/_parsel_crawling_context.py,sha256=sZB26RcRLjSoD15myEOM
 crawlee/crawlers/_parsel/_parsel_parser.py,sha256=yWBfuXUHMriK4DRnyrXTQoGeqX5WV9bOEkBp_g0YCvQ,1540
 crawlee/crawlers/_parsel/_utils.py,sha256=MbRwx-cdjlq1zLzFYf64M3spOGQ6yxum4FvP0sdqA_Q,2693
 crawlee/crawlers/_playwright/__init__.py,sha256=6Cahe6VEF82o8CYiP8Cmp58Cmb6Rb8uMeyy7wnwe5ms,837
-crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=
+crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=2ONpyYRjX31YmgFhx2pVkt-SBXZCS-HawAtsG99fwYc,24324
 crawlee/crawlers/_playwright/_playwright_crawling_context.py,sha256=Oi0tMBXHaEDlFjqG01DzgB7Ck52bjVjz-X__eMioxas,1249
 crawlee/crawlers/_playwright/_playwright_http_client.py,sha256=Nfm69dqX85k68jN1p3ljZWbn8egqDWPIPRykXyXsoQs,3977
 crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2laWhmJdWiGoMF5JBLBsim9NtENfagZt6FFd2Rgo,1387
@@ -159,9 +159,9 @@ crawlee/storage_clients/_base/_request_queue_client.py,sha256=cgM4yk6xJwgfzP-xaN
 crawlee/storage_clients/_base/_storage_client.py,sha256=RvmKCV1U9_KxyG7n8xhClm2vwD2SKChWIiBLk6cuqw0,3523
 crawlee/storage_clients/_base/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/storage_clients/_file_system/__init__.py,sha256=w3twfwz5YeLYeu_70pNPBRINS2wXRvzOMvA1hUDYgf0,387
-crawlee/storage_clients/_file_system/_dataset_client.py,sha256=
-crawlee/storage_clients/_file_system/_key_value_store_client.py,sha256=
-crawlee/storage_clients/_file_system/_request_queue_client.py,sha256=
+crawlee/storage_clients/_file_system/_dataset_client.py,sha256=DTRYlm37VV7FuowenG0JoqiQdH5AMg9G0O1PPJJO-u0,17781
+crawlee/storage_clients/_file_system/_key_value_store_client.py,sha256=zPXCKPm6w8UYLYwSOuAoc4uoFswJjTAoWMulucvFBiI,18745
+crawlee/storage_clients/_file_system/_request_queue_client.py,sha256=3dn9DM750ftuUzDCp_Uj56tNakYb93nhmeSo2LjPeV0,34039
 crawlee/storage_clients/_file_system/_storage_client.py,sha256=My63uc513kfUPe5X-PTYWBRe9xUGnkLqJN7IcsQd2yw,3293
 crawlee/storage_clients/_file_system/_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/storage_clients/_file_system/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.1.
-crawlee-1.1.
-crawlee-1.1.
-crawlee-1.1.
-crawlee-1.1.
+crawlee-1.1.1.dist-info/METADATA,sha256=6tizu9idfwDdUbSajQ-pM0nSUcC0B7GrR0fqlpxSku4,29530
+crawlee-1.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+crawlee-1.1.1.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.1.1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.1.1.dist-info/RECORD,,

{crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/entry_points.txt
File without changes

{crawlee-1.1.0.dist-info → crawlee-1.1.1.dist-info}/licenses/LICENSE
File without changes