crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +62 -32
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +52 -19
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +160 -134
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
- crawlee/storage_clients/_memory/_dataset_client.py +2 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_dataset_client.py +2 -2
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
- crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +3 -0
- crawlee/storages/_key_value_store.py +8 -2
- crawlee/storages/_request_queue.py +3 -0
- crawlee/storages/_storage_instance_manager.py +109 -42
- crawlee/storages/_utils.py +11 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_storage_client.py
ADDED

@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import warnings
+from typing import Literal
+
+from redis.asyncio import Redis
+from typing_extensions import override
+
+from crawlee._utils.docs import docs_group
+from crawlee.configuration import Configuration
+from crawlee.storage_clients._base import StorageClient
+
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+
+
+@docs_group('Storage clients')
+class RedisStorageClient(StorageClient):
+    """Redis implementation of the storage client.
+
+    This storage client provides access to datasets, key-value stores, and request queues that persist data
+    to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for
+    efficient storage and retrieval.
+
+    The client accepts either a Redis connection string or a pre-configured Redis client instance.
+    Exactly one of these parameters must be provided during initialization.
+
+    Storage types use the following Redis data structures:
+    - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects
+    - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage
+    - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,
+      and Bloom filters for request deduplication
+
+    Warning:
+        This is an experimental feature. The behavior and interface may change in future versions.
+    """
+
+    def __init__(
+        self,
+        *,
+        connection_string: str | None = None,
+        redis: Redis | None = None,
+        queue_dedup_strategy: Literal['default', 'bloom'] = 'default',
+        queue_bloom_error_rate: float = 1e-7,
+    ) -> None:
+        """Initialize the Redis storage client.
+
+        Args:
+            connection_string: Redis connection string (e.g., "redis://localhost:6379").
+                Supports standard Redis URL format with optional database selection.
+            redis: Pre-configured Redis client instance.
+            queue_dedup_strategy: Strategy for request queue deduplication. Options are:
+                - 'default': Uses Redis sets for exact deduplication.
+                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using
+                  this approach, approximately 1 in 1e-7 requests will be falsely considered duplicate.
+            queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
+                `queue_dedup_strategy` is set to 'bloom'.
+        """
+        if redis is None and connection_string is None:
+            raise ValueError('Either redis or connection_string must be provided.')
+
+        if redis is not None and connection_string is not None:
+            raise ValueError('Either redis or connection_string must be provided, not both.')
+
+        if isinstance(redis, Redis) and connection_string is None:
+            self._redis = redis
+
+        if isinstance(connection_string, str) and redis is None:
+            self._redis = Redis.from_url(connection_string)
+
+        self._redis: Redis  # to help type checker
+        self._queue_dedup_strategy = queue_dedup_strategy
+        self._queue_bloom_error_rate = queue_bloom_error_rate
+
+        # Call the notification only once
+        warnings.warn(
+            (
+                'RedisStorageClient is experimental and its API, behavior, and key structure may change in future '
+                'releases.'
+            ),
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+    @override
+    async def create_dataset_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> RedisDatasetClient:
+        configuration = configuration or Configuration.get_global_configuration()
+
+        client = await RedisDatasetClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=self._redis,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    @override
+    async def create_kvs_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> RedisKeyValueStoreClient:
+        configuration = configuration or Configuration.get_global_configuration()
+
+        client = await RedisKeyValueStoreClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=self._redis,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    @override
+    async def create_rq_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> RedisRequestQueueClient:
+        configuration = configuration or Configuration.get_global_configuration()
+
+        client = await RedisRequestQueueClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=self._redis,
+            dedup_strategy=self._queue_dedup_strategy,
+            bloom_error_rate=self._queue_bloom_error_rate,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
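For orientation, a minimal usage sketch. It assumes `RedisStorageClient` is re-exported from `crawlee.storage_clients` (this diff also extends that package's `__init__.py`) and that a Redis 8+ instance is reachable locally:

```python
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient  # assumed re-export


async def main() -> None:
    # Exactly one of `connection_string` or `redis` may be passed.
    storage_client = RedisStorageClient(
        connection_string='redis://localhost:6379',
        queue_dedup_strategy='bloom',  # probabilistic dedup; 'default' keeps exact Redis sets
    )
    # Route all subsequently opened datasets, key-value stores, and request queues to Redis.
    service_locator.set_storage_client(storage_client)


asyncio.run(main())
```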
crawlee/storage_clients/_redis/_utils.py
ADDED

@@ -0,0 +1,23 @@
+from collections.abc import Awaitable
+from pathlib import Path
+from typing import TypeVar, overload
+
+T = TypeVar('T')
+
+
+@overload
+async def await_redis_response(response: Awaitable[T]) -> T: ...
+@overload
+async def await_redis_response(response: T) -> T: ...
+
+
+async def await_redis_response(response: Awaitable[T] | T) -> T:
+    """Solve the problem of ambiguous typing for redis."""
+    return await response if isinstance(response, Awaitable) else response
+
+
+def read_lua_script(script_name: str) -> str:
+    """Read a Lua script from a file."""
+    file_path = Path(__file__).parent / 'lua_scripts' / script_name
+    with file_path.open(mode='r', encoding='utf-8') as file:
+        return file.read()
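The `await_redis_response` overloads exist because redis-py shares method annotations between its sync and async clients, so many `redis.asyncio` calls are typed as returning `Awaitable[T] | T`. A small illustration (hash and field names are made up):

```python
from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import await_redis_response  # private module, imported for illustration

redis = Redis.from_url('redis://localhost:6379')


async def example() -> None:
    # A bare `await redis.hget(...)` trips strict type checkers on the
    # `Awaitable[...] | ...` union; the helper collapses both cases to `T`.
    value = await await_redis_response(redis.hget('demo-hash', 'demo-field'))
    print(value)
```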
crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua
ADDED

@@ -0,0 +1,36 @@
+local added_filter_key = KEYS[1]
+local queue_key = KEYS[2]
+local data_key = KEYS[3]
+
+local forefront = ARGV[1] == '1'
+local unique_keys = cjson.decode(ARGV[2])
+local requests_data = cjson.decode(ARGV[3])
+
+-- Add and check which unique keys are actually new using Bloom filter
+local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys))
+
+local actually_added = {}
+local hset_args = {}
+
+-- Process the results
+for i, unique_key in ipairs(unique_keys) do
+    if bf_results[i] == 1 then
+        -- This key was added by us (did not exist before)
+        table.insert(hset_args, unique_key)
+        table.insert(hset_args, requests_data[unique_key])
+        table.insert(actually_added, unique_key)
+    end
+end
+
+-- Add only those that are actually new
+if #actually_added > 0 then
+    redis.call('hset', data_key, unpack(hset_args))
+
+    if forefront then
+        redis.call('lpush', queue_key, unpack(actually_added))
+    else
+        redis.call('rpush', queue_key, unpack(actually_added))
+    end
+end
+
+return cjson.encode(actually_added)
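A sketch of how such a script can be loaded and invoked with redis-py's `register_script` (which wraps `SCRIPT LOAD`/`EVALSHA`). The key names and payload shapes below are assumptions read off the script's `KEYS`/`ARGV` usage rather than the client's actual internals; `bf.madd` additionally requires the Bloom filter module shipped with Redis 8 / Redis Stack:

```python
import json

from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import read_lua_script  # private module, imported for illustration

redis = Redis.from_url('redis://localhost:6379')
add_requests = redis.register_script(read_lua_script('atomic_bloom_add_requests.lua'))


async def example() -> None:
    requests_data = {'https://example.com/': '{"url": "https://example.com/"}'}  # unique_key -> request JSON
    added = await add_requests(
        keys=['rq:demo:added', 'rq:demo:queue', 'rq:demo:data'],  # hypothetical key names
        args=['0', json.dumps(list(requests_data)), json.dumps(requests_data)],  # forefront flag, keys, data
    )
    print(json.loads(added))  # JSON array of the unique keys that were newly enqueued
```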
crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua
ADDED

@@ -0,0 +1,49 @@
+local queue_key = KEYS[1]
+local in_progress_key = KEYS[2]
+local data_key = KEYS[3]
+local client_id = ARGV[1]
+local blocked_until_timestamp = ARGV[2]
+local batch_size = tonumber(ARGV[3])
+
+-- Pop batch unique_key from queue
+local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)
+if not batch_result then
+    return nil
+end
+local unique_keys = batch_result[2]
+
+-- Get requests data
+local requests_data = redis.call('HMGET', data_key, unpack(unique_keys))
+if not requests_data then
+    -- Data missing, skip this request
+    return nil
+end
+
+-- Prepare results and update in_progress
+local final_result = {}
+local in_progress_hmset = {}
+local pending_decrement = 0
+local in_progress_data = cjson.encode({
+    client_id = client_id,
+    blocked_until_timestamp = tonumber(blocked_until_timestamp)
+})
+for i = 1, #unique_keys do
+    local unique_key = unique_keys[i]
+    local request_data = requests_data[i]
+
+    if request_data then
+        -- Add to in_progress hash
+        table.insert(in_progress_hmset, unique_key)
+        table.insert(in_progress_hmset, in_progress_data)
+
+        table.insert(final_result, request_data)
+    end
+end
+
+-- Update in_progress hash
+if #in_progress_hmset > 0 then
+    redis.call('HMSET', in_progress_key, unpack(in_progress_hmset))
+end
+
+-- Return result with requests data
+return final_result
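This script atomically moves a batch from the pending list into the in-progress hash, stamping each entry with the claiming client and a lease deadline. Reusing the `redis` client and `read_lua_script` from the sketch above, an invocation might look like this (key names, client id, and lease duration are illustrative):

```python
import json
import time

fetch_requests = redis.register_script(read_lua_script('atomic_fetch_request.lua'))


async def fetch_batch() -> list[dict]:
    blocked_until = time.time() + 300  # lease the batch for five minutes
    raw = await fetch_requests(
        keys=['rq:demo:queue', 'rq:demo:in_progress', 'rq:demo:data'],
        args=['client-1', str(blocked_until), '25'],  # client id, lease deadline, batch size
    )
    return [json.loads(item) for item in raw or []]
```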
crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua
ADDED

@@ -0,0 +1,37 @@
+local added_filter_key = KEYS[1]
+local queue_key = KEYS[2]
+local data_key = KEYS[3]
+
+local forefront = ARGV[1] == '1'
+local unique_keys = cjson.decode(ARGV[2])
+local requests_data = cjson.decode(ARGV[3])
+
+-- Add and check which unique keys are actually new using Redis set
+local actually_added = {}
+local hset_args = {}
+
+-- Process each unique key
+for _, unique_key in ipairs(unique_keys) do
+    -- Try to add the key to the set, returns 1 if added, 0 if already existed
+    local set_result = redis.call('sadd', added_filter_key, unique_key)
+
+    if set_result == 1 then
+        -- This key was added by us (did not exist before)
+        table.insert(hset_args, unique_key)
+        table.insert(hset_args, requests_data[unique_key])
+        table.insert(actually_added, unique_key)
+    end
+end
+
+-- Add only those that are actually new
+if #actually_added > 0 then
+    redis.call('hset', data_key, unpack(hset_args))
+
+    if forefront then
+        redis.call('lpush', queue_key, unpack(actually_added))
+    else
+        redis.call('rpush', queue_key, unpack(actually_added))
+    end
+end
+
+return cjson.encode(actually_added)
crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua
ADDED

@@ -0,0 +1,34 @@
+local in_progress_key = KEYS[1]
+local queue_key = KEYS[2]
+local data_key = KEYS[3]
+local current_time = tonumber(ARGV[1])
+
+local max_reclaim = 1000
+
+local cursor = "0"
+local count = 0
+
+repeat
+    local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100)
+    cursor = result[1]
+    local entries = result[2]
+
+    for i = 1, #entries, 2 do
+        if count >= max_reclaim then
+            break
+        end
+
+        local unique_key = entries[i]
+        local data = cjson.decode(entries[i + 1])
+
+        -- Check if timed out
+        if current_time > data.blocked_until_timestamp then
+            -- Atomically remove from in_progress and add back to queue
+            redis.call('hdel', in_progress_key, unique_key)
+            redis.call('rpush', queue_key, unique_key)
+            count = count + 1
+        end
+    end
+until cursor == "0" or count >= max_reclaim
+
+return count
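Leases only expire if something runs this reclaim script periodically; the changes to `crawlee/_utils/recurring_task.py` elsewhere in this diff suggest a recurring task drives it. A minimal stand-in using plain asyncio, again reusing the earlier sketch's `redis` client, with all names and the interval assumed:

```python
import asyncio
import time

reclaim_stale = redis.register_script(read_lua_script('reclaim_stale_requests.lua'))


async def reclaim_loop() -> None:
    while True:
        # Entries whose `blocked_until_timestamp` lease has passed are pushed back
        # onto the queue; the script caps each pass at 1000 reclaimed requests.
        reclaimed = await reclaim_stale(
            keys=['rq:demo:in_progress', 'rq:demo:queue', 'rq:demo:data'],
            args=[str(time.time())],
        )
        if reclaimed:
            print(f'Reclaimed {reclaimed} stale requests')
        await asyncio.sleep(60)
```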
crawlee/storage_clients/_redis/py.typed
File without changes
crawlee/storage_clients/_sql/_client_mixin.py
CHANGED

@@ -105,7 +105,7 @@ class SqlClientMixin(ABC):
             else:
                 stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
                 result = await session.execute(stmt)
-                orm_metadata = result.scalar_one_or_none()
+                orm_metadata = result.scalar_one_or_none()
 
             if orm_metadata:
                 client = cls(id=orm_metadata.id, storage_client=storage_client)
crawlee/storage_clients/_sql/_dataset_client.py
CHANGED

@@ -4,7 +4,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any
 
 from sqlalchemy import Select, insert, select
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
@@ -78,7 +78,7 @@ class SqlDatasetClient(DatasetClient, SqlClientMixin):
         name: str | None,
         alias: str | None,
         storage_client: SqlStorageClient,
-    ) ->
+    ) -> Self:
         """Open an existing dataset or create a new one.
 
         Args:
crawlee/storage_clients/_sql/_db_models.py
CHANGED

@@ -205,9 +205,8 @@ class RequestDb(Base):
             'idx_fetch_available',
             'request_queue_id',
             'is_handled',
-            'time_blocked_until',
             'sequence_number',
-            postgresql_where=text('is_handled
+            postgresql_where=text('is_handled is false'),
         ),
     )
 
crawlee/storage_clients/_sql/_key_value_store_client.py
CHANGED

@@ -2,10 +2,10 @@ from __future__ import annotations
 
 import json
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
-from sqlalchemy import delete, select
-from typing_extensions import override
+from sqlalchemy import CursorResult, delete, select
+from typing_extensions import Self, override
 
 from crawlee._utils.file import infer_mime_type
 from crawlee.storage_clients._base import KeyValueStoreClient
@@ -77,7 +77,7 @@ class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
         name: str | None,
         alias: str | None,
         storage_client: SqlStorageClient,
-    ) ->
+    ) -> Self:
         """Open or create a SQL key-value store client.
 
         This method attempts to open an existing key-value store from the SQL database. If a KVS with the specified
@@ -227,6 +227,7 @@ class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
         async with self.get_session(with_simple_commit=True) as session:
             # Delete the record if it exists
             result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
 
             # Update metadata if we actually deleted something
             if result.rowcount > 0:
crawlee/storage_clients/_sql/_request_queue_client.py
CHANGED

@@ -5,12 +5,12 @@ from datetime import datetime, timedelta, timezone
 from functools import lru_cache
 from hashlib import sha256
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
-from sqlalchemy import func, or_, select, update
+from sqlalchemy import CursorResult, func, or_, select, update
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.orm import load_only
-from typing_extensions import NotRequired, override
+from typing_extensions import NotRequired, Self, override
 
 from crawlee import Request
 from crawlee._utils.crypto import crypto_random_object_id
@@ -119,7 +119,7 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
         name: str | None,
         alias: str | None,
         storage_client: SqlStorageClient,
-    ) ->
+    ) -> Self:
         """Open an existing request queue or create a new one.
 
         This method first tries to find an existing queue by ID or name.
@@ -231,6 +231,7 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
 
         async with self.get_session() as session:
             result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
             existing_requests = {req.request_id: req for req in result.scalars()}
             state = await self._get_state(session)
             insert_values: list[dict] = []
@@ -498,9 +499,12 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
         )
         async with self.get_session() as session:
             result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
             if result.rowcount == 0:
                 logger.warning(f'Request {request.unique_key} not found in database.')
                 return None
+
             await self._update_metadata(
                 session,
                 **_QueueMetadataUpdateParams(
@@ -542,14 +546,24 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
             block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
             # Extend blocking for forefront request, it is considered blocked by the current client.
             stmt = stmt.values(
-                sequence_number=new_sequence,
+                sequence_number=new_sequence,
+                time_blocked_until=block_until,
+                client_key=self.client_key,
+                data=request.model_dump_json(),
             )
         else:
             new_sequence = state.sequence_counter
             state.sequence_counter += 1
-            stmt = stmt.values(
+            stmt = stmt.values(
+                sequence_number=new_sequence,
+                time_blocked_until=None,
+                client_key=None,
+                data=request.model_dump_json(),
+            )
 
         result = await session.execute(stmt)
+        result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
         if result.rowcount == 0:
             logger.warning(f'Request {request.unique_key} not found in database.')
             return None
crawlee/storage_clients/_sql/_storage_client.py
CHANGED

@@ -149,7 +149,7 @@ class SqlStorageClient(StorageClient):
         # Raise an error if the new version creates breaking changes in the database schema.
         if db_version and db_version != __version__:
             warnings.warn(
-                f'Database version {db_version
+                f'Database version {db_version} does not match library version {__version__}. '
                 'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                 'everything will work fine.',
                 category=UserWarning,
crawlee/storage_clients/models.py
CHANGED

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import Annotated, Any, Generic
+from typing import TYPE_CHECKING, Annotated, Any, Generic
 
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import TypeVar
@@ -127,8 +127,13 @@ class DatasetItemsListPage(BaseModel):
     desc: Annotated[bool, Field(default=False)]
     """Indicates if the returned list is in descending order."""
 
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        items: list[dict] = []
+        """The list of dataset items returned on this page."""
+    else:
+        items: Annotated[list[dict], Field(default_factory=list)]
+        """The list of dataset items returned on this page."""
 
 
 @docs_group('Storage data')
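The `TYPE_CHECKING` split above is a common workaround when a Pydantic field combines `Annotated` with `Field(default_factory=...)`: some type checkers otherwise treat the field as required at construction sites. A generic, self-contained illustration of the pattern:

```python
from typing import TYPE_CHECKING, Annotated

from pydantic import BaseModel, Field


class Page(BaseModel):
    if TYPE_CHECKING:
        # What the type checker sees: an optional field with a default.
        items: list[dict] = []
    else:
        # What Pydantic evaluates at runtime: a fresh list per instance.
        items: Annotated[list[dict], Field(default_factory=list)]


print(Page().items)  # prints [] since the field defaults to a fresh list
```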
crawlee/storages/_base.py
CHANGED

@@ -44,7 +44,9 @@ class Storage(ABC):
 
         Args:
             id: The storage ID.
-            name: The storage name (global scope, persists across runs).
+            name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
+                the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
+                (e.g. "my-value-1").
             alias: The storage alias (run scope, creates unnamed storage).
             configuration: Configuration object used during the storage creation or restoration process.
             storage_client: Underlying storage client to use. If not provided, the default global storage client
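The constructors below enforce this rule through `validate_storage_name` from the new `crawlee/storages/_utils.py` (+11 lines in this diff). Its body is not shown here; a plausible sketch under that assumption, whose exact pattern may differ from the real helper:

```python
import re

# Letters and digits, with hyphens allowed only in the middle, e.g. 'my-value-1'.
_STORAGE_NAME_RE = re.compile(r'[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?')


def validate_storage_name(name: str | None) -> None:
    """Raise if a provided storage name violates the documented pattern (sketch)."""
    if name is not None and not _STORAGE_NAME_RE.fullmatch(name):
        raise ValueError(
            f'Invalid storage name "{name}": only letters, digits, and mid-string hyphens are allowed.'
        )
```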
crawlee/storages/_dataset.py
CHANGED

@@ -12,6 +12,7 @@ from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
 
 from ._base import Storage
 from ._key_value_store import KeyValueStore
+from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -75,6 +76,8 @@ class Dataset(Storage):
         id: The unique identifier of the storage.
         name: The name of the storage, if available.
         """
+        validate_storage_name(name)
+
         self._client = client
         self._id = id
         self._name = name
crawlee/storages/_key_value_store.py
CHANGED

@@ -15,6 +15,7 @@ from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients.models import KeyValueStoreMetadata
 
 from ._base import Storage
+from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -84,6 +85,8 @@ class KeyValueStore(Storage):
         id: The unique identifier of the storage.
         name: The name of the storage, if available.
         """
+        validate_storage_name(name)
+
         self._client = client
         self._id = id
         self._name = name
@@ -278,11 +281,14 @@ class KeyValueStore(Storage):
         if key in cache:
             return cache[key].current_value.root
 
+        async def kvs_factory() -> KeyValueStore:
+            return self
+
         cache[key] = recoverable_state = RecoverableState(
             default_state=AutosavedValue(default_value),
-            persistence_enabled=True,
-            persist_state_kvs_id=self.id,
             persist_state_key=key,
+            persistence_enabled=True,
+            persist_state_kvs_factory=kvs_factory,
             logger=logger,
         )
 
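The factory indirection lets `RecoverableState` resolve its backing key-value store lazily instead of re-opening it by ID. Assuming this hunk sits in `KeyValueStore.get_auto_saved_value`, which caches one `RecoverableState` per key, the user-facing behavior is unchanged; a brief usage sketch with an illustrative payload:

```python
from crawlee.storages import KeyValueStore


async def example() -> None:
    kvs = await KeyValueStore.open()
    # The returned mutable dict is persisted back into this store under the given
    # key at regular intervals and restored from it on the next run.
    state = await kvs.get_auto_saved_value('demo-state', default_value={'pages': 0})
    state['pages'] += 1
```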
crawlee/storages/_request_queue.py
CHANGED

@@ -13,6 +13,7 @@ from crawlee._utils.wait import wait_for_all_tasks_for_finish
 from crawlee.request_loaders import RequestManager
 
 from ._base import Storage
+from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -80,6 +81,8 @@ class RequestQueue(Storage, RequestManager):
         id: The unique identifier of the storage.
         name: The name of the storage, if available.
         """
+        validate_storage_name(name)
+
         self._client = client
         self._id = id
         self._name = name