crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic. More details are linked from the original diff page.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
@@ -1,21 +1,68 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections.abc import Awaitable, Callable
4
- from typing import TYPE_CHECKING, TypeVar, cast
5
-
3
+ from asyncio import Lock
4
+ from collections import defaultdict
5
+ from collections.abc import Coroutine, Hashable
6
+ from dataclasses import dataclass, field
7
+ from typing import TYPE_CHECKING, TypeVar
8
+ from weakref import WeakValueDictionary
9
+
10
+ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
6
11
  from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
7
12
 
8
- from ._base import Storage
13
+ from ._utils import validate_storage_name
9
14
 
10
15
  if TYPE_CHECKING:
11
- from crawlee.configuration import Configuration
16
+ from ._base import Storage
12
17
 
13
18
  T = TypeVar('T', bound='Storage')
14
19
 
15
- StorageClientType = DatasetClient | KeyValueStoreClient | RequestQueueClient
16
- """Type alias for the storage client types."""
17
20
 
18
- ClientOpener = Callable[..., Awaitable[StorageClientType]]
21
+ @dataclass
22
+ class _StorageCache:
23
+ """Cache for storage instances."""
24
+
25
+ by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
26
+ default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
27
+ )
28
+ """Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']."""
29
+
30
+ by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
31
+ default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
32
+ )
33
+ """Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']"""
34
+
35
+ by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
36
+ default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
37
+ )
38
+ """Cache for storage instances by alias. Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']"""
39
+
40
+ def remove_from_cache(self, storage_instance: Storage) -> None:
41
+ """Remove a storage instance from the cache.
42
+
43
+ Args:
44
+ storage_instance: The storage instance to remove.
45
+ """
46
+ storage_type = type(storage_instance)
47
+
48
+ # Remove from ID cache
49
+ for additional_key in self.by_id[storage_type][storage_instance.id]:
50
+ del self.by_id[storage_type][storage_instance.id][additional_key]
51
+ break
52
+
53
+ # Remove from name cache or alias cache. It can never be in both.
54
+ if storage_instance.name is not None:
55
+ for additional_key in self.by_name[storage_type][storage_instance.name]:
56
+ del self.by_name[storage_type][storage_instance.name][additional_key]
57
+ break
58
+ else:
59
+ for alias_key in self.by_alias[storage_type]:
60
+ for additional_key in self.by_alias[storage_type][alias_key]:
61
+ del self.by_alias[storage_type][alias_key][additional_key]
62
+ break
63
+
64
+
65
+ ClientOpenerCoro = Coroutine[None, None, DatasetClient | KeyValueStoreClient | RequestQueueClient]
19
66
  """Type alias for the client opener function."""
20
67
 
21
68
 
@@ -26,15 +73,12 @@ class StorageInstanceManager:
26
73
  and provides a unified interface for opening and managing storage instances.
27
74
  """
28
75
 
29
- def __init__(self) -> None:
30
- self._cache_by_id = dict[type[Storage], dict[str, Storage]]()
31
- """Cache for storage instances by ID, separated by storage type."""
32
-
33
- self._cache_by_name = dict[type[Storage], dict[str, Storage]]()
34
- """Cache for storage instances by name, separated by storage type."""
76
+ _DEFAULT_STORAGE_ALIAS = '__default__'
77
+ """Reserved alias for default unnamed storage."""
35
78
 
36
- self._default_instances = dict[type[Storage], Storage]()
37
- """Cache for default instances of each storage type."""
79
+ def __init__(self) -> None:
80
+ self._cache: _StorageCache = _StorageCache()
81
+ self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()
38
82
 
39
83
  async def open_storage_instance(
40
84
  self,
@@ -42,66 +86,112 @@ class StorageInstanceManager:
42
86
  *,
43
87
  id: str | None,
44
88
  name: str | None,
45
- configuration: Configuration,
46
- client_opener: ClientOpener,
89
+ alias: str | None,
90
+ client_opener_coro: ClientOpenerCoro,
91
+ storage_client_cache_key: Hashable = '',
47
92
  ) -> T:
48
93
  """Open a storage instance with caching support.
49
94
 
50
95
  Args:
51
96
  cls: The storage class to instantiate.
52
97
  id: Storage ID.
53
- name: Storage name.
54
- configuration: Configuration object.
55
- client_opener: Function to create the storage client.
98
+ name: Storage name. (global scope, persists across runs). Name can only contain letters "a" through "z",
99
+ the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
100
+ (e.g. "my-value-1").
101
+ alias: Storage alias (run scope, creates unnamed storage).
102
+ client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
103
+ storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
56
104
 
57
105
  Returns:
58
106
  The storage instance.
59
107
 
60
108
  Raises:
61
- ValueError: If both id and name are specified.
109
+ ValueError: If multiple parameters out of `id`, `name`, and `alias` are specified.
62
110
  """
63
- if id and name:
64
- raise ValueError('Only one of "id" or "name" can be specified, not both.')
65
-
66
- # Check for default instance
67
- if id is None and name is None and cls in self._default_instances:
68
- return cast('T', self._default_instances[cls])
69
-
70
- # Check cache
71
- if id is not None:
72
- type_cache_by_id = self._cache_by_id.get(cls, {})
73
- if id in type_cache_by_id:
74
- cached_instance = type_cache_by_id[id]
75
- if isinstance(cached_instance, cls):
111
+ try:
112
+ if name == self._DEFAULT_STORAGE_ALIAS:
113
+ raise ValueError(
114
+ f'Storage name cannot be "{self._DEFAULT_STORAGE_ALIAS}" as it is reserved for default alias.'
115
+ )
116
+
117
+ # Validate input parameters.
118
+ raise_if_too_many_kwargs(id=id, name=name, alias=alias)
119
+
120
+ # Auto-set alias='default' when no parameters are specified.
121
+ # Default unnamed storage is equal to alias=default unnamed storage.
122
+ if not any([name, alias, id]):
123
+ alias = self._DEFAULT_STORAGE_ALIAS
124
+
125
+ # Check cache without lock first for performance.
126
+ if cached_instance := self._get_from_cache(
127
+ cls,
128
+ id=id,
129
+ name=name,
130
+ alias=alias,
131
+ storage_client_cache_key=storage_client_cache_key,
132
+ ):
133
+ return cached_instance
134
+
135
+ # Validate storage name
136
+ if name is not None:
137
+ validate_storage_name(name)
138
+
139
+ # Acquire lock for this opener
140
+ opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
141
+ if not (lock := self._opener_locks.get(opener_lock_key)):
142
+ lock = Lock()
143
+ self._opener_locks[opener_lock_key] = lock
144
+
145
+ async with lock:
146
+ # Another task could have created the storage while we were waiting for the lock - check if that
147
+ # happened
148
+ if cached_instance := self._get_from_cache(
149
+ cls,
150
+ id=id,
151
+ name=name,
152
+ alias=alias,
153
+ storage_client_cache_key=storage_client_cache_key,
154
+ ):
76
155
  return cached_instance
77
156
 
78
- if name is not None:
79
- type_cache_by_name = self._cache_by_name.get(cls, {})
80
- if name in type_cache_by_name:
81
- cached_instance = type_cache_by_name[name]
82
- if isinstance(cached_instance, cls):
83
- return cached_instance
157
+ # Check for conflicts between named and alias storages
158
+ self._check_name_alias_conflict(
159
+ cls,
160
+ name=name,
161
+ alias=alias,
162
+ storage_client_cache_key=storage_client_cache_key,
163
+ )
164
+
165
+ # Create new instance
166
+ client: KeyValueStoreClient | DatasetClient | RequestQueueClient
167
+ client = await client_opener_coro
168
+
169
+ metadata = await client.get_metadata()
84
170
 
85
- # Create new instance
86
- client = await client_opener(id=id, name=name, configuration=configuration)
87
- metadata = await client.get_metadata()
171
+ instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg]
172
+ instance_name = getattr(instance, 'name', None)
88
173
 
89
- instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg]
90
- instance_name = getattr(instance, 'name', None)
174
+ # Cache the instance.
175
+ # Note: No awaits in this section. All cache entries must be written
176
+ # atomically to ensure pre-checks outside the lock see consistent state.
91
177
 
92
- # Cache the instance
93
- type_cache_by_id = self._cache_by_id.setdefault(cls, {})
94
- type_cache_by_name = self._cache_by_name.setdefault(cls, {})
178
+ # Always cache by id.
179
+ self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance
95
180
 
96
- type_cache_by_id[instance.id] = instance
97
- if instance_name is not None:
98
- type_cache_by_name[instance_name] = instance
181
+ # Cache named storage.
182
+ if instance_name is not None:
183
+ self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance
99
184
 
100
- # Set as default if no id/name specified
101
- if id is None and name is None:
102
- self._default_instances[cls] = instance
185
+ # Cache unnamed storage.
186
+ if alias is not None:
187
+ self._cache.by_alias[cls][alias][storage_client_cache_key] = instance
103
188
 
104
- return instance
189
+ return instance
190
+
191
+ finally:
192
+ # Make sure the client opener is closed.
193
+ # If it was awaited, then closing is no operation, if it was not awaited, this is the cleanup.
194
+ client_opener_coro.close()
105
195
 
106
196
  def remove_from_cache(self, storage_instance: Storage) -> None:
107
197
  """Remove a storage instance from the cache.
@@ -109,25 +199,56 @@ class StorageInstanceManager:
109
199
  Args:
110
200
  storage_instance: The storage instance to remove.
111
201
  """
112
- storage_type = type(storage_instance)
113
-
114
- # Remove from ID cache
115
- type_cache_by_id = self._cache_by_id.get(storage_type, {})
116
- if storage_instance.id in type_cache_by_id:
117
- del type_cache_by_id[storage_instance.id]
118
-
119
- # Remove from name cache
120
- if storage_instance.name is not None:
121
- type_cache_by_name = self._cache_by_name.get(storage_type, {})
122
- if storage_instance.name in type_cache_by_name:
123
- del type_cache_by_name[storage_instance.name]
124
-
125
- # Remove from default instances
126
- if storage_type in self._default_instances and self._default_instances[storage_type] is storage_instance:
127
- del self._default_instances[storage_type]
202
+ self._cache.remove_from_cache(storage_instance)
128
203
 
129
204
  def clear_cache(self) -> None:
130
205
  """Clear all cached storage instances."""
131
- self._cache_by_id.clear()
132
- self._cache_by_name.clear()
133
- self._default_instances.clear()
206
+ self._cache = _StorageCache()
207
+
208
+ def _get_from_cache(
209
+ self,
210
+ cls: type[T],
211
+ *,
212
+ id: str | None = None,
213
+ name: str | None = None,
214
+ alias: str | None = None,
215
+ storage_client_cache_key: Hashable = '',
216
+ ) -> T | None:
217
+ """Get a storage instance from the cache."""
218
+ if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
219
+ if isinstance(cached_instance, cls):
220
+ return cached_instance
221
+ raise RuntimeError('Cached instance type mismatch.')
222
+
223
+ if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
224
+ if isinstance(cached_instance, cls):
225
+ return cached_instance
226
+ raise RuntimeError('Cached instance type mismatch.')
227
+
228
+ if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)):
229
+ if isinstance(cached_instance, cls):
230
+ return cached_instance
231
+ raise RuntimeError('Cached instance type mismatch.')
232
+
233
+ return None
234
+
235
+ def _check_name_alias_conflict(
236
+ self,
237
+ cls: type[T],
238
+ *,
239
+ name: str | None = None,
240
+ alias: str | None = None,
241
+ storage_client_cache_key: Hashable = '',
242
+ ) -> None:
243
+ """Check for conflicts between named and alias storages."""
244
+ if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
245
+ raise ValueError(
246
+ f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
247
+ f'Use a different alias or drop the existing named storage first.'
248
+ )
249
+
250
+ if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
251
+ raise ValueError(
252
+ f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
253
+ f'Use a different name or drop the existing alias storage first.'
254
+ )
@@ -0,0 +1,11 @@
1
+ import re
2
+
3
+ NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')
4
+
5
+
6
+ def validate_storage_name(name: str | None) -> None:
7
+ if name and not NAME_REGEX.match(name):
8
+ raise ValueError(
9
+ f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
10
+ '"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
11
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlee
3
- Version: 0.6.13b15
3
+ Version: 1.3.1b3
4
4
  Summary: Crawlee for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -223,16 +223,18 @@ Classifier: Programming Language :: Python :: 3.10
223
223
  Classifier: Programming Language :: Python :: 3.11
224
224
  Classifier: Programming Language :: Python :: 3.12
225
225
  Classifier: Programming Language :: Python :: 3.13
226
+ Classifier: Programming Language :: Python :: 3.14
226
227
  Classifier: Topic :: Software Development :: Libraries
227
228
  Requires-Python: >=3.10
229
+ Requires-Dist: async-timeout>=5.0.1
228
230
  Requires-Dist: cachetools>=5.5.0
229
231
  Requires-Dist: colorama>=0.4.0
230
- Requires-Dist: impit>=0.5.2
232
+ Requires-Dist: impit>=0.8.0
231
233
  Requires-Dist: more-itertools>=10.2.0
232
234
  Requires-Dist: protego>=0.5.0
233
235
  Requires-Dist: psutil>=6.0.0
234
- Requires-Dist: pydantic!=2.10.0,!=2.10.1,!=2.10.2,>=2.8.0
235
- Requires-Dist: pydantic-settings!=2.7.0,!=2.7.1,!=2.8.0,>=2.2.0
236
+ Requires-Dist: pydantic-settings>=2.12.0
237
+ Requires-Dist: pydantic>=2.11.0
236
238
  Requires-Dist: pyee>=9.0.0
237
239
  Requires-Dist: tldextract>=5.1.0
238
240
  Requires-Dist: typing-extensions>=4.1.0
@@ -244,7 +246,9 @@ Requires-Dist: jaro-winkler>=2.0.3; extra == 'adaptive-crawler'
244
246
  Requires-Dist: playwright>=1.27.0; extra == 'adaptive-crawler'
245
247
  Requires-Dist: scikit-learn>=1.6.0; extra == 'adaptive-crawler'
246
248
  Provides-Extra: all
249
+ Requires-Dist: aiosqlite>=0.21.0; extra == 'all'
247
250
  Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'all'
251
+ Requires-Dist: asyncpg>=0.24.0; extra == 'all'
248
252
  Requires-Dist: beautifulsoup4[lxml]>=4.12.0; extra == 'all'
249
253
  Requires-Dist: browserforge>=1.2.3; extra == 'all'
250
254
  Requires-Dist: cookiecutter>=2.6.0; extra == 'all'
@@ -261,8 +265,10 @@ Requires-Dist: opentelemetry-sdk>=1.34.1; extra == 'all'
261
265
  Requires-Dist: opentelemetry-semantic-conventions>=0.54; extra == 'all'
262
266
  Requires-Dist: parsel>=1.10.0; extra == 'all'
263
267
  Requires-Dist: playwright>=1.27.0; extra == 'all'
268
+ Requires-Dist: redis[hiredis]>=7.0.0; extra == 'all'
264
269
  Requires-Dist: rich>=13.9.0; extra == 'all'
265
270
  Requires-Dist: scikit-learn>=1.6.0; extra == 'all'
271
+ Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'all'
266
272
  Requires-Dist: typer>=0.12.0; extra == 'all'
267
273
  Requires-Dist: wrapt>=1.17.0; extra == 'all'
268
274
  Provides-Extra: beautifulsoup
@@ -293,6 +299,14 @@ Provides-Extra: playwright
293
299
  Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'playwright'
294
300
  Requires-Dist: browserforge>=1.2.3; extra == 'playwright'
295
301
  Requires-Dist: playwright>=1.27.0; extra == 'playwright'
302
+ Provides-Extra: redis
303
+ Requires-Dist: redis[hiredis]>=7.0.0; extra == 'redis'
304
+ Provides-Extra: sql-postgres
305
+ Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
306
+ Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'
307
+ Provides-Extra: sql-sqlite
308
+ Requires-Dist: aiosqlite>=0.21.0; extra == 'sql-sqlite'
309
+ Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-sqlite'
296
310
  Description-Content-Type: text/markdown
297
311
 
298
312
  <h1 align="center">
@@ -310,25 +324,16 @@ Description-Content-Type: text/markdown
310
324
  <a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
311
325
  </p>
312
326
 
313
- <p align=center>
314
- <a href="https://badge.fury.io/py/crawlee" rel="nofollow">
315
- <img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI version" style="max-width: 100%;">
316
- </a>
317
- <a href="https://pypi.org/project/crawlee/" rel="nofollow">
318
- <img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI - Downloads" style="max-width: 100%;">
319
- </a>
320
- <a href="https://pypi.org/project/crawlee/" rel="nofollow">
321
- <img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI - Python Version" style="max-width: 100%;">
322
- </a>
323
- <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow">
324
- <img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on discord" style="max-width: 100%;">
325
- </a>
327
+ <p align="center">
328
+ <a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
329
+ <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
330
+ <a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
331
+ <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
332
+ <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
326
333
  </p>
327
334
 
328
335
  Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
329
336
 
330
- > 🚀 Crawlee for Python is open to early adopters!
331
-
332
337
  Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.
333
338
 
334
339
  > 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈