crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic. Click here for more details.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import asyncio
4
+ import functools
4
5
  import json
5
6
  import shutil
6
7
  import urllib.parse
@@ -10,11 +11,12 @@ from pathlib import Path
10
11
  from typing import TYPE_CHECKING, Any
11
12
 
12
13
  from pydantic import ValidationError
13
- from typing_extensions import override
14
+ from typing_extensions import Self, override
14
15
 
15
16
  from crawlee._consts import METADATA_FILENAME
16
17
  from crawlee._utils.crypto import crypto_random_object_id
17
18
  from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
19
+ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
18
20
  from crawlee.storage_clients._base import KeyValueStoreClient
19
21
  from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
20
22
 
@@ -55,7 +57,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
55
57
  self,
56
58
  *,
57
59
  metadata: KeyValueStoreMetadata,
58
- storage_dir: Path,
60
+ path_to_kvs: Path,
59
61
  lock: asyncio.Lock,
60
62
  ) -> None:
61
63
  """Initialize a new instance.
@@ -64,8 +66,8 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
64
66
  """
65
67
  self._metadata = metadata
66
68
 
67
- self._storage_dir = storage_dir
68
- """The base directory where the storage data are being persisted."""
69
+ self._path_to_kvs = path_to_kvs
70
+ """The full path to the key-value store directory."""
69
71
 
70
72
  self._lock = lock
71
73
  """A lock to ensure that only one operation is performed at a time."""
@@ -77,10 +79,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
77
79
  @property
78
80
  def path_to_kvs(self) -> Path:
79
81
  """The full path to the key-value store directory."""
80
- if self._metadata.name is None:
81
- return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
82
-
83
- return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
82
+ return self._path_to_kvs
84
83
 
85
84
  @property
86
85
  def path_to_metadata(self) -> Path:
@@ -93,8 +92,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
93
92
  *,
94
93
  id: str | None,
95
94
  name: str | None,
95
+ alias: str | None,
96
96
  configuration: Configuration,
97
- ) -> FileSystemKeyValueStoreClient:
97
+ ) -> Self:
98
98
  """Open or create a file system key-value store client.
99
99
 
100
100
  This method attempts to open an existing key-value store from the file system. If a KVS with the specified
@@ -103,17 +103,21 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
103
103
 
104
104
  Args:
105
105
  id: The ID of the key-value store to open. If provided, searches for existing store by ID.
106
- name: The name of the key-value store to open. If not provided, uses the default store.
106
+ name: The name of the key-value store for named (global scope) storages.
107
+ alias: The alias of the key-value store for unnamed (run scope) storages.
107
108
  configuration: The configuration object containing storage directory settings.
108
109
 
109
110
  Returns:
110
111
  An instance for the opened or created storage client.
111
112
 
112
113
  Raises:
113
- ValueError: If a store with the specified ID is not found, or if metadata is invalid.
114
+ ValueError: If a store with the specified ID is not found, if metadata is invalid,
115
+ or if both name and alias are provided.
114
116
  """
115
- storage_dir = Path(configuration.storage_dir)
116
- kvs_base_path = storage_dir / cls._STORAGE_SUBDIR
117
+ # Validate input parameters.
118
+ raise_if_too_many_kwargs(id=id, name=name, alias=alias)
119
+
120
+ kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
117
121
 
118
122
  if not kvs_base_path.exists():
119
123
  await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
@@ -125,19 +129,19 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
125
129
  if not kvs_dir.is_dir():
126
130
  continue
127
131
 
128
- metadata_path = kvs_dir / METADATA_FILENAME
129
- if not metadata_path.exists():
132
+ path_to_metadata = kvs_dir / METADATA_FILENAME
133
+ if not path_to_metadata.exists():
130
134
  continue
131
135
 
132
136
  try:
133
- file = await asyncio.to_thread(metadata_path.open)
137
+ file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
134
138
  try:
135
139
  file_content = json.load(file)
136
140
  metadata = KeyValueStoreMetadata(**file_content)
137
141
  if metadata.id == id:
138
142
  client = cls(
139
143
  metadata=metadata,
140
- storage_dir=storage_dir,
144
+ path_to_kvs=kvs_base_path / kvs_dir,
141
145
  lock=asyncio.Lock(),
142
146
  )
143
147
  await client._update_metadata(update_accessed_at=True)
@@ -151,14 +155,15 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
151
155
  if not found:
152
156
  raise ValueError(f'Key-value store with ID "{id}" not found.')
153
157
 
154
- # Get a new instance by name.
158
+ # Get a new instance by name or alias.
155
159
  else:
156
- kvs_path = kvs_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else kvs_base_path / name
157
- metadata_path = kvs_path / METADATA_FILENAME
160
+ kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
161
+ path_to_kvs = kvs_base_path / kvs_dir
162
+ path_to_metadata = path_to_kvs / METADATA_FILENAME
158
163
 
159
164
  # If the key-value store directory exists, reconstruct the client from the metadata file.
160
- if kvs_path.exists() and metadata_path.exists():
161
- file = await asyncio.to_thread(open, metadata_path)
165
+ if path_to_kvs.exists() and path_to_metadata.exists():
166
+ file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
162
167
  try:
163
168
  file_content = json.load(file)
164
169
  finally:
@@ -166,11 +171,11 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
166
171
  try:
167
172
  metadata = KeyValueStoreMetadata(**file_content)
168
173
  except ValidationError as exc:
169
- raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc
174
+ raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc
170
175
 
171
176
  client = cls(
172
177
  metadata=metadata,
173
- storage_dir=storage_dir,
178
+ path_to_kvs=path_to_kvs,
174
179
  lock=asyncio.Lock(),
175
180
  )
176
181
 
@@ -188,7 +193,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
188
193
  )
189
194
  client = cls(
190
195
  metadata=metadata,
191
- storage_dir=storage_dir,
196
+ path_to_kvs=path_to_kvs,
192
197
  lock=asyncio.Lock(),
193
198
  )
194
199
  await client._update_metadata()
@@ -235,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
235
240
  # Read the metadata file
236
241
  async with self._lock:
237
242
  try:
238
- file = await asyncio.to_thread(open, record_metadata_filepath)
243
+ file = await asyncio.to_thread(
244
+ functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
245
+ )
239
246
  except FileNotFoundError:
240
247
  logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
241
248
  return None
@@ -369,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
369
376
 
370
377
  # List and sort all files *inside* a brief lock, then release it immediately:
371
378
  async with self._lock:
372
- files = sorted(await asyncio.to_thread(list, self.path_to_kvs.glob('*')))
379
+ files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))
373
380
 
374
381
  count = 0
375
382