apify 3.4.2b28__tar.gz → 3.4.2b30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {apify-3.4.2b28 → apify-3.4.2b30}/CHANGELOG.md +2 -0
  2. {apify-3.4.2b28 → apify-3.4.2b30}/CONTRIBUTING.md +28 -15
  3. {apify-3.4.2b28 → apify-3.4.2b30}/PKG-INFO +3 -3
  4. {apify-3.4.2b28 → apify-3.4.2b30}/README.md +1 -1
  5. {apify-3.4.2b28 → apify-3.4.2b30}/pyproject.toml +2 -2
  6. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_client.py +4 -0
  7. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +85 -29
  8. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_single_client.py +73 -23
  9. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_utils.py +50 -0
  10. {apify-3.4.2b28 → apify-3.4.2b30}/.gitignore +0 -0
  11. {apify-3.4.2b28 → apify-3.4.2b30}/LICENSE +0 -0
  12. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/__init__.py +0 -0
  13. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_actor.py +0 -0
  14. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_charging.py +0 -0
  15. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_configuration.py +0 -0
  16. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_consts.py +0 -0
  17. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_crypto.py +0 -0
  18. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_proxy_configuration.py +0 -0
  19. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_utils.py +0 -0
  20. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_webhook.py +0 -0
  21. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/errors.py +0 -0
  22. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/__init__.py +0 -0
  23. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/_apify_event_manager.py +0 -0
  24. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/_types.py +0 -0
  25. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/py.typed +0 -0
  26. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/log.py +0 -0
  27. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/py.typed +0 -0
  28. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/request_loaders/__init__.py +0 -0
  29. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/request_loaders/_apify_request_list.py +0 -0
  30. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/request_loaders/py.typed +0 -0
  31. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/__init__.py +0 -0
  32. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_actor_runner.py +0 -0
  33. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_async_thread.py +0 -0
  34. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_logging_config.py +0 -0
  35. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_serialization.py +0 -0
  36. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/extensions/__init__.py +0 -0
  37. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/extensions/_httpcache.py +0 -0
  38. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/middlewares/__init__.py +0 -0
  39. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
  40. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/middlewares/py.typed +0 -0
  41. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/pipelines/__init__.py +0 -0
  42. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
  43. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/pipelines/py.typed +0 -0
  44. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/py.typed +0 -0
  45. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/requests.py +0 -0
  46. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/scheduler.py +0 -0
  47. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/utils.py +0 -0
  48. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/__init__.py +0 -0
  49. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/__init__.py +0 -0
  50. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_alias_resolving.py +0 -0
  51. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_api_client_creation.py +0 -0
  52. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
  53. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
  54. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_models.py +0 -0
  55. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
  56. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/py.typed +0 -0
  57. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/__init__.py +0 -0
  58. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
  59. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
  60. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
  61. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
  62. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
  63. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
  64. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/py.typed +0 -0
  65. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storages/__init__.py +0 -0
  66. {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storages/py.typed +0 -0
@@ -31,6 +31,8 @@ All notable changes to this project will be documented in this file.
31
31
  - Forward all `Webhook` fields to ad-hoc webhooks ([#963](https://github.com/apify/apify-sdk-python/pull/963)) ([726620b](https://github.com/apify/apify-sdk-python/commit/726620be25da85b74b3f0d1e4f8c1f8f1b29d9b1)) by [@vdusek](https://github.com/vdusek)
32
32
  - **scrapy:** Avoid mutating request userData during Scrapy-Apify conversion ([#978](https://github.com/apify/apify-sdk-python/pull/978)) ([b0b7df7](https://github.com/apify/apify-sdk-python/commit/b0b7df72eb169778ab88be04d8b30bb0bdc307d3)) by [@vdusek](https://github.com/vdusek)
33
33
  - **scrapy:** Async-thread startup race, shutdown lifecycle, and timeout setting ([#979](https://github.com/apify/apify-sdk-python/pull/979)) ([ae12935](https://github.com/apify/apify-sdk-python/commit/ae1293512f5ee781533dab5b1dd1f0af0fcc2497)) by [@vdusek](https://github.com/vdusek)
34
+ - Commit request queue dedup cache only after batch_add_requests succeeds ([#975](https://github.com/apify/apify-sdk-python/pull/975)) ([078ab87](https://github.com/apify/apify-sdk-python/commit/078ab8744c96e61a5226a9d19869b5e0df71ab23)) by [@vdusek](https://github.com/vdusek)
35
+ - Prevent request queue softlock by adding `is_finished` and correcting `is_empty` ([#1008](https://github.com/apify/apify-sdk-python/pull/1008)) ([4ead0c6](https://github.com/apify/apify-sdk-python/commit/4ead0c64d2a95263b2fa970f5a8fff9141db62b2)) by [@Mantisus](https://github.com/Mantisus), closes [#987](https://github.com/apify/apify-sdk-python/issues/987)
34
36
 
35
37
  ### 🚜 Refactor
36
38
 
@@ -118,6 +118,22 @@ To run the documentation locally (requires Node.js):
118
118
  uv run poe run-docs
119
119
  ```
120
120
 
121
+ ### Linting the docs and website
122
+
123
+ Markdown content (this guide, `README.md`, and the `docs/` folder) is checked with
124
+ [markdownlint](https://github.com/DavidAnson/markdownlint). The Docusaurus website code is linted
125
+ with [oxlint](https://oxc.rs/) and formatted with [oxfmt](https://oxc.rs/). All of them run in CI.
126
+ To run them locally (requires Node.js 22.12 or newer and pnpm), from the `website/` directory:
127
+
128
+ ```sh
129
+ pnpm lint # lint Markdown and website code
130
+ pnpm lint:fix # auto-fix both
131
+ pnpm format # format the website code
132
+ ```
133
+
134
+ Doc images are committed as optimized `.webp`. To convert a new image, run
135
+ `pnpm opt:images <path-to-image>` from the `website/` directory.
136
+
121
137
  ## Commits
122
138
 
123
139
  We use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. This convention is used to automatically determine version bumps during the release process.
@@ -149,25 +165,22 @@ Publishing new versions to [PyPI](https://pypi.org/project/apify) is automated t
149
165
 
150
166
  1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead.
151
167
  2. **Make sure you know what you're doing.**
168
+ 3. Update the version number by modifying the `version` field under `project` in `pyproject.toml`:
152
169
 
153
- 3. Update the version number:
154
-
155
- - Modify the `version` field under `project` in `pyproject.toml`.
156
-
157
- ```toml
158
- [project]
159
- name = "apify"
160
- version = "x.z.y"
161
- ```
170
+ ```toml
171
+ [project]
172
+ name = "apify"
173
+ version = "x.z.y"
174
+ ```
162
175
 
163
176
  4. Build the package:
164
177
 
165
- ```sh
166
- uv run poe build
167
- ```
178
+ ```sh
179
+ uv run poe build
180
+ ```
168
181
 
169
182
  5. Upload to PyPI:
170
183
 
171
- ```sh
172
- uv publish --token YOUR_API_TOKEN
173
- ```
184
+ ```sh
185
+ uv publish --token YOUR_API_TOKEN
186
+ ```
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 3.4.2b28
3
+ Version: 3.4.2b30
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -228,7 +228,7 @@ Classifier: Typing :: Typed
228
228
  Requires-Python: >=3.11
229
229
  Requires-Dist: apify-client<4.0.0,>=3.0.0
230
230
  Requires-Dist: cachetools>=5.5.0
231
- Requires-Dist: crawlee<2.0.0,>=1.0.4
231
+ Requires-Dist: crawlee<2.0.0,>=1.8.0
232
232
  Requires-Dist: cryptography>=42.0.0
233
233
  Requires-Dist: impit>=0.8.0
234
234
  Requires-Dist: lazy-object-proxy>=1.11.0
@@ -438,7 +438,7 @@ async def main() -> None:
438
438
  The full SDK documentation lives at **[docs.apify.com/sdk/python](https://docs.apify.com/sdk/python)**. For the Apify platform itself, see the [Apify documentation](https://docs.apify.com/).
439
439
 
440
440
  | Section | What you'll find |
441
- |---|---|
441
+ | --- | --- |
442
442
  | [Overview](https://docs.apify.com/sdk/python/docs/overview) | What the SDK is, what Actors are, and how the pieces fit together. |
443
443
  | [Quick start](https://docs.apify.com/sdk/python/docs/quick-start) | Create, run, and deploy your first Python Actor. |
444
444
  | [Concepts](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle) | Actor lifecycle, input, storages, events, proxy management, interacting with other Actors, webhooks, accessing the Apify API, logging, configuration, and pay-per-event. |
@@ -195,7 +195,7 @@ async def main() -> None:
195
195
  The full SDK documentation lives at **[docs.apify.com/sdk/python](https://docs.apify.com/sdk/python)**. For the Apify platform itself, see the [Apify documentation](https://docs.apify.com/).
196
196
 
197
197
  | Section | What you'll find |
198
- |---|---|
198
+ | --- | --- |
199
199
  | [Overview](https://docs.apify.com/sdk/python/docs/overview) | What the SDK is, what Actors are, and how the pieces fit together. |
200
200
  | [Quick start](https://docs.apify.com/sdk/python/docs/quick-start) | Create, run, and deploy your first Python Actor. |
201
201
  | [Concepts](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle) | Actor lifecycle, input, storages, events, proxy management, interacting with other Actors, webhooks, accessing the Apify API, logging, configuration, and pay-per-event. |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "apify"
7
- version = "3.4.2b28"
7
+ version = "3.4.2b30"
8
8
  description = "Apify SDK for Python"
9
9
  authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
10
10
  license = { file = "LICENSE" }
@@ -36,7 +36,7 @@ keywords = [
36
36
  ]
37
37
  dependencies = [
38
38
  "apify-client>=3.0.0,<4.0.0",
39
- "crawlee>=1.0.4,<2.0.0",
39
+ "crawlee>=1.8.0,<2.0.0",
40
40
  "cachetools>=5.5.0",
41
41
  "cryptography>=42.0.0",
42
42
  "impit>=0.8.0",
@@ -198,3 +198,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
198
198
  @override
199
199
  async def is_empty(self) -> bool:
200
200
  return await self._implementation.is_empty()
201
+
202
+ @override
203
+ async def is_finished(self) -> bool:
204
+ return await self._implementation.is_finished()
@@ -11,7 +11,12 @@ from cachetools import LRUCache
11
11
  from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
12
12
 
13
13
  from ._models import ApifyRequestQueueMetadata, CachedRequest, RequestQueueHead
14
- from ._utils import to_crawlee_request, unique_key_to_request_id
14
+ from ._utils import (
15
+ resolve_awaited_in_flight,
16
+ settle_pending_addition,
17
+ to_crawlee_request,
18
+ unique_key_to_request_id,
19
+ )
15
20
 
16
21
  if TYPE_CHECKING:
17
22
  from collections.abc import Callable, Coroutine, Sequence
@@ -71,6 +76,19 @@ class ApifyRequestQueueSharedClient:
71
76
  self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=cache_size)
72
77
  """LRU cache storing request objects, keyed by request ID."""
73
78
 
79
+ self._requests_being_added: dict[str, asyncio.Future[bool]] = {}
80
+ """In-flight `add_batch_of_requests` markers, keyed by request ID.
81
+
82
+ Coordinates only concurrent `add_batch_of_requests` calls sharing this one client instance (e.g. several
83
+ producer coroutines adding requests in the same process). It does not coordinate separate client instances
84
+ or processes, which each keep their own markers; deduplication across clients still relies on the platform.
85
+
86
+ Each future resolves once the platform call that is adding the request settles: `True` if the request was
87
+ committed, `False` otherwise. A concurrent call adding the same request awaits the future instead of
88
+ re-sending it, which avoids a duplicate platform write while still avoiding false success when the original
89
+ add fails.
90
+ """
91
+
74
92
  self._queue_has_locked_requests: bool | None = None
75
93
  """Whether the queue contains requests currently locked by other clients."""
76
94
 
@@ -87,9 +105,13 @@ class ApifyRequestQueueSharedClient:
87
105
  forefront: bool = False,
88
106
  ) -> AddRequestsResponse:
89
107
  """Specific implementation of this method for the RQ shared access mode."""
108
+ loop = asyncio.get_running_loop()
90
109
  # Do not try to add previously added requests to avoid pointless expensive calls to API
91
110
  new_requests: list[Request] = []
92
111
  already_present_requests: list[ProcessedRequest] = []
112
+ # Requests a concurrent `add_batch_of_requests` call is already sending. We await its outcome instead of
113
+ # re-sending them, as (request, that call's in-flight future) pairs.
114
+ awaited_in_flight: list[tuple[Request, asyncio.Future[bool]]] = []
93
115
 
94
116
  for request in requests:
95
117
  request_id = unique_key_to_request_id(request.unique_key)
@@ -106,46 +128,70 @@ class ApifyRequestQueueSharedClient:
106
128
  )
107
129
  )
108
130
 
131
+ elif request_id in self._requests_being_added:
132
+ # A concurrent call is already adding this request; await its outcome rather than re-sending it.
133
+ awaited_in_flight.append((request, self._requests_being_added[request_id]))
134
+
109
135
  else:
110
- # Add new request to the cache.
111
- processed_request = ProcessedRequest(
112
- id=request_id,
113
- unique_key=request.unique_key,
114
- was_already_present=True,
115
- was_already_handled=request.was_already_handled,
116
- )
117
- self._cache_request(
118
- request_id,
119
- processed_request,
120
- )
136
+ # Register an in-flight marker so a concurrent call dedupes against it; caching is deferred
137
+ # until the platform confirms the request was accepted (see below).
121
138
  new_requests.append(request)
139
+ self._requests_being_added[request_id] = loop.create_future()
122
140
 
123
141
  if new_requests:
124
142
  # Prepare requests for API by converting to dictionaries.
125
143
  requests_dict = [request.model_dump(by_alias=True) for request in new_requests]
126
144
 
127
- # Send requests to API.
128
- batch_response = await self._api_client.batch_add_requests(
129
- requests=requests_dict,
130
- forefront=forefront,
131
- )
132
-
133
- batch_response_dict = batch_response.model_dump(by_alias=True)
134
- api_response = AddRequestsResponse.model_validate(batch_response_dict)
135
-
136
- # Add the locally known already present processed requests based on the local cache.
137
- api_response.processed_requests.extend(already_present_requests)
145
+ committed_request_ids: set[str] = set()
146
+ try:
147
+ # Send requests to API.
148
+ batch_response = await self._api_client.batch_add_requests(
149
+ requests=requests_dict,
150
+ forefront=forefront,
151
+ )
138
152
 
139
- # Remove unprocessed requests from the cache
140
- for unprocessed_request in api_response.unprocessed_requests:
141
- unprocessed_request_id = unique_key_to_request_id(unprocessed_request.unique_key)
142
- self._requests_cache.pop(unprocessed_request_id, None)
153
+ batch_response_dict = batch_response.model_dump(by_alias=True)
154
+ api_response = AddRequestsResponse.model_validate(batch_response_dict)
155
+
156
+ # Commit only the requests the platform actually accepted to the local dedup cache. Caching after
157
+ # the call succeeds (not before) keeps a failed call from poisoning the cache and silently
158
+ # deduplicating a later retry of the same request.
159
+ unprocessed_unique_keys = {request.unique_key for request in api_response.unprocessed_requests}
160
+ for request in new_requests:
161
+ if request.unique_key in unprocessed_unique_keys:
162
+ continue
163
+ request_id = unique_key_to_request_id(request.unique_key)
164
+ self._cache_request(
165
+ request_id,
166
+ ProcessedRequest(
167
+ id=request_id,
168
+ unique_key=request.unique_key,
169
+ was_already_present=True,
170
+ was_already_handled=request.was_already_handled,
171
+ ),
172
+ )
173
+ committed_request_ids.add(request_id)
174
+
175
+ # Add the locally known already present processed requests based on the local cache.
176
+ api_response.processed_requests.extend(already_present_requests)
177
+ finally:
178
+ # Release the in-flight markers we registered. Committed requests tell concurrent callers the
179
+ # request reached the platform; everything else (unprocessed, API error, cancellation) tells them
180
+ # it did not, so they retry instead of reporting false success.
181
+ for request in new_requests:
182
+ request_id = unique_key_to_request_id(request.unique_key)
183
+ settle_pending_addition(
184
+ self._requests_being_added, request_id, committed=request_id in committed_request_ids
185
+ )
143
186
 
144
187
  else:
145
188
  api_response = AddRequestsResponse.model_validate(
146
189
  {'unprocessedRequests': [], 'processedRequests': already_present_requests}
147
190
  )
148
191
 
192
+ # Fold in requests a concurrent call was already adding.
193
+ await resolve_awaited_in_flight(awaited_in_flight, api_response)
194
+
149
195
  logger.debug(
150
196
  f'Tried to add new requests: {len(new_requests)}, '
151
197
  f'succeeded to add new requests: {len(api_response.processed_requests) - len(already_present_requests)}, '
@@ -292,8 +338,18 @@ class ApifyRequestQueueSharedClient:
292
338
  # Check _list_head.
293
339
  # Without the lock, `is_empty` can falsely report True because of a low-probability race condition.
294
340
  async with self._fetch_lock:
295
- head = await self._list_head(limit=1)
296
- return len(head.items) == 0 and not self._queue_has_locked_requests
341
+ return await self._is_empty()
342
+
343
+ async def is_finished(self) -> bool:
344
+ """Specific implementation of this method for the RQ shared access mode."""
345
+ async with self._fetch_lock:
346
+ # The order of operations matters here, because the `_is_empty()` call affects `_queue_has_locked_requests`.
347
+ return await self._is_empty() and not self._queue_has_locked_requests
348
+
349
+ async def _is_empty(self) -> bool:
350
+ """Check whether anything is available to fetch. Lock-free core of `is_empty`; the caller must hold the lock."""
351
+ head = await self._list_head(limit=1)
352
+ return len(head.items) == 0
297
353
 
298
354
  async def _get_metadata_estimate(self) -> RequestQueueMetadata:
299
355
  """Try to get cached metadata first. If multiple clients, fuse with global metadata.
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  from collections import deque
4
5
  from datetime import UTC, datetime
5
6
  from logging import getLogger
@@ -9,7 +10,12 @@ from cachetools import LRUCache
9
10
 
10
11
  from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
11
12
 
12
- from ._utils import to_crawlee_request, unique_key_to_request_id
13
+ from ._utils import (
14
+ resolve_awaited_in_flight,
15
+ settle_pending_addition,
16
+ to_crawlee_request,
17
+ unique_key_to_request_id,
18
+ )
13
19
 
14
20
  if TYPE_CHECKING:
15
21
  from collections.abc import Sequence
@@ -90,6 +96,19 @@ class ApifyRequestQueueSingleClient:
90
96
  Tracked locally to accurately determine when the queue is empty for this single consumer.
91
97
  """
92
98
 
99
+ self._requests_being_added: dict[str, asyncio.Future[bool]] = {}
100
+ """In-flight `add_batch_of_requests` markers, keyed by request ID.
101
+
102
+ Coordinates only concurrent `add_batch_of_requests` calls sharing this one client instance (e.g. several
103
+ producer coroutines adding requests in the same process). It does not coordinate separate client instances
104
+ or processes, which each keep their own markers; deduplication across clients still relies on the platform.
105
+
106
+ Each future resolves once the platform call that is adding the request settles: `True` if the request was
107
+ committed, `False` otherwise. A concurrent call adding the same request awaits the future instead of
108
+ re-sending it, which avoids a duplicate platform write while still avoiding false success when the original
109
+ add fails.
110
+ """
111
+
93
112
  self._initialized_caches = False
94
113
  """Flag indicating whether local caches have been populated from existing queue contents.
95
114
 
@@ -108,8 +127,12 @@ class ApifyRequestQueueSingleClient:
108
127
  await self._init_caches()
109
128
  self._initialized_caches = True
110
129
 
130
+ loop = asyncio.get_running_loop()
111
131
  new_requests: list[Request] = []
112
132
  already_present_requests: list[ProcessedRequest] = []
133
+ # Requests a concurrent `add_batch_of_requests` call is already sending. We await its outcome instead of
134
+ # re-sending them, as (request, that call's in-flight future) pairs.
135
+ awaited_in_flight: list[tuple[Request, asyncio.Future[bool]]] = []
113
136
 
114
137
  for request in requests:
115
138
  # Calculate id for request
@@ -135,33 +158,54 @@ class ApifyRequestQueueSingleClient:
135
158
  was_already_handled=request.was_already_handled,
136
159
  )
137
160
  )
161
+ # Check if a concurrent call is already adding this request, and await its outcome rather than
162
+ # re-sending it.
163
+ elif request_id in self._requests_being_added:
164
+ awaited_in_flight.append((request, self._requests_being_added[request_id]))
138
165
  else:
139
- # Push the request to the platform. Probably not there, or we are not aware of it
166
+ # Push the request to the platform. Probably not there, or we are not aware of it. Register an
167
+ # in-flight marker so a concurrent call dedupes against it; caching is deferred until the
168
+ # platform confirms the request was accepted (see below).
140
169
  new_requests.append(request)
141
-
142
- # Update local caches
143
- self._requests_cache[request_id] = request
144
- if forefront:
145
- self._head_requests.append(request_id)
146
- else:
147
- self._head_requests.appendleft(request_id)
170
+ self._requests_being_added[request_id] = loop.create_future()
148
171
 
149
172
  if new_requests:
150
173
  # Prepare requests for API by converting to dictionaries.
151
174
  requests_dict = [request.model_dump(by_alias=True) for request in new_requests]
152
175
 
153
- # Send requests to API.
154
- batch_response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront)
155
- batch_response_dict = batch_response.model_dump(by_alias=True)
156
- api_response = AddRequestsResponse.model_validate(batch_response_dict)
157
-
158
- # Add the locally known already present processed requests based on the local cache.
159
- api_response.processed_requests.extend(already_present_requests)
160
-
161
- # Remove unprocessed requests from the cache
162
- for unprocessed_request in api_response.unprocessed_requests:
163
- request_id = unique_key_to_request_id(unprocessed_request.unique_key)
164
- self._requests_cache.pop(request_id, None)
176
+ committed_request_ids: set[str] = set()
177
+ try:
178
+ # Send requests to API.
179
+ batch_response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront)
180
+ batch_response_dict = batch_response.model_dump(by_alias=True)
181
+ api_response = AddRequestsResponse.model_validate(batch_response_dict)
182
+
183
+ # Commit only the requests the platform actually accepted to the local caches. Caching after the
184
+ # call succeeds (not before) keeps a failed call from poisoning the cache and silently
185
+ # deduplicating a later retry of the same request.
186
+ unprocessed_unique_keys = {request.unique_key for request in api_response.unprocessed_requests}
187
+ for request in new_requests:
188
+ if request.unique_key in unprocessed_unique_keys:
189
+ continue
190
+ request_id = unique_key_to_request_id(request.unique_key)
191
+ self._requests_cache[request_id] = request
192
+ if forefront:
193
+ self._head_requests.append(request_id)
194
+ else:
195
+ self._head_requests.appendleft(request_id)
196
+ committed_request_ids.add(request_id)
197
+
198
+ # Add the locally known already present processed requests based on the local cache.
199
+ api_response.processed_requests.extend(already_present_requests)
200
+ finally:
201
+ # Release the in-flight markers we registered. Committed requests tell concurrent callers the
202
+ # request reached the platform; everything else (unprocessed, API error, cancellation) tells them
203
+ # it did not, so they retry instead of reporting false success.
204
+ for request in new_requests:
205
+ request_id = unique_key_to_request_id(request.unique_key)
206
+ settle_pending_addition(
207
+ self._requests_being_added, request_id, committed=request_id in committed_request_ids
208
+ )
165
209
 
166
210
  else:
167
211
  api_response = AddRequestsResponse(
@@ -169,6 +213,9 @@ class ApifyRequestQueueSingleClient:
169
213
  processed_requests=already_present_requests,
170
214
  )
171
215
 
216
+ # Fold in requests a concurrent call was already adding.
217
+ await resolve_awaited_in_flight(awaited_in_flight, api_response)
218
+
172
219
  # Update assumed total count for newly added requests.
173
220
  new_request_count = 0
174
221
  for processed_request in api_response.processed_requests:
@@ -277,9 +324,12 @@ class ApifyRequestQueueSingleClient:
277
324
 
278
325
  async def is_empty(self) -> bool:
279
326
  """Specific implementation of this method for the RQ single access mode."""
280
- # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition.
281
327
  await self._ensure_head_is_non_empty()
282
- return not self._head_requests and not self._requests_in_progress
328
+ return not self._head_requests
329
+
330
+ async def is_finished(self) -> bool:
331
+ """Specific implementation of this method for the RQ single access mode."""
332
+ return await self.is_empty() and not self._requests_in_progress
283
333
 
284
334
  async def _ensure_head_is_non_empty(self) -> None:
285
335
  """Ensure that the queue head has requests if they are available in the queue."""
@@ -1,17 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import re
4
5
  from base64 import b64encode
5
6
  from hashlib import sha256
6
7
  from typing import TYPE_CHECKING
7
8
 
8
9
  from crawlee._utils.crypto import compute_short_hash
10
+ from crawlee.storage_clients.models import ProcessedRequest, UnprocessedRequest
9
11
 
10
12
  from apify import Request
11
13
 
12
14
  if TYPE_CHECKING:
15
+ from collections.abc import Iterable
16
+
13
17
  from apify_client._models import HeadRequest, LockedHeadRequest
14
18
  from apify_client._models import Request as ClientRequest
19
+ from crawlee.storage_clients.models import AddRequestsResponse
15
20
 
16
21
  from apify import Configuration
17
22
 
@@ -60,3 +65,48 @@ def to_crawlee_request(client_request: ClientRequest | HeadRequest | LockedHeadR
60
65
 
61
66
  # Validate and construct Crawlee Request from the serialized dict
62
67
  return Request.model_validate(request_dict)
68
+
69
+
70
+ def settle_pending_addition(
71
+ requests_being_added: dict[str, asyncio.Future[bool]],
72
+ request_id: str,
73
+ *,
74
+ committed: bool,
75
+ ) -> None:
76
+ """Resolve the in-flight add marker for a request, unblocking any concurrent call awaiting it.
77
+
78
+ Args:
79
+ requests_being_added: The client's map of in-flight `add_batch_of_requests` markers.
80
+ request_id: ID of the request whose in-flight add has settled.
81
+ committed: Whether the request was committed to the platform.
82
+ """
83
+ future = requests_being_added.pop(request_id, None)
84
+ if future is not None and not future.done():
85
+ future.set_result(committed)
86
+
87
+
88
+ async def resolve_awaited_in_flight(
89
+ awaited_in_flight: Iterable[tuple[Request, asyncio.Future[bool]]],
90
+ api_response: AddRequestsResponse,
91
+ ) -> None:
92
+ """Await concurrent in-flight adds of these requests and fold the outcome into `api_response`.
93
+
94
+ Requests the concurrent add committed are reported as already present; the rest are reported unprocessed
95
+ so the caller retries them rather than receiving false success.
96
+ """
97
+ for request, future in awaited_in_flight:
98
+ # Shield the shared in-flight marker: cancelling this awaiting caller must not cancel the future, which
99
+ # is owned by the original producer and may have other callers waiting on it.
100
+ if await asyncio.shield(future):
101
+ api_response.processed_requests.append(
102
+ ProcessedRequest(
103
+ id=unique_key_to_request_id(request.unique_key),
104
+ unique_key=request.unique_key,
105
+ was_already_present=True,
106
+ was_already_handled=request.was_already_handled,
107
+ )
108
+ )
109
+ else:
110
+ api_response.unprocessed_requests.append(
111
+ UnprocessedRequest(unique_key=request.unique_key, url=request.url, method=request.method)
112
+ )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes