apify 3.4.2b28__tar.gz → 3.4.2b30__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {apify-3.4.2b28 → apify-3.4.2b30}/CHANGELOG.md +2 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/CONTRIBUTING.md +28 -15
- {apify-3.4.2b28 → apify-3.4.2b30}/PKG-INFO +3 -3
- {apify-3.4.2b28 → apify-3.4.2b30}/README.md +1 -1
- {apify-3.4.2b28 → apify-3.4.2b30}/pyproject.toml +2 -2
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_client.py +4 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +85 -29
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_single_client.py +73 -23
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_utils.py +50 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/.gitignore +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/LICENSE +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_actor.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_charging.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_configuration.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_consts.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_crypto.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_proxy_configuration.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_utils.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/_webhook.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/errors.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/_apify_event_manager.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/_types.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/events/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/log.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/request_loaders/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/request_loaders/_apify_request_list.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/request_loaders/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_actor_runner.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_async_thread.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_logging_config.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/_serialization.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/extensions/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/extensions/_httpcache.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/middlewares/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/pipelines/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/requests.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/scheduler.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/scrapy/utils.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_alias_resolving.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_api_client_creation.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_models.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/py.typed +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storages/__init__.py +0 -0
- {apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storages/py.typed +0 -0
|
@@ -31,6 +31,8 @@ All notable changes to this project will be documented in this file.
|
|
|
31
31
|
- Forward all `Webhook` fields to ad-hoc webhooks ([#963](https://github.com/apify/apify-sdk-python/pull/963)) ([726620b](https://github.com/apify/apify-sdk-python/commit/726620be25da85b74b3f0d1e4f8c1f8f1b29d9b1)) by [@vdusek](https://github.com/vdusek)
|
|
32
32
|
- **scrapy:** Avoid mutating request userData during Scrapy-Apify conversion ([#978](https://github.com/apify/apify-sdk-python/pull/978)) ([b0b7df7](https://github.com/apify/apify-sdk-python/commit/b0b7df72eb169778ab88be04d8b30bb0bdc307d3)) by [@vdusek](https://github.com/vdusek)
|
|
33
33
|
- **scrapy:** Async-thread startup race, shutdown lifecycle, and timeout setting ([#979](https://github.com/apify/apify-sdk-python/pull/979)) ([ae12935](https://github.com/apify/apify-sdk-python/commit/ae1293512f5ee781533dab5b1dd1f0af0fcc2497)) by [@vdusek](https://github.com/vdusek)
|
|
34
|
+
- Commit request queue dedup cache only after batch_add_requests succeeds ([#975](https://github.com/apify/apify-sdk-python/pull/975)) ([078ab87](https://github.com/apify/apify-sdk-python/commit/078ab8744c96e61a5226a9d19869b5e0df71ab23)) by [@vdusek](https://github.com/vdusek)
|
|
35
|
+
- Prevent request queue softlock by adding `is_finished` and correcting `is_empty` ([#1008](https://github.com/apify/apify-sdk-python/pull/1008)) ([4ead0c6](https://github.com/apify/apify-sdk-python/commit/4ead0c64d2a95263b2fa970f5a8fff9141db62b2)) by [@Mantisus](https://github.com/Mantisus), closes [#987](https://github.com/apify/apify-sdk-python/issues/987)
|
|
34
36
|
|
|
35
37
|
### 🚜 Refactor
|
|
36
38
|
|
|
@@ -118,6 +118,22 @@ To run the documentation locally (requires Node.js):
|
|
|
118
118
|
uv run poe run-docs
|
|
119
119
|
```
|
|
120
120
|
|
|
121
|
+
### Linting the docs and website
|
|
122
|
+
|
|
123
|
+
Markdown content (this guide, `README.md`, and the `docs/` folder) is checked with
|
|
124
|
+
[markdownlint](https://github.com/DavidAnson/markdownlint). The Docusaurus website code is linted
|
|
125
|
+
with [oxlint](https://oxc.rs/) and formatted with [oxfmt](https://oxc.rs/). All of them run in CI.
|
|
126
|
+
To run them locally (requires Node.js 22.12 or newer and pnpm), from the `website/` directory:
|
|
127
|
+
|
|
128
|
+
```sh
|
|
129
|
+
pnpm lint # lint Markdown and website code
|
|
130
|
+
pnpm lint:fix # auto-fix both
|
|
131
|
+
pnpm format # format the website code
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Doc images are committed as optimized `.webp`. To convert a new image, run
|
|
135
|
+
`pnpm opt:images <path-to-image>` from the `website/` directory.
|
|
136
|
+
|
|
121
137
|
## Commits
|
|
122
138
|
|
|
123
139
|
We use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. This convention is used to automatically determine version bumps during the release process.
|
|
@@ -149,25 +165,22 @@ Publishing new versions to [PyPI](https://pypi.org/project/apify) is automated t
|
|
|
149
165
|
|
|
150
166
|
1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead.
|
|
151
167
|
2. **Make sure you know what you're doing.**
|
|
168
|
+
3. Update the version number by modifying the `version` field under `project` in `pyproject.toml`:
|
|
152
169
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
```
|
|
158
|
-
[project]
|
|
159
|
-
name = "apify"
|
|
160
|
-
version = "x.z.y"
|
|
161
|
-
```
|
|
170
|
+
```toml
|
|
171
|
+
[project]
|
|
172
|
+
name = "apify"
|
|
173
|
+
version = "x.z.y"
|
|
174
|
+
```
|
|
162
175
|
|
|
163
176
|
4. Build the package:
|
|
164
177
|
|
|
165
|
-
```sh
|
|
166
|
-
uv run poe build
|
|
167
|
-
```
|
|
178
|
+
```sh
|
|
179
|
+
uv run poe build
|
|
180
|
+
```
|
|
168
181
|
|
|
169
182
|
5. Upload to PyPI:
|
|
170
183
|
|
|
171
|
-
```sh
|
|
172
|
-
uv publish --token YOUR_API_TOKEN
|
|
173
|
-
```
|
|
184
|
+
```sh
|
|
185
|
+
uv publish --token YOUR_API_TOKEN
|
|
186
|
+
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: apify
|
|
3
|
-
Version: 3.4.2b28
|
|
3
|
+
Version: 3.4.2b30
|
|
4
4
|
Summary: Apify SDK for Python
|
|
5
5
|
Project-URL: Apify Homepage, https://apify.com
|
|
6
6
|
Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
|
|
@@ -228,7 +228,7 @@ Classifier: Typing :: Typed
|
|
|
228
228
|
Requires-Python: >=3.11
|
|
229
229
|
Requires-Dist: apify-client<4.0.0,>=3.0.0
|
|
230
230
|
Requires-Dist: cachetools>=5.5.0
|
|
231
|
-
Requires-Dist: crawlee<2.0.0,>=1.0
|
|
231
|
+
Requires-Dist: crawlee<2.0.0,>=1.8.0
|
|
232
232
|
Requires-Dist: cryptography>=42.0.0
|
|
233
233
|
Requires-Dist: impit>=0.8.0
|
|
234
234
|
Requires-Dist: lazy-object-proxy>=1.11.0
|
|
@@ -438,7 +438,7 @@ async def main() -> None:
|
|
|
438
438
|
The full SDK documentation lives at **[docs.apify.com/sdk/python](https://docs.apify.com/sdk/python)**. For the Apify platform itself, see the [Apify documentation](https://docs.apify.com/).
|
|
439
439
|
|
|
440
440
|
| Section | What you'll find |
|
|
441
|
-
|
|
441
|
+
| --- | --- |
|
|
442
442
|
| [Overview](https://docs.apify.com/sdk/python/docs/overview) | What the SDK is, what Actors are, and how the pieces fit together. |
|
|
443
443
|
| [Quick start](https://docs.apify.com/sdk/python/docs/quick-start) | Create, run, and deploy your first Python Actor. |
|
|
444
444
|
| [Concepts](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle) | Actor lifecycle, input, storages, events, proxy management, interacting with other Actors, webhooks, accessing the Apify API, logging, configuration, and pay-per-event. |
|
|
@@ -195,7 +195,7 @@ async def main() -> None:
|
|
|
195
195
|
The full SDK documentation lives at **[docs.apify.com/sdk/python](https://docs.apify.com/sdk/python)**. For the Apify platform itself, see the [Apify documentation](https://docs.apify.com/).
|
|
196
196
|
|
|
197
197
|
| Section | What you'll find |
|
|
198
|
-
|
|
198
|
+
| --- | --- |
|
|
199
199
|
| [Overview](https://docs.apify.com/sdk/python/docs/overview) | What the SDK is, what Actors are, and how the pieces fit together. |
|
|
200
200
|
| [Quick start](https://docs.apify.com/sdk/python/docs/quick-start) | Create, run, and deploy your first Python Actor. |
|
|
201
201
|
| [Concepts](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle) | Actor lifecycle, input, storages, events, proxy management, interacting with other Actors, webhooks, accessing the Apify API, logging, configuration, and pay-per-event. |
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "apify"
|
|
7
|
-
version = "3.4.2b28"
|
|
7
|
+
version = "3.4.2b30"
|
|
8
8
|
description = "Apify SDK for Python"
|
|
9
9
|
authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -36,7 +36,7 @@ keywords = [
|
|
|
36
36
|
]
|
|
37
37
|
dependencies = [
|
|
38
38
|
"apify-client>=3.0.0,<4.0.0",
|
|
39
|
-
"crawlee>=1.0
|
|
39
|
+
"crawlee>=1.8.0,<2.0.0",
|
|
40
40
|
"cachetools>=5.5.0",
|
|
41
41
|
"cryptography>=42.0.0",
|
|
42
42
|
"impit>=0.8.0",
|
|
@@ -198,3 +198,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
198
198
|
@override
|
|
199
199
|
async def is_empty(self) -> bool:
|
|
200
200
|
return await self._implementation.is_empty()
|
|
201
|
+
|
|
202
|
+
@override
|
|
203
|
+
async def is_finished(self) -> bool:
|
|
204
|
+
return await self._implementation.is_finished()
|
{apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_shared_client.py
RENAMED
|
@@ -11,7 +11,12 @@ from cachetools import LRUCache
|
|
|
11
11
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
12
12
|
|
|
13
13
|
from ._models import ApifyRequestQueueMetadata, CachedRequest, RequestQueueHead
|
|
14
|
-
from ._utils import
|
|
14
|
+
from ._utils import (
|
|
15
|
+
resolve_awaited_in_flight,
|
|
16
|
+
settle_pending_addition,
|
|
17
|
+
to_crawlee_request,
|
|
18
|
+
unique_key_to_request_id,
|
|
19
|
+
)
|
|
15
20
|
|
|
16
21
|
if TYPE_CHECKING:
|
|
17
22
|
from collections.abc import Callable, Coroutine, Sequence
|
|
@@ -71,6 +76,19 @@ class ApifyRequestQueueSharedClient:
|
|
|
71
76
|
self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=cache_size)
|
|
72
77
|
"""LRU cache storing request objects, keyed by request ID."""
|
|
73
78
|
|
|
79
|
+
self._requests_being_added: dict[str, asyncio.Future[bool]] = {}
|
|
80
|
+
"""In-flight `add_batch_of_requests` markers, keyed by request ID.
|
|
81
|
+
|
|
82
|
+
Coordinates only concurrent `add_batch_of_requests` calls sharing this one client instance (e.g. several
|
|
83
|
+
producer coroutines adding requests in the same process). It does not coordinate separate client instances
|
|
84
|
+
or processes, which each keep their own markers; deduplication across clients still relies on the platform.
|
|
85
|
+
|
|
86
|
+
Each future resolves once the platform call that is adding the request settles: `True` if the request was
|
|
87
|
+
committed, `False` otherwise. A concurrent call adding the same request awaits the future instead of
|
|
88
|
+
re-sending it, which avoids a duplicate platform write while still avoiding false success when the original
|
|
89
|
+
add fails.
|
|
90
|
+
"""
|
|
91
|
+
|
|
74
92
|
self._queue_has_locked_requests: bool | None = None
|
|
75
93
|
"""Whether the queue contains requests currently locked by other clients."""
|
|
76
94
|
|
|
@@ -87,9 +105,13 @@ class ApifyRequestQueueSharedClient:
|
|
|
87
105
|
forefront: bool = False,
|
|
88
106
|
) -> AddRequestsResponse:
|
|
89
107
|
"""Specific implementation of this method for the RQ shared access mode."""
|
|
108
|
+
loop = asyncio.get_running_loop()
|
|
90
109
|
# Do not try to add previously added requests to avoid pointless expensive calls to API
|
|
91
110
|
new_requests: list[Request] = []
|
|
92
111
|
already_present_requests: list[ProcessedRequest] = []
|
|
112
|
+
# Requests a concurrent `add_batch_of_requests` call is already sending. We await its outcome instead of
|
|
113
|
+
# re-sending them, as (request, that call's in-flight future) pairs.
|
|
114
|
+
awaited_in_flight: list[tuple[Request, asyncio.Future[bool]]] = []
|
|
93
115
|
|
|
94
116
|
for request in requests:
|
|
95
117
|
request_id = unique_key_to_request_id(request.unique_key)
|
|
@@ -106,46 +128,70 @@ class ApifyRequestQueueSharedClient:
|
|
|
106
128
|
)
|
|
107
129
|
)
|
|
108
130
|
|
|
131
|
+
elif request_id in self._requests_being_added:
|
|
132
|
+
# A concurrent call is already adding this request; await its outcome rather than re-sending it.
|
|
133
|
+
awaited_in_flight.append((request, self._requests_being_added[request_id]))
|
|
134
|
+
|
|
109
135
|
else:
|
|
110
|
-
#
|
|
111
|
-
|
|
112
|
-
id=request_id,
|
|
113
|
-
unique_key=request.unique_key,
|
|
114
|
-
was_already_present=True,
|
|
115
|
-
was_already_handled=request.was_already_handled,
|
|
116
|
-
)
|
|
117
|
-
self._cache_request(
|
|
118
|
-
request_id,
|
|
119
|
-
processed_request,
|
|
120
|
-
)
|
|
136
|
+
# Register an in-flight marker so a concurrent call dedupes against it; caching is deferred
|
|
137
|
+
# until the platform confirms the request was accepted (see below).
|
|
121
138
|
new_requests.append(request)
|
|
139
|
+
self._requests_being_added[request_id] = loop.create_future()
|
|
122
140
|
|
|
123
141
|
if new_requests:
|
|
124
142
|
# Prepare requests for API by converting to dictionaries.
|
|
125
143
|
requests_dict = [request.model_dump(by_alias=True) for request in new_requests]
|
|
126
144
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
requests
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
api_response = AddRequestsResponse.model_validate(batch_response_dict)
|
|
135
|
-
|
|
136
|
-
# Add the locally known already present processed requests based on the local cache.
|
|
137
|
-
api_response.processed_requests.extend(already_present_requests)
|
|
145
|
+
committed_request_ids: set[str] = set()
|
|
146
|
+
try:
|
|
147
|
+
# Send requests to API.
|
|
148
|
+
batch_response = await self._api_client.batch_add_requests(
|
|
149
|
+
requests=requests_dict,
|
|
150
|
+
forefront=forefront,
|
|
151
|
+
)
|
|
138
152
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
153
|
+
batch_response_dict = batch_response.model_dump(by_alias=True)
|
|
154
|
+
api_response = AddRequestsResponse.model_validate(batch_response_dict)
|
|
155
|
+
|
|
156
|
+
# Commit only the requests the platform actually accepted to the local dedup cache. Caching after
|
|
157
|
+
# the call succeeds (not before) keeps a failed call from poisoning the cache and silently
|
|
158
|
+
# deduplicating a later retry of the same request.
|
|
159
|
+
unprocessed_unique_keys = {request.unique_key for request in api_response.unprocessed_requests}
|
|
160
|
+
for request in new_requests:
|
|
161
|
+
if request.unique_key in unprocessed_unique_keys:
|
|
162
|
+
continue
|
|
163
|
+
request_id = unique_key_to_request_id(request.unique_key)
|
|
164
|
+
self._cache_request(
|
|
165
|
+
request_id,
|
|
166
|
+
ProcessedRequest(
|
|
167
|
+
id=request_id,
|
|
168
|
+
unique_key=request.unique_key,
|
|
169
|
+
was_already_present=True,
|
|
170
|
+
was_already_handled=request.was_already_handled,
|
|
171
|
+
),
|
|
172
|
+
)
|
|
173
|
+
committed_request_ids.add(request_id)
|
|
174
|
+
|
|
175
|
+
# Add the locally known already present processed requests based on the local cache.
|
|
176
|
+
api_response.processed_requests.extend(already_present_requests)
|
|
177
|
+
finally:
|
|
178
|
+
# Release the in-flight markers we registered. Committed requests tell concurrent callers the
|
|
179
|
+
# request reached the platform; everything else (unprocessed, API error, cancellation) tells them
|
|
180
|
+
# it did not, so they retry instead of reporting false success.
|
|
181
|
+
for request in new_requests:
|
|
182
|
+
request_id = unique_key_to_request_id(request.unique_key)
|
|
183
|
+
settle_pending_addition(
|
|
184
|
+
self._requests_being_added, request_id, committed=request_id in committed_request_ids
|
|
185
|
+
)
|
|
143
186
|
|
|
144
187
|
else:
|
|
145
188
|
api_response = AddRequestsResponse.model_validate(
|
|
146
189
|
{'unprocessedRequests': [], 'processedRequests': already_present_requests}
|
|
147
190
|
)
|
|
148
191
|
|
|
192
|
+
# Fold in requests a concurrent call was already adding.
|
|
193
|
+
await resolve_awaited_in_flight(awaited_in_flight, api_response)
|
|
194
|
+
|
|
149
195
|
logger.debug(
|
|
150
196
|
f'Tried to add new requests: {len(new_requests)}, '
|
|
151
197
|
f'succeeded to add new requests: {len(api_response.processed_requests) - len(already_present_requests)}, '
|
|
@@ -292,8 +338,18 @@ class ApifyRequestQueueSharedClient:
|
|
|
292
338
|
# Check _list_head.
|
|
293
339
|
# Without the lock the `is_empty` is prone to falsely report True with some low probability race condition.
|
|
294
340
|
async with self._fetch_lock:
|
|
295
|
-
|
|
296
|
-
|
|
341
|
+
return await self._is_empty()
|
|
342
|
+
|
|
343
|
+
async def is_finished(self) -> bool:
|
|
344
|
+
"""Specific implementation of this method for the RQ shared access mode."""
|
|
345
|
+
async with self._fetch_lock:
|
|
346
|
+
# Order of operations is important here, because `_is_empty()` affects `_queue_has_locked_requests`.
|
|
347
|
+
return await self._is_empty() and not self._queue_has_locked_requests
|
|
348
|
+
|
|
349
|
+
async def _is_empty(self) -> bool:
|
|
350
|
+
"""Check whether anything is available to fetch. Lock-free core of `is_empty`, caller must hold the lock."""
|
|
351
|
+
head = await self._list_head(limit=1)
|
|
352
|
+
return len(head.items) == 0
|
|
297
353
|
|
|
298
354
|
async def _get_metadata_estimate(self) -> RequestQueueMetadata:
|
|
299
355
|
"""Try to get cached metadata first. If multiple clients, fuse with global metadata.
|
{apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_request_queue_single_client.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
from collections import deque
|
|
4
5
|
from datetime import UTC, datetime
|
|
5
6
|
from logging import getLogger
|
|
@@ -9,7 +10,12 @@ from cachetools import LRUCache
|
|
|
9
10
|
|
|
10
11
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
11
12
|
|
|
12
|
-
from ._utils import
|
|
13
|
+
from ._utils import (
|
|
14
|
+
resolve_awaited_in_flight,
|
|
15
|
+
settle_pending_addition,
|
|
16
|
+
to_crawlee_request,
|
|
17
|
+
unique_key_to_request_id,
|
|
18
|
+
)
|
|
13
19
|
|
|
14
20
|
if TYPE_CHECKING:
|
|
15
21
|
from collections.abc import Sequence
|
|
@@ -90,6 +96,19 @@ class ApifyRequestQueueSingleClient:
|
|
|
90
96
|
Tracked locally to accurately determine when the queue is empty for this single consumer.
|
|
91
97
|
"""
|
|
92
98
|
|
|
99
|
+
self._requests_being_added: dict[str, asyncio.Future[bool]] = {}
|
|
100
|
+
"""In-flight `add_batch_of_requests` markers, keyed by request ID.
|
|
101
|
+
|
|
102
|
+
Coordinates only concurrent `add_batch_of_requests` calls sharing this one client instance (e.g. several
|
|
103
|
+
producer coroutines adding requests in the same process). It does not coordinate separate client instances
|
|
104
|
+
or processes, which each keep their own markers; deduplication across clients still relies on the platform.
|
|
105
|
+
|
|
106
|
+
Each future resolves once the platform call that is adding the request settles: `True` if the request was
|
|
107
|
+
committed, `False` otherwise. A concurrent call adding the same request awaits the future instead of
|
|
108
|
+
re-sending it, which avoids a duplicate platform write while still avoiding false success when the original
|
|
109
|
+
add fails.
|
|
110
|
+
"""
|
|
111
|
+
|
|
93
112
|
self._initialized_caches = False
|
|
94
113
|
"""Flag indicating whether local caches have been populated from existing queue contents.
|
|
95
114
|
|
|
@@ -108,8 +127,12 @@ class ApifyRequestQueueSingleClient:
|
|
|
108
127
|
await self._init_caches()
|
|
109
128
|
self._initialized_caches = True
|
|
110
129
|
|
|
130
|
+
loop = asyncio.get_running_loop()
|
|
111
131
|
new_requests: list[Request] = []
|
|
112
132
|
already_present_requests: list[ProcessedRequest] = []
|
|
133
|
+
# Requests a concurrent `add_batch_of_requests` call is already sending. We await its outcome instead of
|
|
134
|
+
# re-sending them, as (request, that call's in-flight future) pairs.
|
|
135
|
+
awaited_in_flight: list[tuple[Request, asyncio.Future[bool]]] = []
|
|
113
136
|
|
|
114
137
|
for request in requests:
|
|
115
138
|
# Calculate id for request
|
|
@@ -135,33 +158,54 @@ class ApifyRequestQueueSingleClient:
|
|
|
135
158
|
was_already_handled=request.was_already_handled,
|
|
136
159
|
)
|
|
137
160
|
)
|
|
161
|
+
# Check if a concurrent call is already adding this request, and await its outcome rather than
|
|
162
|
+
# re-sending it.
|
|
163
|
+
elif request_id in self._requests_being_added:
|
|
164
|
+
awaited_in_flight.append((request, self._requests_being_added[request_id]))
|
|
138
165
|
else:
|
|
139
|
-
# Push the request to the platform. Probably not there, or we are not aware of it
|
|
166
|
+
# Push the request to the platform. Probably not there, or we are not aware of it. Register an
|
|
167
|
+
# in-flight marker so a concurrent call dedupes against it; caching is deferred until the
|
|
168
|
+
# platform confirms the request was accepted (see below).
|
|
140
169
|
new_requests.append(request)
|
|
141
|
-
|
|
142
|
-
# Update local caches
|
|
143
|
-
self._requests_cache[request_id] = request
|
|
144
|
-
if forefront:
|
|
145
|
-
self._head_requests.append(request_id)
|
|
146
|
-
else:
|
|
147
|
-
self._head_requests.appendleft(request_id)
|
|
170
|
+
self._requests_being_added[request_id] = loop.create_future()
|
|
148
171
|
|
|
149
172
|
if new_requests:
|
|
150
173
|
# Prepare requests for API by converting to dictionaries.
|
|
151
174
|
requests_dict = [request.model_dump(by_alias=True) for request in new_requests]
|
|
152
175
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
176
|
+
committed_request_ids: set[str] = set()
|
|
177
|
+
try:
|
|
178
|
+
# Send requests to API.
|
|
179
|
+
batch_response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront)
|
|
180
|
+
batch_response_dict = batch_response.model_dump(by_alias=True)
|
|
181
|
+
api_response = AddRequestsResponse.model_validate(batch_response_dict)
|
|
182
|
+
|
|
183
|
+
# Commit only the requests the platform actually accepted to the local caches. Caching after the
|
|
184
|
+
# call succeeds (not before) keeps a failed call from poisoning the cache and silently
|
|
185
|
+
# deduplicating a later retry of the same request.
|
|
186
|
+
unprocessed_unique_keys = {request.unique_key for request in api_response.unprocessed_requests}
|
|
187
|
+
for request in new_requests:
|
|
188
|
+
if request.unique_key in unprocessed_unique_keys:
|
|
189
|
+
continue
|
|
190
|
+
request_id = unique_key_to_request_id(request.unique_key)
|
|
191
|
+
self._requests_cache[request_id] = request
|
|
192
|
+
if forefront:
|
|
193
|
+
self._head_requests.append(request_id)
|
|
194
|
+
else:
|
|
195
|
+
self._head_requests.appendleft(request_id)
|
|
196
|
+
committed_request_ids.add(request_id)
|
|
197
|
+
|
|
198
|
+
# Add the locally known already present processed requests based on the local cache.
|
|
199
|
+
api_response.processed_requests.extend(already_present_requests)
|
|
200
|
+
finally:
|
|
201
|
+
# Release the in-flight markers we registered. Committed requests tell concurrent callers the
|
|
202
|
+
# request reached the platform; everything else (unprocessed, API error, cancellation) tells them
|
|
203
|
+
# it did not, so they retry instead of reporting false success.
|
|
204
|
+
for request in new_requests:
|
|
205
|
+
request_id = unique_key_to_request_id(request.unique_key)
|
|
206
|
+
settle_pending_addition(
|
|
207
|
+
self._requests_being_added, request_id, committed=request_id in committed_request_ids
|
|
208
|
+
)
|
|
165
209
|
|
|
166
210
|
else:
|
|
167
211
|
api_response = AddRequestsResponse(
|
|
@@ -169,6 +213,9 @@ class ApifyRequestQueueSingleClient:
|
|
|
169
213
|
processed_requests=already_present_requests,
|
|
170
214
|
)
|
|
171
215
|
|
|
216
|
+
# Fold in requests a concurrent call was already adding.
|
|
217
|
+
await resolve_awaited_in_flight(awaited_in_flight, api_response)
|
|
218
|
+
|
|
172
219
|
# Update assumed total count for newly added requests.
|
|
173
220
|
new_request_count = 0
|
|
174
221
|
for processed_request in api_response.processed_requests:
|
|
@@ -277,9 +324,12 @@ class ApifyRequestQueueSingleClient:
|
|
|
277
324
|
|
|
278
325
|
async def is_empty(self) -> bool:
|
|
279
326
|
"""Specific implementation of this method for the RQ single access mode."""
|
|
280
|
-
# Without the lock the `is_empty` is prone to falsely report True with some low probability race condition.
|
|
281
327
|
await self._ensure_head_is_non_empty()
|
|
282
|
-
return not self._head_requests
|
|
328
|
+
return not self._head_requests
|
|
329
|
+
|
|
330
|
+
async def is_finished(self) -> bool:
|
|
331
|
+
"""Specific implementation of this method for the RQ single access mode."""
|
|
332
|
+
return await self.is_empty() and not self._requests_in_progress
|
|
283
333
|
|
|
284
334
|
async def _ensure_head_is_non_empty(self) -> None:
|
|
285
335
|
"""Ensure that the queue head has requests if they are available in the queue."""
|
|
@@ -1,17 +1,22 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import re
|
|
4
5
|
from base64 import b64encode
|
|
5
6
|
from hashlib import sha256
|
|
6
7
|
from typing import TYPE_CHECKING
|
|
7
8
|
|
|
8
9
|
from crawlee._utils.crypto import compute_short_hash
|
|
10
|
+
from crawlee.storage_clients.models import ProcessedRequest, UnprocessedRequest
|
|
9
11
|
|
|
10
12
|
from apify import Request
|
|
11
13
|
|
|
12
14
|
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Iterable
|
|
16
|
+
|
|
13
17
|
from apify_client._models import HeadRequest, LockedHeadRequest
|
|
14
18
|
from apify_client._models import Request as ClientRequest
|
|
19
|
+
from crawlee.storage_clients.models import AddRequestsResponse
|
|
15
20
|
|
|
16
21
|
from apify import Configuration
|
|
17
22
|
|
|
@@ -60,3 +65,48 @@ def to_crawlee_request(client_request: ClientRequest | HeadRequest | LockedHeadR
|
|
|
60
65
|
|
|
61
66
|
# Validate and construct Crawlee Request from the serialized dict
|
|
62
67
|
return Request.model_validate(request_dict)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def settle_pending_addition(
|
|
71
|
+
requests_being_added: dict[str, asyncio.Future[bool]],
|
|
72
|
+
request_id: str,
|
|
73
|
+
*,
|
|
74
|
+
committed: bool,
|
|
75
|
+
) -> None:
|
|
76
|
+
"""Resolve the in-flight add marker for a request, unblocking any concurrent call awaiting it.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
requests_being_added: The client's map of in-flight `add_batch_of_requests` markers.
|
|
80
|
+
request_id: ID of the request whose in-flight add has settled.
|
|
81
|
+
committed: Whether the request was committed to the platform.
|
|
82
|
+
"""
|
|
83
|
+
future = requests_being_added.pop(request_id, None)
|
|
84
|
+
if future is not None and not future.done():
|
|
85
|
+
future.set_result(committed)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
async def resolve_awaited_in_flight(
|
|
89
|
+
awaited_in_flight: Iterable[tuple[Request, asyncio.Future[bool]]],
|
|
90
|
+
api_response: AddRequestsResponse,
|
|
91
|
+
) -> None:
|
|
92
|
+
"""Await concurrent in-flight adds of these requests and fold the outcome into `api_response`.
|
|
93
|
+
|
|
94
|
+
Requests the concurrent add committed are reported as already present; the rest are reported unprocessed
|
|
95
|
+
so the caller retries them rather than receiving false success.
|
|
96
|
+
"""
|
|
97
|
+
for request, future in awaited_in_flight:
|
|
98
|
+
# Shield the shared in-flight marker: cancelling this awaiting caller must not cancel the future, which
|
|
99
|
+
# is owned by the original producer and may have other callers waiting on it.
|
|
100
|
+
if await asyncio.shield(future):
|
|
101
|
+
api_response.processed_requests.append(
|
|
102
|
+
ProcessedRequest(
|
|
103
|
+
id=unique_key_to_request_id(request.unique_key),
|
|
104
|
+
unique_key=request.unique_key,
|
|
105
|
+
was_already_present=True,
|
|
106
|
+
was_already_handled=request.was_already_handled,
|
|
107
|
+
)
|
|
108
|
+
)
|
|
109
|
+
else:
|
|
110
|
+
api_response.unprocessed_requests.append(
|
|
111
|
+
UnprocessedRequest(unique_key=request.unique_key, url=request.url, method=request.method)
|
|
112
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_apify/_key_value_store_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-3.4.2b28 → apify-3.4.2b30}/src/apify/storage_clients/_file_system/_key_value_store_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|