apify 1.4.0b1__tar.gz → 1.4.1__tar.gz
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
Potentially problematic release: this version of apify might be problematic.
- {apify-1.4.0b1 → apify-1.4.1}/PKG-INFO +6 -6
- {apify-1.4.0b1 → apify-1.4.1}/pyproject.toml +10 -6
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/base_resource_client.py +8 -6
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/dataset.py +17 -17
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/key_value_store.py +22 -22
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/request_queue.py +32 -32
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/scrapy/middlewares.py +27 -20
- {apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/PKG-INFO +6 -6
- {apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/requires.txt +5 -5
- {apify-1.4.0b1 → apify-1.4.1}/LICENSE +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/README.md +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/setup.cfg +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/__init__.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_crypto.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/__init__.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/file_storage_utils.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/memory_storage_client.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/__init__.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/dataset_collection.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/request_queue_collection.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/_utils.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/actor.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/config.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/consts.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/event_manager.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/log.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/proxy_configuration.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/py.typed +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/scrapy/__init__.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/scrapy/pipelines.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/scrapy/scheduler.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/scrapy/utils.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/storages/__init__.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/storages/base_storage.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/storages/dataset.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/storages/key_value_store.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/storages/request_queue.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify/storages/storage_client_manager.py +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/SOURCES.txt +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/dependency_links.txt +0 -0
- {apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/top_level.txt +0 -0
{apify-1.4.0b1 → apify-1.4.1}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.4.0b1
+Version: 1.4.1
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
@@ -24,10 +24,10 @@ Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: aiofiles>=22.1.0
-Requires-Dist: aioshutil>=1.0
 Requires-Dist: apify-client~=1.6.0
 Requires-Dist: apify-shared~=1.1.0
+Requires-Dist: aiofiles>=22.1.0
+Requires-Dist: aioshutil>=1.0
 Requires-Dist: colorama>=0.4.6
 Requires-Dist: cryptography>=39.0.0
 Requires-Dist: httpx>=0.24.1
@@ -51,10 +51,10 @@ Requires-Dist: respx~=0.20.1; extra == "dev"
 Requires-Dist: ruff~=0.1.6; extra == "dev"
 Requires-Dist: twine~=4.0.2; extra == "dev"
 Requires-Dist: types-aiofiles~=23.2.0.0; extra == "dev"
-Requires-Dist: types-colorama~=0.4.15.
-Requires-Dist: types-psutil~=5.9.5.
+Requires-Dist: types-colorama~=0.4.15.12; extra == "dev"
+Requires-Dist: types-psutil~=5.9.5.17; extra == "dev"
 Provides-Extra: scrapy
-Requires-Dist: scrapy
+Requires-Dist: scrapy>=2.11.0; extra == "scrapy"
 
 # Apify SDK for Python
 
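The visible metadata changes are the reordered base dependencies, the completed `dev` type-stub pins, and the `scrapy` extra gaining a lower bound plus an explicit environment marker. As a sketch of how that last line behaves, using the third-party `packaging` library (an assumption for illustration, not something this diff depends on):

```python
from packaging.requirements import Requirement

# The new metadata line, as it appears in PKG-INFO above.
req = Requirement('scrapy>=2.11.0; extra == "scrapy"')

print(req.name)                          # scrapy
print(req.specifier.contains('2.11.0'))  # True
print(req.specifier.contains('2.10.1'))  # False

# The marker limits the dependency to installs that request the extra,
# i.e. `pip install apify[scrapy]`.
print(req.marker.evaluate({'extra': 'scrapy'}))  # True
print(req.marker.evaluate({'extra': ''}))        # False
```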
{apify-1.4.0b1 → apify-1.4.1}/pyproject.toml RENAMED

@@ -1,6 +1,6 @@
 [project]
 name = "apify"
-version = "1.4.0b1"
+version = "1.4.1"
 description = "Apify SDK for Python"
 readme = "README.md"
 license = { text = "Apache Software License" }
@@ -21,11 +21,15 @@ classifiers = [
 ]
 
 requires-python = ">=3.8"
+
+# We use inclusive ordered comparison clause for non-Apify packages intentionally in order to enhance the Apify SDK's
+# compatibility with a wide range of external packages. This decision was discussed in detail in the following PR:
+# https://github.com/apify/apify-sdk-python/pull/154
 dependencies = [
-    "aiofiles >= 22.1.0",
-    "aioshutil >= 1.0",
     "apify-client ~= 1.6.0",
     "apify-shared ~= 1.1.0",
+    "aiofiles >= 22.1.0",
+    "aioshutil >= 1.0",
     "colorama >= 0.4.6",
     "cryptography >= 39.0.0",
     "httpx >= 0.24.1",
@@ -52,11 +56,11 @@ dev = [
     "ruff ~= 0.1.6",
     "twine ~= 4.0.2",
     "types-aiofiles ~= 23.2.0.0",
-    "types-colorama ~= 0.4.15.
-    "types-psutil ~= 5.9.5.
+    "types-colorama ~= 0.4.15.12",
+    "types-psutil ~= 5.9.5.17",
 ]
 scrapy = [
-    "scrapy",
+    "scrapy >= 2.11.0",
 ]
 
 [project.urls]
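The new comment draws a line between compatible-release (`~=`) pins, which are kept for Apify's own packages, and inclusive ordered (`>=`) comparisons for external ones. A minimal sketch of the practical difference, again using the `packaging` library purely for illustration:

```python
from packaging.specifiers import SpecifierSet

compatible = SpecifierSet('~=1.6.0')  # roughly: >= 1.6.0, == 1.6.*
ordered = SpecifierSet('>=22.1.0')    # any later version is acceptable

print(compatible.contains('1.6.2'))  # True  - patch releases stay in range
print(compatible.contains('1.7.0'))  # False - a minor bump falls out of range
print(ordered.contains('23.0.0'))    # True  - the inclusive ordered clause
```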
{apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/base_resource_client.py RENAMED

@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING
 from apify_shared.utils import ignore_docs
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from ..memory_storage_client import MemoryStorageClient
 
 
@@ -48,9 +50,9 @@ class BaseResourceClient(ABC):
     @classmethod
     @abstractmethod
     def _get_storage_client_cache(
-        cls: type
+        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
         memory_storage_client: MemoryStorageClient,
-    ) -> list[
+    ) -> list[Self]:
         raise NotImplementedError('You must override this method in the subclass!')
 
     @abstractmethod
@@ -60,21 +62,21 @@ class BaseResourceClient(ABC):
     @classmethod
     @abstractmethod
     def _create_from_directory(
-        cls: type
+        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
         storage_directory: str,
         memory_storage_client: MemoryStorageClient,
         id: str | None = None,  # noqa: A002
         name: str | None = None,
-    ) ->
+    ) -> Self:
         raise NotImplementedError('You must override this method in the subclass!')
 
     @classmethod
     def _find_or_create_client_by_id_or_name(
-        cls: type
+        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
         memory_storage_client: MemoryStorageClient,
         id: str | None = None,  # noqa: A002
         name: str | None = None,
-    ) ->
+    ) -> Self | None:
         assert id is not None or name is not None  # noqa: S101
 
         storage_client_cache = cls._get_storage_client_cache(memory_storage_client)
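This change drops the explicit `cls: type[...]` annotations in favor of a bare `cls` and a `typing_extensions.Self` return type, so each subclass's inherited classmethods are typed as returning that subclass rather than the base class. A minimal sketch of the pattern; the class names here are illustrative, not the SDK's:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from typing_extensions import Self


class Storage:
    """Base class whose factory classmethod returns Self."""

    @classmethod
    def create(cls) -> Self:
        # Annotating `cls` explicitly (e.g. `cls: type[Storage]`) would pin
        # the return type to the base class; a bare `cls` lets `Self` bind to
        # whichever subclass the method is called on.
        return cls()


class Dataset(Storage):
    pass


dataset = Dataset.create()     # type checkers infer Dataset, not Storage
print(type(dataset).__name__)  # Dataset
```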
{apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/dataset.py RENAMED

@@ -74,8 +74,8 @@ class DatasetClient(BaseResourceClient):
         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
 
         if found:
-            async with found._file_operation_lock:
-                await found._update_timestamps(has_been_modified=False)
+            async with found._file_operation_lock:
+                await found._update_timestamps(has_been_modified=False)
             return found._to_resource_info()
 
         return None
@@ -103,7 +103,7 @@ class DatasetClient(BaseResourceClient):
         if name is None:
             return existing_dataset_by_id._to_resource_info()
 
-        async with existing_dataset_by_id._file_operation_lock:
+        async with existing_dataset_by_id._file_operation_lock:
             # Check that name is not in use already
             existing_dataset_by_name = next(
                 (dataset for dataset in self._memory_storage_client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()),
@@ -122,7 +122,7 @@ class DatasetClient(BaseResourceClient):
             await force_rename(previous_dir, existing_dataset_by_id._resource_directory)
 
             # Update timestamps
-            await existing_dataset_by_id._update_timestamps(has_been_modified=True)
+            await existing_dataset_by_id._update_timestamps(has_been_modified=True)
 
         return existing_dataset_by_id._to_resource_info()
 
@@ -193,9 +193,9 @@ class DatasetClient(BaseResourceClient):
         if existing_dataset_by_id is None:
             raise_on_non_existing_storage(StorageTypes.DATASET, self._id)
 
-        async with existing_dataset_by_id._file_operation_lock:
-            start, end = existing_dataset_by_id._get_start_and_end_indexes(
-                max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0,
+        async with existing_dataset_by_id._file_operation_lock:
+            start, end = existing_dataset_by_id._get_start_and_end_indexes(
+                max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0,
                 limit,
             )
 
@@ -203,9 +203,9 @@ class DatasetClient(BaseResourceClient):
 
             for idx in range(start, end):
                 entry_number = self._generate_local_entry_name(idx)
-                items.append(existing_dataset_by_id._dataset_entries[entry_number])
+                items.append(existing_dataset_by_id._dataset_entries[entry_number])
 
-            await existing_dataset_by_id._update_timestamps(has_been_modified=False)
+            await existing_dataset_by_id._update_timestamps(has_been_modified=False)
 
             if desc:
                 items.reverse()
@@ -217,7 +217,7 @@ class DatasetClient(BaseResourceClient):
                 'items': items,
                 'limit': limit or LIST_ITEMS_LIMIT,
                 'offset': offset or 0,
-                'total': existing_dataset_by_id._item_count,
+                'total': existing_dataset_by_id._item_count,
             }
         )
 
@@ -308,16 +308,16 @@ class DatasetClient(BaseResourceClient):
 
         added_ids: list[str] = []
         for entry in normalized:
-            existing_dataset_by_id._item_count += 1
-            idx = self._generate_local_entry_name(existing_dataset_by_id._item_count)
+            existing_dataset_by_id._item_count += 1
+            idx = self._generate_local_entry_name(existing_dataset_by_id._item_count)
 
-            existing_dataset_by_id._dataset_entries[idx] = entry
+            existing_dataset_by_id._dataset_entries[idx] = entry
             added_ids.append(idx)
 
-        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]
+        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]  # noqa: A001
 
-        async with existing_dataset_by_id._file_operation_lock:
-            await existing_dataset_by_id._update_timestamps(has_been_modified=True)
+        async with existing_dataset_by_id._file_operation_lock:
+            await existing_dataset_by_id._update_timestamps(has_been_modified=True)
 
             await _update_dataset_items(
                 data=data_entries,
@@ -385,7 +385,7 @@ class DatasetClient(BaseResourceClient):
         return memory_storage_client._datasets_directory
 
     @classmethod
-    def _get_storage_client_cache(
+    def _get_storage_client_cache(
         cls: type[DatasetClient],
         memory_storage_client: MemoryStorageClient,
     ) -> list[DatasetClient]:
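Most of these hunks differ only in details the diff viewer highlighted inside the lines; the one clearly visible textual change is the `# noqa: A001` appended to the comprehension over `added_ids`. That suppresses the flake8-builtins warning about rebinding the builtin `id`, roughly as in this small sketch (the data is made up for illustration):

```python
# flake8-builtins rule A001 flags names that shadow a Python builtin.
# The comprehension below rebinds `id`, so the line carries `# noqa: A001`
# to keep the original variable name while passing the linter.
entries = {'000000001': {'value': 1}, '000000002': {'value': 2}}
added_ids = list(entries)

data_entries = [(id, entries[id]) for id in added_ids]  # noqa: A001
print(data_entries)
```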
{apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/key_value_store.py RENAMED

@@ -100,8 +100,8 @@ class KeyValueStoreClient(BaseResourceClient):
         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
 
         if found:
-            async with found._file_operation_lock:
-                await found._update_timestamps(has_been_modified=False)
+            async with found._file_operation_lock:
+                await found._update_timestamps(has_been_modified=False)
             return found._to_resource_info()
 
         return None
@@ -127,7 +127,7 @@ class KeyValueStoreClient(BaseResourceClient):
         if name is None:
             return existing_store_by_id._to_resource_info()
 
-        async with existing_store_by_id._file_operation_lock:
+        async with existing_store_by_id._file_operation_lock:
             # Check that name is not in use already
             existing_store_by_name = next(
                 (store for store in self._memory_storage_client._key_value_stores_handled if store._name and store._name.lower() == name.lower()),
@@ -146,7 +146,7 @@ class KeyValueStoreClient(BaseResourceClient):
             await force_rename(previous_dir, existing_store_by_id._resource_directory)
 
             # Update timestamps
-            await existing_store_by_id._update_timestamps(has_been_modified=True)
+            await existing_store_by_id._update_timestamps(has_been_modified=True)
 
         return existing_store_by_id._to_resource_info()
 
@@ -187,7 +187,7 @@ class KeyValueStoreClient(BaseResourceClient):
 
         items = []
 
-        for record in existing_store_by_id._records.values():
+        for record in existing_store_by_id._records.values():
             size = len(record['value'])
             items.append(
                 {
@@ -222,8 +222,8 @@ class KeyValueStoreClient(BaseResourceClient):
         is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item
         next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item['key']
 
-        async with existing_store_by_id._file_operation_lock:
-            await existing_store_by_id._update_timestamps(has_been_modified=False)
+        async with existing_store_by_id._file_operation_lock:
+            await existing_store_by_id._update_timestamps(has_been_modified=False)
 
         return {
             'count': len(items),
@@ -247,7 +247,7 @@ class KeyValueStoreClient(BaseResourceClient):
         if existing_store_by_id is None:
             raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id)
 
-        stored_record = existing_store_by_id._records.get(key)
+        stored_record = existing_store_by_id._records.get(key)
 
         if stored_record is None:
             return None
@@ -264,8 +264,8 @@ class KeyValueStoreClient(BaseResourceClient):
         except ValueError:
             logger.exception('Error parsing key-value store record')
 
-        async with existing_store_by_id._file_operation_lock:
-            await existing_store_by_id._update_timestamps(has_been_modified=False)
+        async with existing_store_by_id._file_operation_lock:
+            await existing_store_by_id._update_timestamps(has_been_modified=False)
 
         return record
 
@@ -324,22 +324,22 @@ class KeyValueStoreClient(BaseResourceClient):
         if 'application/json' in content_type and not is_file_or_bytes(value) and not isinstance(value, str):
             value = json_dumps(value).encode('utf-8')
 
-        async with existing_store_by_id._file_operation_lock:
-            await existing_store_by_id._update_timestamps(has_been_modified=True)
+        async with existing_store_by_id._file_operation_lock:
+            await existing_store_by_id._update_timestamps(has_been_modified=True)
             record: KeyValueStoreRecord = {
                 'key': key,
                 'value': value,
                 'contentType': content_type,
             }
 
-            old_record = existing_store_by_id._records.get(key)
-            existing_store_by_id._records[key] = record
+            old_record = existing_store_by_id._records.get(key)
+            existing_store_by_id._records[key] = record
 
             if self._memory_storage_client._persist_storage:
                 if old_record is not None and _filename_from_record(old_record) != _filename_from_record(record):
-                    await existing_store_by_id._delete_persisted_record(old_record)
+                    await existing_store_by_id._delete_persisted_record(old_record)
 
-                await existing_store_by_id._persist_record(record)
+                await existing_store_by_id._persist_record(record)
 
     async def _persist_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None:
         store_directory = self._resource_directory
@@ -385,14 +385,14 @@ class KeyValueStoreClient(BaseResourceClient):
         if existing_store_by_id is None:
             raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id)
 
-        record = existing_store_by_id._records.get(key)
+        record = existing_store_by_id._records.get(key)
 
         if record is not None:
-            async with existing_store_by_id._file_operation_lock:
-                del existing_store_by_id._records[key]
-                await existing_store_by_id._update_timestamps(has_been_modified=True)
+            async with existing_store_by_id._file_operation_lock:
+                del existing_store_by_id._records[key]
+                await existing_store_by_id._update_timestamps(has_been_modified=True)
                 if self._memory_storage_client._persist_storage:
-                    await existing_store_by_id._delete_persisted_record(record)
+                    await existing_store_by_id._delete_persisted_record(record)
 
     async def _delete_persisted_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None:
         store_directory = self._resource_directory
@@ -437,7 +437,7 @@ class KeyValueStoreClient(BaseResourceClient):
         return memory_storage_client._key_value_stores_directory
 
     @classmethod
-    def _get_storage_client_cache(
+    def _get_storage_client_cache(
         cls: type[KeyValueStoreClient],
         memory_storage_client: MemoryStorageClient,
     ) -> list[KeyValueStoreClient]:
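Every touched line here sits inside (or alongside) an `async with ..._file_operation_lock:` block, an `asyncio.Lock` that serializes record mutations with the timestamp bookkeeping. A self-contained sketch of the pattern, with a hypothetical `RecordStore` standing in for the client:

```python
import asyncio
from datetime import datetime, timezone


class RecordStore:
    """Illustrative stand-in for the key-value store client; not the SDK's class."""

    def __init__(self) -> None:
        self._records: dict[str, bytes] = {}
        self._file_operation_lock = asyncio.Lock()
        self._modified_at = datetime.now(timezone.utc)

    async def set_record(self, key: str, value: bytes) -> None:
        # Serialize mutations so concurrent writers cannot interleave the
        # in-memory update with the modified-at timestamp bump.
        async with self._file_operation_lock:
            self._records[key] = value
            self._modified_at = datetime.now(timezone.utc)


async def main() -> None:
    store = RecordStore()
    await asyncio.gather(*(store.set_record(f'key-{i}', b'value') for i in range(5)))
    print(len(store._records), store._modified_at.isoformat())


asyncio.run(main())
```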
{apify-1.4.0b1 → apify-1.4.1}/src/apify/_memory_storage/resource_clients/request_queue.py RENAMED

@@ -67,8 +67,8 @@ class RequestQueueClient(BaseResourceClient):
         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
 
         if found:
-            async with found._file_operation_lock:
-                await found._update_timestamps(has_been_modified=False)
+            async with found._file_operation_lock:
+                await found._update_timestamps(has_been_modified=False)
             return found._to_resource_info()
 
         return None
@@ -94,7 +94,7 @@ class RequestQueueClient(BaseResourceClient):
         if name is None:
             return existing_queue_by_id._to_resource_info()
 
-        async with existing_queue_by_id._file_operation_lock:
+        async with existing_queue_by_id._file_operation_lock:
             # Check that name is not in use already
             existing_queue_by_name = next(
                 (queue for queue in self._memory_storage_client._request_queues_handled if queue._name and queue._name.lower() == name.lower()), None
@@ -112,7 +112,7 @@ class RequestQueueClient(BaseResourceClient):
             await force_rename(previous_dir, existing_queue_by_id._resource_directory)
 
             # Update timestamps
-            await existing_queue_by_id._update_timestamps(has_been_modified=True)
+            await existing_queue_by_id._update_timestamps(has_been_modified=True)
 
         return existing_queue_by_id._to_resource_info()
 
@@ -146,18 +146,18 @@ class RequestQueueClient(BaseResourceClient):
         if existing_queue_by_id is None:
             raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)
 
-        async with existing_queue_by_id._file_operation_lock:
-            await existing_queue_by_id._update_timestamps(has_been_modified=False)
+        async with existing_queue_by_id._file_operation_lock:
+            await existing_queue_by_id._update_timestamps(has_been_modified=False)
 
             items: list[dict] = []
 
             # Iterate all requests in the queue which have sorted key larger than infinity, which means `orderNo` is not `None`
             # This will iterate them in order of `orderNo`
-            for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)):
+            for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)):
                 if len(items) == limit:
                     break
 
-                request = existing_queue_by_id._requests.get(request_key)
+                request = existing_queue_by_id._requests.get(request_key)
 
                 # Check that the request still exists and was not handled,
                 # in case something deleted it or marked it as handled concurrenctly
@@ -167,7 +167,7 @@ class RequestQueueClient(BaseResourceClient):
             return {
                 'limit': limit,
                 'hadMultipleClients': False,
-                'queueModifiedAt': existing_queue_by_id._modified_at,
+                'queueModifiedAt': existing_queue_by_id._modified_at,
                 'items': [self._json_to_request(item['json']) for item in items],
             }
 
@@ -190,12 +190,12 @@ class RequestQueueClient(BaseResourceClient):
 
         request_model = self._create_internal_request(request, forefront)
 
-        async with existing_queue_by_id._file_operation_lock:
-            existing_request_with_id = existing_queue_by_id._requests.get(request_model['id'])
+        async with existing_queue_by_id._file_operation_lock:
+            existing_request_with_id = existing_queue_by_id._requests.get(request_model['id'])
 
             # We already have the request present, so we return information about it
             if existing_request_with_id is not None:
-                await existing_queue_by_id._update_timestamps(has_been_modified=False)
+                await existing_queue_by_id._update_timestamps(has_been_modified=False)
 
                 return {
                     'requestId': existing_request_with_id['id'],
@@ -203,12 +203,12 @@ class RequestQueueClient(BaseResourceClient):
                     'wasAlreadyPresent': True,
                 }
 
-            existing_queue_by_id._requests[request_model['id']] = request_model
+            existing_queue_by_id._requests[request_model['id']] = request_model
             if request_model['orderNo'] is None:
-                existing_queue_by_id._handled_request_count += 1
+                existing_queue_by_id._handled_request_count += 1
             else:
-                existing_queue_by_id._pending_request_count += 1
-            await existing_queue_by_id._update_timestamps(has_been_modified=True)
+                existing_queue_by_id._pending_request_count += 1
+            await existing_queue_by_id._update_timestamps(has_been_modified=True)
             await update_request_queue_item(
                 request=request_model,
                 request_id=request_model['id'],
@@ -240,10 +240,10 @@ class RequestQueueClient(BaseResourceClient):
         if existing_queue_by_id is None:
             raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)
 
-        async with existing_queue_by_id._file_operation_lock:
-            await existing_queue_by_id._update_timestamps(has_been_modified=False)
+        async with existing_queue_by_id._file_operation_lock:
+            await existing_queue_by_id._update_timestamps(has_been_modified=False)
 
-            request = existing_queue_by_id._requests.get(request_id)
+            request = existing_queue_by_id._requests.get(request_id)
             return self._json_to_request(request['json'] if request is not None else None)
 
     async def update_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict:
@@ -268,17 +268,17 @@ class RequestQueueClient(BaseResourceClient):
         # First we need to check the existing request to be
         # able to return information about its handled state.
 
-        existing_request = existing_queue_by_id._requests.get(request_model['id'])
+        existing_request = existing_queue_by_id._requests.get(request_model['id'])
 
         # Undefined means that the request is not present in the queue.
         # We need to insert it, to behave the same as API.
         if existing_request is None:
            return await self.add_request(request, forefront=forefront)
 
-        async with existing_queue_by_id._file_operation_lock:
+        async with existing_queue_by_id._file_operation_lock:
             # When updating the request, we need to make sure that
             # the handled counts are updated correctly in all cases.
-            existing_queue_by_id._requests[request_model['id']] = request_model
+            existing_queue_by_id._requests[request_model['id']] = request_model
 
             pending_count_adjustment = 0
             is_request_handled_state_changing = not isinstance(existing_request['orderNo'], type(request_model['orderNo']))
@@ -288,9 +288,9 @@ class RequestQueueClient(BaseResourceClient):
             if is_request_handled_state_changing:
                 pending_count_adjustment = 1 if request_was_handled_before_update else -1
 
-            existing_queue_by_id._pending_request_count += pending_count_adjustment
-            existing_queue_by_id._handled_request_count -= pending_count_adjustment
-            await existing_queue_by_id._update_timestamps(has_been_modified=True)
+            existing_queue_by_id._pending_request_count += pending_count_adjustment
+            existing_queue_by_id._handled_request_count -= pending_count_adjustment
+            await existing_queue_by_id._update_timestamps(has_been_modified=True)
             await update_request_queue_item(
                 request=request_model,
                 request_id=request_model['id'],
@@ -317,16 +317,16 @@ class RequestQueueClient(BaseResourceClient):
         if existing_queue_by_id is None:
             raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)
 
-        async with existing_queue_by_id._file_operation_lock:
-            request = existing_queue_by_id._requests.get(request_id)
+        async with existing_queue_by_id._file_operation_lock:
+            request = existing_queue_by_id._requests.get(request_id)
 
             if request:
-                del existing_queue_by_id._requests[request_id]
+                del existing_queue_by_id._requests[request_id]
                 if request['orderNo'] is None:
-                    existing_queue_by_id._handled_request_count -= 1
+                    existing_queue_by_id._handled_request_count -= 1
                 else:
-                    existing_queue_by_id._pending_request_count -= 1
-                    await existing_queue_by_id._update_timestamps(has_been_modified=True)
+                    existing_queue_by_id._pending_request_count -= 1
+                await existing_queue_by_id._update_timestamps(has_been_modified=True)
                 await delete_request(entity_directory=existing_queue_by_id._resource_directory, request_id=request_id)
 
     def _to_resource_info(self: RequestQueueClient) -> dict:
@@ -403,7 +403,7 @@ class RequestQueueClient(BaseResourceClient):
         return memory_storage_client._request_queues_directory
 
     @classmethod
-    def _get_storage_client_cache(
+    def _get_storage_client_cache(
         cls: type[RequestQueueClient],
         memory_storage_client: MemoryStorageClient,
     ) -> list[RequestQueueClient]:
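The `irange_key(min_key=-float('inf'), inclusive=(False, True))` call in the head-listing hunk relies on a sort key that maps a `None` `orderNo` (a handled request) to negative infinity, so excluding the lower bound skips handled requests and yields the rest in `orderNo` order. A sketch of that trick with `sortedcontainers.SortedKeyList`; the SDK's actual container type is not shown in this diff, so treat the choice as an assumption:

```python
from math import inf

from sortedcontainers import SortedKeyList

# Requests sorted by orderNo; a None orderNo (handled request) maps to -inf.
requests = SortedKeyList(
    [
        {'id': 'a', 'orderNo': None},  # handled
        {'id': 'b', 'orderNo': 2.0},
        {'id': 'c', 'orderNo': 1.0},
    ],
    key=lambda req: -inf if req['orderNo'] is None else req['orderNo'],
)

# inclusive=(False, True) excludes the -inf key itself, i.e. every request
# whose orderNo is None, and iterates the remainder in orderNo order.
for req in requests.irange_key(min_key=-inf, inclusive=(False, True)):
    print(req['id'])  # prints 'c', then 'b'
```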
{apify-1.4.0b1 → apify-1.4.1}/src/apify/scrapy/middlewares.py RENAMED

@@ -4,10 +4,7 @@ import traceback
 from typing import TYPE_CHECKING, Any
 
 try:
-    from scrapy import Spider  # noqa: TCH002
     from scrapy.downloadermiddlewares.retry import RetryMiddleware
-    from scrapy.exceptions import IgnoreRequest
-    from scrapy.http import Request, Response  # noqa: TCH002
     from scrapy.utils.response import response_status_message
 except ImportError as exc:
     raise ImportError(
@@ -18,6 +15,9 @@ from ..actor import Actor
 from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
 
 if TYPE_CHECKING:
+    from scrapy import Spider
+    from scrapy.http import Request, Response
+
     from ..storages import RequestQueue
 
 
@@ -33,11 +33,6 @@ class ApifyRetryMiddleware(RetryMiddleware):
             traceback.print_exc()
             raise
 
-    def __del__(self: ApifyRetryMiddleware) -> None:
-        """Before deleting the instance, close the nested event loop."""
-        nested_event_loop.stop()
-        nested_event_loop.close()
-
     def process_response(
         self: ApifyRetryMiddleware,
         request: Request,
@@ -54,9 +49,11 @@
         Returns:
             The response, or a new request if the request should be retried.
         """
+        if not isinstance(request.url, str):
+            raise TypeError(f'Expected request.url to be a string, got {type(request.url)} instead.')
+
         # Robots requests are bypassed directly, they don't go through a Scrapy Scheduler, and also through our
         # Request Queue. Check the scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
-        assert isinstance(request.url, str)  # noqa: S101
         if request.url.endswith('robots.txt'):
             return response
 
@@ -72,20 +69,30 @@
         exception: BaseException,
         spider: Spider,
     ) -> Request | Response | None:
-        """Handle the exception and decide whether the request should be retried.
-
+        """Handle the exception and decide whether the request should be retried.
+
+        Args:
+            request: The request that encountered an exception.
+            exception: The exception that occurred.
+            spider: The Spider that sent the request.
+
+        Returns:
+            None: The request will not be retried.
+        """
+        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (request={request}, exception={exception})...')
         apify_request = to_apify_request(request, spider=spider)
 
-
-
-
-
-
-
-
-
+        # Unlike the default Scrapy RetryMiddleware, we do not attempt to retry requests on exception.
+        # It was causing issues with the Apify request queue, because the request was not marked as handled and was
+        # stucked in the request queue forever - Scrapy crawling never finished. The solution would be to completely
+        # rewrite the retry logic from default RetryMiddleware.
+        try:
+            nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
+        except BaseException:
+            traceback.print_exc()
+            raise
 
-        return
+        return None
 
     async def _handle_retry_logic(
         self: ApifyRetryMiddleware,
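Moving `Spider`, `Request`, and `Response` out of the `try` block into `if TYPE_CHECKING:` means they are imported for type annotations only, so the module can be imported even when those names are unavailable at runtime. A minimal sketch of why that works (assuming postponed annotation evaluation, which the `X | Y` unions under Python 3.8 already imply):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers; at runtime these imports never run,
    # so a missing optional dependency cannot break importing this module.
    from scrapy import Spider
    from scrapy.http import Request, Response


def process_response(request: Request, response: Response, spider: Spider) -> Response:
    """Annotations stay as strings at runtime, so Scrapy need not be installed."""
    return response


print(process_response.__annotations__)  # {'request': 'Request', ...}
```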
{apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.4.0b1
+Version: 1.4.1
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
@@ -24,10 +24,10 @@ Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: aiofiles>=22.1.0
-Requires-Dist: aioshutil>=1.0
 Requires-Dist: apify-client~=1.6.0
 Requires-Dist: apify-shared~=1.1.0
+Requires-Dist: aiofiles>=22.1.0
+Requires-Dist: aioshutil>=1.0
 Requires-Dist: colorama>=0.4.6
 Requires-Dist: cryptography>=39.0.0
 Requires-Dist: httpx>=0.24.1
@@ -51,10 +51,10 @@ Requires-Dist: respx~=0.20.1; extra == "dev"
 Requires-Dist: ruff~=0.1.6; extra == "dev"
 Requires-Dist: twine~=4.0.2; extra == "dev"
 Requires-Dist: types-aiofiles~=23.2.0.0; extra == "dev"
-Requires-Dist: types-colorama~=0.4.15.
-Requires-Dist: types-psutil~=5.9.5.
+Requires-Dist: types-colorama~=0.4.15.12; extra == "dev"
+Requires-Dist: types-psutil~=5.9.5.17; extra == "dev"
 Provides-Extra: scrapy
-Requires-Dist: scrapy
+Requires-Dist: scrapy>=2.11.0; extra == "scrapy"
 
 # Apify SDK for Python
 
{apify-1.4.0b1 → apify-1.4.1}/src/apify.egg-info/requires.txt RENAMED

@@ -1,7 +1,7 @@
-aiofiles>=22.1.0
-aioshutil>=1.0
 apify-client~=1.6.0
 apify-shared~=1.1.0
+aiofiles>=22.1.0
+aioshutil>=1.0
 colorama>=0.4.6
 cryptography>=39.0.0
 httpx>=0.24.1
@@ -26,8 +26,8 @@ respx~=0.20.1
 ruff~=0.1.6
 twine~=4.0.2
 types-aiofiles~=23.2.0.0
-types-colorama~=0.4.15.
-types-psutil~=5.9.5.
+types-colorama~=0.4.15.12
+types-psutil~=5.9.5.17
 
 [scrapy]
-scrapy
+scrapy>=2.11.0
All remaining files (listed above with +0 -0) are unchanged between the two versions.