apify 1.4.0__tar.gz → 1.4.1a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.
Files changed (43)
  1. {apify-1.4.0 → apify-1.4.1a1}/PKG-INFO +6 -6
  2. {apify-1.4.0 → apify-1.4.1a1}/pyproject.toml +10 -6
  3. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/base_resource_client.py +8 -6
  4. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/dataset.py +17 -17
  5. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/key_value_store.py +22 -22
  6. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/request_queue.py +32 -32
  7. {apify-1.4.0 → apify-1.4.1a1}/src/apify/scrapy/middlewares.py +13 -19
  8. {apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/PKG-INFO +6 -6
  9. {apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/requires.txt +5 -5
  10. {apify-1.4.0 → apify-1.4.1a1}/LICENSE +0 -0
  11. {apify-1.4.0 → apify-1.4.1a1}/README.md +0 -0
  12. {apify-1.4.0 → apify-1.4.1a1}/setup.cfg +0 -0
  13. {apify-1.4.0 → apify-1.4.1a1}/src/apify/__init__.py +0 -0
  14. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_crypto.py +0 -0
  15. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/__init__.py +0 -0
  16. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/file_storage_utils.py +0 -0
  17. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/memory_storage_client.py +0 -0
  18. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/__init__.py +0 -0
  19. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -0
  20. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/dataset_collection.py +0 -0
  21. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -0
  22. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/request_queue_collection.py +0 -0
  23. {apify-1.4.0 → apify-1.4.1a1}/src/apify/_utils.py +0 -0
  24. {apify-1.4.0 → apify-1.4.1a1}/src/apify/actor.py +0 -0
  25. {apify-1.4.0 → apify-1.4.1a1}/src/apify/config.py +0 -0
  26. {apify-1.4.0 → apify-1.4.1a1}/src/apify/consts.py +0 -0
  27. {apify-1.4.0 → apify-1.4.1a1}/src/apify/event_manager.py +0 -0
  28. {apify-1.4.0 → apify-1.4.1a1}/src/apify/log.py +0 -0
  29. {apify-1.4.0 → apify-1.4.1a1}/src/apify/proxy_configuration.py +0 -0
  30. {apify-1.4.0 → apify-1.4.1a1}/src/apify/py.typed +0 -0
  31. {apify-1.4.0 → apify-1.4.1a1}/src/apify/scrapy/__init__.py +0 -0
  32. {apify-1.4.0 → apify-1.4.1a1}/src/apify/scrapy/pipelines.py +0 -0
  33. {apify-1.4.0 → apify-1.4.1a1}/src/apify/scrapy/scheduler.py +0 -0
  34. {apify-1.4.0 → apify-1.4.1a1}/src/apify/scrapy/utils.py +0 -0
  35. {apify-1.4.0 → apify-1.4.1a1}/src/apify/storages/__init__.py +0 -0
  36. {apify-1.4.0 → apify-1.4.1a1}/src/apify/storages/base_storage.py +0 -0
  37. {apify-1.4.0 → apify-1.4.1a1}/src/apify/storages/dataset.py +0 -0
  38. {apify-1.4.0 → apify-1.4.1a1}/src/apify/storages/key_value_store.py +0 -0
  39. {apify-1.4.0 → apify-1.4.1a1}/src/apify/storages/request_queue.py +0 -0
  40. {apify-1.4.0 → apify-1.4.1a1}/src/apify/storages/storage_client_manager.py +0 -0
  41. {apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/SOURCES.txt +0 -0
  42. {apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/dependency_links.txt +0 -0
  43. {apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/top_level.txt +0 -0
{apify-1.4.0 → apify-1.4.1a1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.4.0
+Version: 1.4.1a1
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
@@ -24,10 +24,10 @@ Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: aiofiles>=22.1.0
-Requires-Dist: aioshutil>=1.0
 Requires-Dist: apify-client~=1.6.0
 Requires-Dist: apify-shared~=1.1.0
+Requires-Dist: aiofiles>=22.1.0
+Requires-Dist: aioshutil>=1.0
 Requires-Dist: colorama>=0.4.6
 Requires-Dist: cryptography>=39.0.0
 Requires-Dist: httpx>=0.24.1
@@ -51,10 +51,10 @@ Requires-Dist: respx~=0.20.1; extra == "dev"
 Requires-Dist: ruff~=0.1.6; extra == "dev"
 Requires-Dist: twine~=4.0.2; extra == "dev"
 Requires-Dist: types-aiofiles~=23.2.0.0; extra == "dev"
-Requires-Dist: types-colorama~=0.4.15.11; extra == "dev"
-Requires-Dist: types-psutil~=5.9.5.12; extra == "dev"
+Requires-Dist: types-colorama~=0.4.15.12; extra == "dev"
+Requires-Dist: types-psutil~=5.9.5.17; extra == "dev"
 Provides-Extra: scrapy
-Requires-Dist: scrapy~=2.11.0; extra == "scrapy"
+Requires-Dist: scrapy>=2.11.0; extra == "scrapy"
 
 # Apify SDK for Python
 
{apify-1.4.0 → apify-1.4.1a1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "apify"
-version = "1.4.0"
+version = "1.4.1a1"
 description = "Apify SDK for Python"
 readme = "README.md"
 license = { text = "Apache Software License" }
@@ -21,11 +21,15 @@ classifiers = [
 ]
 
 requires-python = ">=3.8"
+
+# We use inclusive ordered comparison clause for non-Apify packages intentionally in order to enhance the Apify SDK's
+# compatibility with a wide range of external packages. This decision was discussed in detail in the following PR:
+# https://github.com/apify/apify-sdk-python/pull/154
 dependencies = [
-    "aiofiles >= 22.1.0",
-    "aioshutil >= 1.0",
     "apify-client ~= 1.6.0",
     "apify-shared ~= 1.1.0",
+    "aiofiles >= 22.1.0",
+    "aioshutil >= 1.0",
     "colorama >= 0.4.6",
     "cryptography >= 39.0.0",
     "httpx >= 0.24.1",
@@ -52,11 +56,11 @@ dev = [
    "ruff ~= 0.1.6",
    "twine ~= 4.0.2",
    "types-aiofiles ~= 23.2.0.0",
-   "types-colorama ~= 0.4.15.11",
-   "types-psutil ~= 5.9.5.12",
+   "types-colorama ~= 0.4.15.12",
+   "types-psutil ~= 5.9.5.17",
 ]
 scrapy = [
-   "scrapy ~= 2.11.0",
+   "scrapy >= 2.11.0",
 ]
 
 [project.urls]
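
The new pyproject.toml comment is the substantive change of this pre-release: Apify's own packages keep the compatible-release pin (~=), while third-party packages such as scrapy move to an inclusive ordered comparison (>=). A minimal sketch of the practical difference, assuming the third-party packaging library; the candidate versions are illustrative:

from packaging.specifiers import SpecifierSet
from packaging.version import Version

compatible_release = SpecifierSet('~=2.11.0')  # shorthand for >=2.11.0, ==2.11.*
inclusive_ordered = SpecifierSet('>=2.11.0')   # any release from 2.11.0 onwards

for candidate in ('2.11.1', '2.12.0'):
    version = Version(candidate)
    print(candidate, version in compatible_release, version in inclusive_ordered)
# 2.11.1 True True   (both accept a patch release)
# 2.12.0 False True  (only >= accepts the next minor release)

In effect, users can now pair the SDK with Scrapy 2.12 and later without a dependency-resolver conflict.
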
{apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/base_resource_client.py
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING
 from apify_shared.utils import ignore_docs
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from ..memory_storage_client import MemoryStorageClient
 
 
@@ -48,9 +50,9 @@ class BaseResourceClient(ABC):
     @classmethod
     @abstractmethod
     def _get_storage_client_cache(
-        cls: type[BaseResourceClient],
+        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
         memory_storage_client: MemoryStorageClient,
-    ) -> list[BaseResourceClient]:
+    ) -> list[Self]:
         raise NotImplementedError('You must override this method in the subclass!')
 
     @abstractmethod
@@ -60,21 +62,21 @@ class BaseResourceClient(ABC):
     @classmethod
     @abstractmethod
     def _create_from_directory(
-        cls: type[BaseResourceClient],
+        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
         storage_directory: str,
         memory_storage_client: MemoryStorageClient,
         id: str | None = None,  # noqa: A002
         name: str | None = None,
-    ) -> BaseResourceClient:
+    ) -> Self:
         raise NotImplementedError('You must override this method in the subclass!')
 
     @classmethod
     def _find_or_create_client_by_id_or_name(
-        cls: type[BaseResourceClient],
+        cls,  # noqa: ANN102 # type annotated cls does not work with Self as a return type
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
-    ) -> BaseResourceClient | None:
+    ) -> Self | None:
        assert id is not None or name is not None  # noqa: S101
 
        storage_client_cache = cls._get_storage_client_cache(memory_storage_client)
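
The hunks above replace concrete BaseResourceClient return annotations with typing_extensions.Self, and drop the explicit cls annotations because an annotated cls does not combine with a Self return type (hence the # noqa: ANN102 markers). A hedged, self-contained sketch of the pattern, not the SDK's actual implementation:

from __future__ import annotations

from typing_extensions import Self  # typing.Self on Python 3.11+


class BaseResourceClient:
    @classmethod
    def create(cls) -> Self:
        # `cls` is the runtime subclass, so the returned instance matches `Self`.
        return cls()


class DatasetClient(BaseResourceClient):
    pass


client = DatasetClient.create()
# Type checkers infer `client: DatasetClient` instead of `BaseResourceClient`,
# which is what lets the resource-client files below drop their `# type: ignore`s.
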
{apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/dataset.py
@@ -74,8 +74,8 @@ class DatasetClient(BaseResourceClient):
         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
 
         if found:
-            async with found._file_operation_lock:  # type: ignore
-                await found._update_timestamps(has_been_modified=False)  # type: ignore
+            async with found._file_operation_lock:
+                await found._update_timestamps(has_been_modified=False)
                 return found._to_resource_info()
 
         return None
@@ -103,7 +103,7 @@ class DatasetClient(BaseResourceClient):
         if name is None:
             return existing_dataset_by_id._to_resource_info()
 
-        async with existing_dataset_by_id._file_operation_lock:  # type: ignore
+        async with existing_dataset_by_id._file_operation_lock:
             # Check that name is not in use already
             existing_dataset_by_name = next(
                 (dataset for dataset in self._memory_storage_client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()),
@@ -122,7 +122,7 @@ class DatasetClient(BaseResourceClient):
             await force_rename(previous_dir, existing_dataset_by_id._resource_directory)
 
         # Update timestamps
-        await existing_dataset_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+        await existing_dataset_by_id._update_timestamps(has_been_modified=True)
 
         return existing_dataset_by_id._to_resource_info()
 
@@ -193,9 +193,9 @@ class DatasetClient(BaseResourceClient):
         if existing_dataset_by_id is None:
             raise_on_non_existing_storage(StorageTypes.DATASET, self._id)
 
-        async with existing_dataset_by_id._file_operation_lock:  # type: ignore
-            start, end = existing_dataset_by_id._get_start_and_end_indexes(  # type: ignore
-                max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0,  # type: ignore
+        async with existing_dataset_by_id._file_operation_lock:
+            start, end = existing_dataset_by_id._get_start_and_end_indexes(
+                max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0,
                 limit,
             )
 
@@ -203,9 +203,9 @@ class DatasetClient(BaseResourceClient):
 
             for idx in range(start, end):
                 entry_number = self._generate_local_entry_name(idx)
-                items.append(existing_dataset_by_id._dataset_entries[entry_number])  # type: ignore
+                items.append(existing_dataset_by_id._dataset_entries[entry_number])
 
-            await existing_dataset_by_id._update_timestamps(has_been_modified=False)  # type: ignore
+            await existing_dataset_by_id._update_timestamps(has_been_modified=False)
 
         if desc:
             items.reverse()
@@ -217,7 +217,7 @@ class DatasetClient(BaseResourceClient):
                 'items': items,
                 'limit': limit or LIST_ITEMS_LIMIT,
                 'offset': offset or 0,
-                'total': existing_dataset_by_id._item_count,  # type: ignore
+                'total': existing_dataset_by_id._item_count,
             }
         )
 
@@ -308,16 +308,16 @@ class DatasetClient(BaseResourceClient):
 
         added_ids: list[str] = []
         for entry in normalized:
-            existing_dataset_by_id._item_count += 1  # type: ignore
-            idx = self._generate_local_entry_name(existing_dataset_by_id._item_count)  # type: ignore
+            existing_dataset_by_id._item_count += 1
+            idx = self._generate_local_entry_name(existing_dataset_by_id._item_count)
 
-            existing_dataset_by_id._dataset_entries[idx] = entry  # type: ignore
+            existing_dataset_by_id._dataset_entries[idx] = entry
             added_ids.append(idx)
 
-        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]  # type: ignore # noqa: A001
+        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]  # noqa: A001
 
-        async with existing_dataset_by_id._file_operation_lock:  # type: ignore
-            await existing_dataset_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+        async with existing_dataset_by_id._file_operation_lock:
+            await existing_dataset_by_id._update_timestamps(has_been_modified=True)
 
             await _update_dataset_items(
                 data=data_entries,
@@ -385,7 +385,7 @@ class DatasetClient(BaseResourceClient):
         return memory_storage_client._datasets_directory
 
     @classmethod
-    def _get_storage_client_cache(  # type: ignore
+    def _get_storage_client_cache(
         cls: type[DatasetClient],
         memory_storage_client: MemoryStorageClient,
     ) -> list[DatasetClient]:
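
With _find_or_create_client_by_id_or_name now returning Self, the found client is statically known to be a DatasetClient, which is why all the # type: ignore comments above could be deleted. The hunks also illustrate the SDK's locking discipline: every mutation runs under _file_operation_lock and refreshes the timestamps inside the same critical section. A minimal sketch of that discipline, with hypothetical names standing in for the SDK's internals:

import asyncio
from datetime import datetime, timezone


class TinyDataset:
    """Stand-in for DatasetClient: an asyncio.Lock guards every mutation."""

    def __init__(self) -> None:
        self._file_operation_lock = asyncio.Lock()
        self._item_count = 0
        self._modified_at = datetime.now(timezone.utc)

    async def push(self, item: dict) -> None:
        async with self._file_operation_lock:
            self._item_count += 1
            # Analogue of _update_timestamps(has_been_modified=True).
            self._modified_at = datetime.now(timezone.utc)


async def main() -> None:
    dataset = TinyDataset()
    await asyncio.gather(*(dataset.push({'index': i}) for i in range(10)))
    print(dataset._item_count)  # 10, with no lost updates


asyncio.run(main())
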
{apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/key_value_store.py
@@ -100,8 +100,8 @@ class KeyValueStoreClient(BaseResourceClient):
         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
 
         if found:
-            async with found._file_operation_lock:  # type: ignore
-                await found._update_timestamps(has_been_modified=False)  # type: ignore
+            async with found._file_operation_lock:
+                await found._update_timestamps(has_been_modified=False)
                 return found._to_resource_info()
 
         return None
@@ -127,7 +127,7 @@ class KeyValueStoreClient(BaseResourceClient):
         if name is None:
             return existing_store_by_id._to_resource_info()
 
-        async with existing_store_by_id._file_operation_lock:  # type: ignore
+        async with existing_store_by_id._file_operation_lock:
             # Check that name is not in use already
             existing_store_by_name = next(
                 (store for store in self._memory_storage_client._key_value_stores_handled if store._name and store._name.lower() == name.lower()),
@@ -146,7 +146,7 @@ class KeyValueStoreClient(BaseResourceClient):
             await force_rename(previous_dir, existing_store_by_id._resource_directory)
 
         # Update timestamps
-        await existing_store_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+        await existing_store_by_id._update_timestamps(has_been_modified=True)
 
         return existing_store_by_id._to_resource_info()
 
@@ -187,7 +187,7 @@ class KeyValueStoreClient(BaseResourceClient):
 
         items = []
 
-        for record in existing_store_by_id._records.values():  # type: ignore
+        for record in existing_store_by_id._records.values():
             size = len(record['value'])
             items.append(
                 {
@@ -222,8 +222,8 @@ class KeyValueStoreClient(BaseResourceClient):
         is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item
         next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item['key']
 
-        async with existing_store_by_id._file_operation_lock:  # type: ignore
-            await existing_store_by_id._update_timestamps(has_been_modified=False)  # type: ignore
+        async with existing_store_by_id._file_operation_lock:
+            await existing_store_by_id._update_timestamps(has_been_modified=False)
 
         return {
             'count': len(items),
@@ -247,7 +247,7 @@ class KeyValueStoreClient(BaseResourceClient):
         if existing_store_by_id is None:
             raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id)
 
-        stored_record = existing_store_by_id._records.get(key)  # type: ignore
+        stored_record = existing_store_by_id._records.get(key)
 
         if stored_record is None:
             return None
@@ -264,8 +264,8 @@ class KeyValueStoreClient(BaseResourceClient):
         except ValueError:
             logger.exception('Error parsing key-value store record')
 
-        async with existing_store_by_id._file_operation_lock:  # type: ignore
-            await existing_store_by_id._update_timestamps(has_been_modified=False)  # type: ignore
+        async with existing_store_by_id._file_operation_lock:
+            await existing_store_by_id._update_timestamps(has_been_modified=False)
 
         return record
 
@@ -324,22 +324,22 @@ class KeyValueStoreClient(BaseResourceClient):
         if 'application/json' in content_type and not is_file_or_bytes(value) and not isinstance(value, str):
             value = json_dumps(value).encode('utf-8')
 
-        async with existing_store_by_id._file_operation_lock:  # type: ignore
-            await existing_store_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+        async with existing_store_by_id._file_operation_lock:
+            await existing_store_by_id._update_timestamps(has_been_modified=True)
             record: KeyValueStoreRecord = {
                 'key': key,
                 'value': value,
                 'contentType': content_type,
             }
 
-            old_record = existing_store_by_id._records.get(key)  # type: ignore
-            existing_store_by_id._records[key] = record  # type: ignore
+            old_record = existing_store_by_id._records.get(key)
+            existing_store_by_id._records[key] = record
 
             if self._memory_storage_client._persist_storage:
                 if old_record is not None and _filename_from_record(old_record) != _filename_from_record(record):
-                    await existing_store_by_id._delete_persisted_record(old_record)  # type: ignore
+                    await existing_store_by_id._delete_persisted_record(old_record)
 
-                await existing_store_by_id._persist_record(record)  # type: ignore
+                await existing_store_by_id._persist_record(record)
 
     async def _persist_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None:
         store_directory = self._resource_directory
@@ -385,14 +385,14 @@ class KeyValueStoreClient(BaseResourceClient):
         if existing_store_by_id is None:
             raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id)
 
-        record = existing_store_by_id._records.get(key)  # type: ignore
+        record = existing_store_by_id._records.get(key)
 
         if record is not None:
-            async with existing_store_by_id._file_operation_lock:  # type: ignore
-                del existing_store_by_id._records[key]  # type: ignore
-                await existing_store_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+            async with existing_store_by_id._file_operation_lock:
+                del existing_store_by_id._records[key]
+                await existing_store_by_id._update_timestamps(has_been_modified=True)
                 if self._memory_storage_client._persist_storage:
-                    await existing_store_by_id._delete_persisted_record(record)  # type: ignore
+                    await existing_store_by_id._delete_persisted_record(record)
 
     async def _delete_persisted_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None:
         store_directory = self._resource_directory
@@ -437,7 +437,7 @@ class KeyValueStoreClient(BaseResourceClient):
         return memory_storage_client._key_value_stores_directory
 
     @classmethod
-    def _get_storage_client_cache(  # type: ignore
+    def _get_storage_client_cache(
         cls: type[KeyValueStoreClient],
         memory_storage_client: MemoryStorageClient,
     ) -> list[KeyValueStoreClient]:
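
One detail worth calling out in the set_record hunk above: JSON-serializable values are encoded to UTF-8 bytes before storage. A rough standard-library sketch of that normalization step; is_file_or_bytes and json_dumps are the SDK's own helpers, so plain json stands in here:

import json


def normalize_value(value: object, content_type: str) -> object:
    # Mirrors the diff's check: JSON content that is not already raw bytes
    # or a string gets serialized and encoded before it is persisted.
    if 'application/json' in content_type and not isinstance(value, (bytes, bytearray, str)):
        return json.dumps(value, ensure_ascii=False).encode('utf-8')
    return value


print(normalize_value({'hello': 'world'}, 'application/json; charset=utf-8'))  # b'{"hello": "world"}'
print(normalize_value('already a string', 'application/json'))  # returned unchanged
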
{apify-1.4.0 → apify-1.4.1a1}/src/apify/_memory_storage/resource_clients/request_queue.py
@@ -67,8 +67,8 @@ class RequestQueueClient(BaseResourceClient):
         found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name)
 
         if found:
-            async with found._file_operation_lock:  # type: ignore
-                await found._update_timestamps(has_been_modified=False)  # type: ignore
+            async with found._file_operation_lock:
+                await found._update_timestamps(has_been_modified=False)
                 return found._to_resource_info()
 
         return None
@@ -94,7 +94,7 @@ class RequestQueueClient(BaseResourceClient):
         if name is None:
             return existing_queue_by_id._to_resource_info()
 
-        async with existing_queue_by_id._file_operation_lock:  # type: ignore
+        async with existing_queue_by_id._file_operation_lock:
             # Check that name is not in use already
             existing_queue_by_name = next(
                 (queue for queue in self._memory_storage_client._request_queues_handled if queue._name and queue._name.lower() == name.lower()), None
@@ -112,7 +112,7 @@ class RequestQueueClient(BaseResourceClient):
             await force_rename(previous_dir, existing_queue_by_id._resource_directory)
 
         # Update timestamps
-        await existing_queue_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+        await existing_queue_by_id._update_timestamps(has_been_modified=True)
 
         return existing_queue_by_id._to_resource_info()
 
@@ -146,18 +146,18 @@ class RequestQueueClient(BaseResourceClient):
         if existing_queue_by_id is None:
             raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)
 
-        async with existing_queue_by_id._file_operation_lock:  # type: ignore
-            await existing_queue_by_id._update_timestamps(has_been_modified=False)  # type: ignore
+        async with existing_queue_by_id._file_operation_lock:
+            await existing_queue_by_id._update_timestamps(has_been_modified=False)
 
             items: list[dict] = []
 
             # Iterate all requests in the queue which have sorted key larger than infinity, which means `orderNo` is not `None`
             # This will iterate them in order of `orderNo`
-            for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)):  # type: ignore
+            for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)):
                 if len(items) == limit:
                     break
 
-                request = existing_queue_by_id._requests.get(request_key)  # type: ignore
+                request = existing_queue_by_id._requests.get(request_key)
 
                 # Check that the request still exists and was not handled,
                 # in case something deleted it or marked it as handled concurrenctly
@@ -167,7 +167,7 @@ class RequestQueueClient(BaseResourceClient):
             return {
                 'limit': limit,
                 'hadMultipleClients': False,
-                'queueModifiedAt': existing_queue_by_id._modified_at,  # type: ignore
+                'queueModifiedAt': existing_queue_by_id._modified_at,
                 'items': [self._json_to_request(item['json']) for item in items],
             }
 
@@ -190,12 +190,12 @@ class RequestQueueClient(BaseResourceClient):
 
         request_model = self._create_internal_request(request, forefront)
 
-        async with existing_queue_by_id._file_operation_lock:  # type: ignore
-            existing_request_with_id = existing_queue_by_id._requests.get(request_model['id'])  # type: ignore
+        async with existing_queue_by_id._file_operation_lock:
+            existing_request_with_id = existing_queue_by_id._requests.get(request_model['id'])
 
             # We already have the request present, so we return information about it
             if existing_request_with_id is not None:
-                await existing_queue_by_id._update_timestamps(has_been_modified=False)  # type: ignore
+                await existing_queue_by_id._update_timestamps(has_been_modified=False)
 
                 return {
                     'requestId': existing_request_with_id['id'],
@@ -203,12 +203,12 @@ class RequestQueueClient(BaseResourceClient):
                     'wasAlreadyPresent': True,
                 }
 
-            existing_queue_by_id._requests[request_model['id']] = request_model  # type: ignore
+            existing_queue_by_id._requests[request_model['id']] = request_model
             if request_model['orderNo'] is None:
-                existing_queue_by_id._handled_request_count += 1  # type: ignore
+                existing_queue_by_id._handled_request_count += 1
             else:
-                existing_queue_by_id._pending_request_count += 1  # type: ignore
-            await existing_queue_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+                existing_queue_by_id._pending_request_count += 1
+            await existing_queue_by_id._update_timestamps(has_been_modified=True)
             await update_request_queue_item(
                 request=request_model,
                 request_id=request_model['id'],
@@ -240,10 +240,10 @@ class RequestQueueClient(BaseResourceClient):
         if existing_queue_by_id is None:
             raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)
 
-        async with existing_queue_by_id._file_operation_lock:  # type: ignore
-            await existing_queue_by_id._update_timestamps(has_been_modified=False)  # type: ignore
+        async with existing_queue_by_id._file_operation_lock:
+            await existing_queue_by_id._update_timestamps(has_been_modified=False)
 
-            request = existing_queue_by_id._requests.get(request_id)  # type: ignore
+            request = existing_queue_by_id._requests.get(request_id)
             return self._json_to_request(request['json'] if request is not None else None)
 
     async def update_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict:
@@ -268,17 +268,17 @@ class RequestQueueClient(BaseResourceClient):
         # First we need to check the existing request to be
         # able to return information about its handled state.
 
-        existing_request = existing_queue_by_id._requests.get(request_model['id'])  # type: ignore
+        existing_request = existing_queue_by_id._requests.get(request_model['id'])
 
         # Undefined means that the request is not present in the queue.
         # We need to insert it, to behave the same as API.
         if existing_request is None:
             return await self.add_request(request, forefront=forefront)
 
-        async with existing_queue_by_id._file_operation_lock:  # type: ignore
+        async with existing_queue_by_id._file_operation_lock:
             # When updating the request, we need to make sure that
             # the handled counts are updated correctly in all cases.
-            existing_queue_by_id._requests[request_model['id']] = request_model  # type: ignore
+            existing_queue_by_id._requests[request_model['id']] = request_model
 
             pending_count_adjustment = 0
             is_request_handled_state_changing = not isinstance(existing_request['orderNo'], type(request_model['orderNo']))
@@ -288,9 +288,9 @@ class RequestQueueClient(BaseResourceClient):
             if is_request_handled_state_changing:
                 pending_count_adjustment = 1 if request_was_handled_before_update else -1
 
-            existing_queue_by_id._pending_request_count += pending_count_adjustment  # type: ignore
-            existing_queue_by_id._handled_request_count -= pending_count_adjustment  # type: ignore
-            await existing_queue_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+            existing_queue_by_id._pending_request_count += pending_count_adjustment
+            existing_queue_by_id._handled_request_count -= pending_count_adjustment
+            await existing_queue_by_id._update_timestamps(has_been_modified=True)
             await update_request_queue_item(
                 request=request_model,
                 request_id=request_model['id'],
@@ -317,16 +317,16 @@ class RequestQueueClient(BaseResourceClient):
         if existing_queue_by_id is None:
             raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id)
 
-        async with existing_queue_by_id._file_operation_lock:  # type: ignore
-            request = existing_queue_by_id._requests.get(request_id)  # type: ignore
+        async with existing_queue_by_id._file_operation_lock:
+            request = existing_queue_by_id._requests.get(request_id)
 
             if request:
-                del existing_queue_by_id._requests[request_id]  # type: ignore
+                del existing_queue_by_id._requests[request_id]
                 if request['orderNo'] is None:
-                    existing_queue_by_id._handled_request_count -= 1  # type: ignore
+                    existing_queue_by_id._handled_request_count -= 1
                 else:
-                    existing_queue_by_id._pending_request_count -= 1  # type: ignore
-                await existing_queue_by_id._update_timestamps(has_been_modified=True)  # type: ignore
+                    existing_queue_by_id._pending_request_count -= 1
+                await existing_queue_by_id._update_timestamps(has_been_modified=True)
                 await delete_request(entity_directory=existing_queue_by_id._resource_directory, request_id=request_id)
 
     def _to_resource_info(self: RequestQueueClient) -> dict:
@@ -403,7 +403,7 @@ class RequestQueueClient(BaseResourceClient):
         return memory_storage_client._request_queues_directory
 
     @classmethod
-    def _get_storage_client_cache(  # type: ignore
+    def _get_storage_client_cache(
         cls: type[RequestQueueClient],
         memory_storage_client: MemoryStorageClient,
     ) -> list[RequestQueueClient]:
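
The list_head hunk iterates _requests.irange_key(min_key=-float('inf'), inclusive=(False, True)), i.e. every request whose sort key is strictly greater than negative infinity; handled requests have orderNo set to None, sort at negative infinity, and are skipped by the exclusive lower bound. An approximate sketch assuming the sortedcontainers library (the SDK's actual container is a value-sorted mapping, so this is illustrative only):

from sortedcontainers import SortedKeyList


def sort_key(request: dict) -> float:
    # Handled requests (orderNo is None) sort at -inf; pending ones by orderNo.
    return request['orderNo'] if request['orderNo'] is not None else -float('inf')


requests = SortedKeyList(key=sort_key)
requests.update([
    {'id': 'a', 'orderNo': 2},
    {'id': 'b', 'orderNo': None},  # already handled
    {'id': 'c', 'orderNo': 1},
])

# The exclusive lower bound at -inf skips handled requests entirely.
pending = [r['id'] for r in requests.irange_key(min_key=-float('inf'), inclusive=(False, True))]
print(pending)  # ['c', 'a']
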
{apify-1.4.0 → apify-1.4.1a1}/src/apify/scrapy/middlewares.py
@@ -4,10 +4,7 @@ import traceback
 from typing import TYPE_CHECKING, Any
 
 try:
-    from scrapy import Spider  # noqa: TCH002
     from scrapy.downloadermiddlewares.retry import RetryMiddleware
-    from scrapy.exceptions import IgnoreRequest
-    from scrapy.http import Request, Response  # noqa: TCH002
     from scrapy.utils.response import response_status_message
 except ImportError as exc:
     raise ImportError(
@@ -18,6 +15,9 @@ from ..actor import Actor
 from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
 
 if TYPE_CHECKING:
+    from scrapy import Spider
+    from scrapy.http import Request, Response
+
     from ..storages import RequestQueue
 
 
@@ -33,11 +33,6 @@ class ApifyRetryMiddleware(RetryMiddleware):
             traceback.print_exc()
             raise
 
-    def __del__(self: ApifyRetryMiddleware) -> None:
-        """Before deleting the instance, close the nested event loop."""
-        nested_event_loop.stop()
-        nested_event_loop.close()
-
     def process_response(
         self: ApifyRetryMiddleware,
         request: Request,
@@ -54,9 +49,11 @@ class ApifyRetryMiddleware(RetryMiddleware):
         Returns:
             The response, or a new request if the request should be retried.
         """
+        if not isinstance(request.url, str):
+            raise TypeError(f'Expected request.url to be a string, got {type(request.url)} instead.')
+
         # Robots requests are bypassed directly, they don't go through a Scrapy Scheduler, and also through our
         # Request Queue. Check the scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
-        assert isinstance(request.url, str)  # noqa: S101
         if request.url.endswith('robots.txt'):
             return response
 
@@ -73,19 +70,16 @@ class ApifyRetryMiddleware(RetryMiddleware):
         spider: Spider,
     ) -> Request | Response | None:
         """Handle the exception and decide whether the request should be retried."""
-        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (scrapy_request={request})...')
+        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (request={request}, exception={exception})...')
         apify_request = to_apify_request(request, spider=spider)
 
-        if isinstance(exception, IgnoreRequest):
-            try:
-                nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
-            except BaseException:
-                traceback.print_exc()
-                raise
-        else:
-            nested_event_loop.run_until_complete(self._rq.reclaim_request(apify_request))
+        try:
+            nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
+        except BaseException:
+            traceback.print_exc()
+            raise
 
-        return super().process_exception(request, exception, spider)
+        return None
 
     async def _handle_retry_logic(
         self: ApifyRetryMiddleware,
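
Two behavioral changes stand out in this middleware hunk: process_exception now always marks the request as handled and returns None (the IgnoreRequest branch and the super() delegation are gone, and the __del__ event-loop cleanup was removed), and the assert on request.url is promoted to an explicit TypeError. The latter matters because assert statements are stripped when Python runs with -O; a tiny sketch:

def require_str_url(url: object) -> str:
    # Unlike `assert isinstance(url, str)`, this check survives `python -O`.
    if not isinstance(url, str):
        raise TypeError(f'Expected url to be a string, got {type(url)} instead.')
    return url


print(require_str_url('https://apify.com/robots.txt'))
# require_str_url(None) raises TypeError even under `python -O`.
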
{apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.4.0
+Version: 1.4.1a1
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
@@ -24,10 +24,10 @@ Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: aiofiles>=22.1.0
-Requires-Dist: aioshutil>=1.0
 Requires-Dist: apify-client~=1.6.0
 Requires-Dist: apify-shared~=1.1.0
+Requires-Dist: aiofiles>=22.1.0
+Requires-Dist: aioshutil>=1.0
 Requires-Dist: colorama>=0.4.6
 Requires-Dist: cryptography>=39.0.0
 Requires-Dist: httpx>=0.24.1
@@ -51,10 +51,10 @@ Requires-Dist: respx~=0.20.1; extra == "dev"
 Requires-Dist: ruff~=0.1.6; extra == "dev"
 Requires-Dist: twine~=4.0.2; extra == "dev"
 Requires-Dist: types-aiofiles~=23.2.0.0; extra == "dev"
-Requires-Dist: types-colorama~=0.4.15.11; extra == "dev"
-Requires-Dist: types-psutil~=5.9.5.12; extra == "dev"
+Requires-Dist: types-colorama~=0.4.15.12; extra == "dev"
+Requires-Dist: types-psutil~=5.9.5.17; extra == "dev"
 Provides-Extra: scrapy
-Requires-Dist: scrapy~=2.11.0; extra == "scrapy"
+Requires-Dist: scrapy>=2.11.0; extra == "scrapy"
 
 # Apify SDK for Python
 
{apify-1.4.0 → apify-1.4.1a1}/src/apify.egg-info/requires.txt
@@ -1,7 +1,7 @@
-aiofiles>=22.1.0
-aioshutil>=1.0
 apify-client~=1.6.0
 apify-shared~=1.1.0
+aiofiles>=22.1.0
+aioshutil>=1.0
 colorama>=0.4.6
 cryptography>=39.0.0
 httpx>=0.24.1
@@ -26,8 +26,8 @@ respx~=0.20.1
 ruff~=0.1.6
 twine~=4.0.2
 types-aiofiles~=23.2.0.0
-types-colorama~=0.4.15.11
-types-psutil~=5.9.5.12
+types-colorama~=0.4.15.12
+types-psutil~=5.9.5.17
 
 [scrapy]
-scrapy~=2.11.0
+scrapy>=2.11.0