apify 1.5.1b3.tar.gz → 1.5.2b2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of apify has been flagged as a potentially problematic release.

Files changed (46)
  1. {apify-1.5.1b3 → apify-1.5.2b2}/PKG-INFO +1 -1
  2. {apify-1.5.1b3 → apify-1.5.2b2}/pyproject.toml +1 -1
  3. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +1 -1
  4. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset.py +1 -1
  5. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset_collection.py +1 -1
  6. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +1 -1
  7. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/request_queue_collection.py +1 -1
  8. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/actor.py +1 -1
  9. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/event_manager.py +2 -2
  10. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/log.py +1 -1
  11. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/__init__.py +0 -1
  12. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/middlewares/apify_proxy.py +9 -7
  13. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/middlewares/apify_retry.py +2 -3
  14. apify-1.5.2b2/src/apify/scrapy/pipelines/__init__.py +1 -0
  15. apify-1.5.1b3/src/apify/scrapy/pipelines.py → apify-1.5.2b2/src/apify/scrapy/pipelines/actor_dataset_push.py +1 -1
  16. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/scheduler.py +16 -6
  17. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/utils.py +19 -11
  18. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/base_storage.py +1 -1
  19. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/dataset.py +1 -1
  20. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/key_value_store.py +1 -1
  21. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/request_queue.py +1 -1
  22. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/PKG-INFO +1 -1
  23. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/SOURCES.txt +2 -1
  24. {apify-1.5.1b3 → apify-1.5.2b2}/LICENSE +0 -0
  25. {apify-1.5.1b3 → apify-1.5.2b2}/README.md +0 -0
  26. {apify-1.5.1b3 → apify-1.5.2b2}/setup.cfg +0 -0
  27. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/__init__.py +0 -0
  28. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_crypto.py +0 -0
  29. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/__init__.py +0 -0
  30. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/file_storage_utils.py +0 -0
  31. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/memory_storage_client.py +0 -0
  32. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/__init__.py +0 -0
  33. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/base_resource_client.py +0 -0
  34. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/key_value_store.py +0 -0
  35. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/request_queue.py +0 -0
  36. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_utils.py +0 -0
  37. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/config.py +0 -0
  38. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/consts.py +0 -0
  39. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/proxy_configuration.py +0 -0
  40. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/py.typed +0 -0
  41. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/middlewares/__init__.py +0 -0
  42. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/__init__.py +0 -0
  43. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/storage_client_manager.py +0 -0
  44. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/dependency_links.txt +0 -0
  45. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/requires.txt +0 -0
  46. {apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/top_level.txt +0 -0
{apify-1.5.1b3 → apify-1.5.2b2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.5.1b3
+Version: 1.5.2b2
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
{apify-1.5.1b3 → apify-1.5.2b2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "apify"
-version = "1.5.1b3"
+version = "1.5.2b2"
 description = "Apify SDK for Python"
 readme = "README.md"
 license = { text = "Apache Software License" }
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py
@@ -43,7 +43,7 @@ class BaseResourceCollectionClient(ABC, Generic[ResourceClientType]):
         raise NotImplementedError('You must override this method in the subclass!')

     @abstractmethod
-    async def list(self: BaseResourceCollectionClient) -> ListPage:  # noqa: A003
+    async def list(self: BaseResourceCollectionClient) -> ListPage:
         """List the available storages.

         Returns:
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset.py
@@ -314,7 +314,7 @@ class DatasetClient(BaseResourceClient):
             existing_dataset_by_id._dataset_entries[idx] = entry
             added_ids.append(idx)

-        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids] # noqa: A001
+        data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids]  # noqa: A001

         async with existing_dataset_by_id._file_operation_lock:
             await existing_dataset_by_id._update_timestamps(has_been_modified=True)
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset_collection.py
@@ -21,7 +21,7 @@ class DatasetCollectionClient(BaseResourceCollectionClient):
     def _get_resource_client_class(self: DatasetCollectionClient) -> type[DatasetClient]:
         return DatasetClient

-    async def list(self: DatasetCollectionClient) -> ListPage:  # noqa: A003
+    async def list(self: DatasetCollectionClient) -> ListPage:
         """List the available datasets.

         Returns:
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py
@@ -21,7 +21,7 @@ class KeyValueStoreCollectionClient(BaseResourceCollectionClient):
     def _get_resource_client_class(self: KeyValueStoreCollectionClient) -> type[KeyValueStoreClient]:
         return KeyValueStoreClient

-    async def list(self: KeyValueStoreCollectionClient) -> ListPage:  # noqa: A003
+    async def list(self: KeyValueStoreCollectionClient) -> ListPage:
         """List the available key-value stores.

         Returns:
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/request_queue_collection.py
@@ -21,7 +21,7 @@ class RequestQueueCollectionClient(BaseResourceCollectionClient):
     def _get_resource_client_class(self: RequestQueueCollectionClient) -> type[RequestQueueClient]:
         return RequestQueueClient

-    async def list(self: RequestQueueCollectionClient) -> ListPage:  # noqa: A003
+    async def list(self: RequestQueueCollectionClient) -> ListPage:
         """List the available request queues.

         Returns:
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/actor.py
@@ -295,7 +295,7 @@ class Actor(metaclass=_ActorContextManager):
             await self._send_system_info_interval_task

     @classmethod
-    async def exit(  # noqa: A003
+    async def exit(
         cls: type[Actor],
         *,
         exit_code: int = 0,
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/event_manager.py
@@ -219,7 +219,7 @@ class EventManager:
         try:
             async with websockets.client.connect(self._config.actor_events_ws_url) as websocket:
                 self._platform_events_websocket = websocket
-                self._connected_to_platform_websocket.set_result(True)  # noqa: FBT003
+                self._connected_to_platform_websocket.set_result(True)
                 async for message in websocket:
                     try:
                         parsed_message = json.loads(message)
@@ -234,4 +234,4 @@ class EventManager:
                         logger.exception('Cannot parse actor event', extra={'message': message})
         except Exception:
             logger.exception('Error in websocket connection')
-            self._connected_to_platform_websocket.set_result(False)  # noqa: FBT003
+            self._connected_to_platform_websocket.set_result(False)
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/log.py
@@ -82,7 +82,7 @@ class ActorLogFormatter(logging.Formatter):
         return extra_fields

     @ignore_docs
-    def format(self: ActorLogFormatter, record: logging.LogRecord) -> str:  # noqa: A003
+    def format(self: ActorLogFormatter, record: logging.LogRecord) -> str:
         """Format the log record nicely.

         This formats the log record so that it:
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/__init__.py
@@ -1,3 +1,2 @@
-from .pipelines import ActorDatasetPushPipeline
 from .scheduler import ApifyScheduler
 from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/middlewares/apify_proxy.py
@@ -1,19 +1,21 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse

-from scrapy.core.downloader.handlers.http11 import TunnelError
-from scrapy.exceptions import NotConfigured
+try:
+    from scrapy import Request, Spider  # noqa: TCH002
+    from scrapy.core.downloader.handlers.http11 import TunnelError
+    from scrapy.crawler import Crawler  # noqa: TCH002
+    from scrapy.exceptions import NotConfigured
+except ImportError as exc:
+    raise ImportError(
+        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
+    ) from exc

 from ...actor import Actor
 from ...proxy_configuration import ProxyConfiguration
 from ..utils import get_basic_auth_header

-if TYPE_CHECKING:
-    from scrapy import Request, Spider
-    from scrapy.crawler import Crawler
-

 class ApifyHttpProxyMiddleware:
     """Apify HTTP proxy middleware for Scrapy.
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/middlewares/apify_retry.py
@@ -4,7 +4,9 @@ import traceback
 from typing import TYPE_CHECKING, Any

 try:
+    from scrapy import Spider  # noqa: TCH002
     from scrapy.downloadermiddlewares.retry import RetryMiddleware
+    from scrapy.http import Request, Response  # noqa: TCH002
     from scrapy.utils.response import response_status_message
 except ImportError as exc:
     raise ImportError(
@@ -15,9 +17,6 @@ from ...actor import Actor
 from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request

 if TYPE_CHECKING:
-    from scrapy import Spider
-    from scrapy.http import Request, Response
-
     from ...storages import RequestQueue

apify-1.5.2b2/src/apify/scrapy/pipelines/__init__.py (new file)
@@ -0,0 +1 @@
+from .actor_dataset_push import ActorDatasetPushPipeline
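
Since the new package __init__.py re-exports ActorDatasetPushPipeline, Scrapy projects that reference the pipeline via the apify.scrapy.pipelines dotted path keep working after the module-to-package split; only the re-export from apify.scrapy itself was dropped (see the __init__.py hunk above). A typical settings entry that still resolves (the priority value 1000 is an arbitrary example, not mandated by the SDK):

    # settings.py of a Scrapy project running as an Apify Actor (sketch).
    # The dotted path resolves through the package re-export.
    ITEM_PIPELINES = {
        'apify.scrapy.pipelines.ActorDatasetPushPipeline': 1000,
    }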
apify-1.5.1b3/src/apify/scrapy/pipelines.py → apify-1.5.2b2/src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -9,7 +9,7 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from ..actor import Actor
+from ...actor import Actor


 class ActorDatasetPushPipeline:
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/scheduler.py
@@ -35,7 +35,7 @@ class ApifyScheduler(BaseScheduler):
         self._rq: RequestQueue | None = None
         self.spider: Spider | None = None

-    def open(self: ApifyScheduler, spider: Spider) -> None:  # noqa: A003  # this has to be named "open"
+    def open(self: ApifyScheduler, spider: Spider) -> None:  # this has to be named "open"
         """Open the scheduler.

         Args:
@@ -55,7 +55,9 @@
         Returns:
             True if the scheduler has any pending requests, False otherwise.
         """
-        assert isinstance(self._rq, RequestQueue)  # noqa: S101
+        if not isinstance(self._rq, RequestQueue):
+            raise TypeError('self._rq must be an instance of the RequestQueue class')
+
         try:
             is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
         except BaseException:
@@ -76,10 +78,14 @@
         call_id = crypto_random_object_id(8)
         Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')

-        assert isinstance(self.spider, Spider)  # noqa: S101
+        if not isinstance(self.spider, Spider):
+            raise TypeError('self.spider must be an instance of the Spider class')
+
         apify_request = to_apify_request(request, spider=self.spider)
         Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
-        assert isinstance(self._rq, RequestQueue)  # noqa: S101
+
+        if not isinstance(self._rq, RequestQueue):
+            raise TypeError('self._rq must be an instance of the RequestQueue class')

         try:
             result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
@@ -98,7 +104,9 @@
         """
         call_id = crypto_random_object_id(8)
         Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
-        assert isinstance(self._rq, RequestQueue)  # noqa: S101
+
+        if not isinstance(self._rq, RequestQueue):
+            raise TypeError('self._rq must be an instance of the RequestQueue class')

         try:
             apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
@@ -111,7 +119,9 @@
         if apify_request is None:
             return None

-        assert isinstance(self.spider, Spider)  # noqa: S101
+        if not isinstance(self.spider, Spider):
+            raise TypeError('self.spider must be an instance of the Spider class')
+
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
             f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
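
Throughout the scheduler (and in utils.py below), assert-based isinstance checks that previously carried "# noqa: S101" are replaced with explicit "if not isinstance(...): raise TypeError(...)" guards. Unlike asserts, which are stripped entirely when Python runs with the -O flag, the explicit checks always execute and produce a descriptive error. A minimal standalone sketch of the pattern, with a hypothetical placeholder value:

    # 'value' is a hypothetical stand-in for self._rq or self.spider.
    value = None

    # Before: silently skipped under `python -O`.
    # assert isinstance(value, dict)  # noqa: S101

    # After: always enforced, with a descriptive error message.
    if not isinstance(value, dict):
        raise TypeError('value must be an instance of the dict class')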
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/scrapy/utils.py
@@ -6,10 +6,9 @@ import pickle
 from base64 import b64encode
 from urllib.parse import unquote

-from scrapy.utils.python import to_bytes
-
 try:
     from scrapy import Request, Spider
+    from scrapy.utils.python import to_bytes
     from scrapy.utils.request import request_from_dict
 except ImportError as exc:
     raise ImportError(
@@ -51,7 +50,8 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
     Returns:
         The converted Apify request.
     """
-    assert isinstance(scrapy_request, Request)  # noqa: S101
+    if not isinstance(scrapy_request, Request):
+        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
@@ -91,11 +91,14 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
     Returns:
         The converted Scrapy request.
     """
-    assert isinstance(apify_request, dict)  # noqa: S101
-    assert 'url' in apify_request  # noqa: S101
-    assert 'method' in apify_request  # noqa: S101
-    assert 'id' in apify_request  # noqa: S101
-    assert 'uniqueKey' in apify_request  # noqa: S101
+    if not isinstance(apify_request, dict):
+        raise TypeError('apify_request must be a dictionary')
+
+    required_keys = ['url', 'method', 'id', 'uniqueKey']
+    missing_keys = [key for key in required_keys if key not in apify_request]
+
+    if missing_keys:
+        raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")

     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
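
Besides surviving python -O, the rewritten validation reports all missing keys at once instead of stopping at the first failed assert. A standalone sketch of the same check, with an example input missing two keys:

    # Example apify_request missing the 'id' and 'uniqueKey' keys.
    apify_request = {'url': 'https://example.com', 'method': 'GET'}

    required_keys = ['url', 'method', 'id', 'uniqueKey']
    missing_keys = [key for key in required_keys if key not in apify_request]

    if missing_keys:
        # Raises: ValueError: apify_request must contain 'id', 'uniqueKey' key(s)
        raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")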
@@ -106,14 +109,19 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         # - This process involves decoding the base64-encoded request data and reconstructing
         #   the Scrapy Request object from its dictionary representation.
         Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
+
         scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
-        assert isinstance(scrapy_request_dict_encoded, str)  # noqa: S101
+        if not isinstance(scrapy_request_dict_encoded, str):
+            raise TypeError('scrapy_request_dict_encoded must be a string')

         scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
-        assert isinstance(scrapy_request_dict, dict)  # noqa: S101
+        if not isinstance(scrapy_request_dict, dict):
+            raise TypeError('scrapy_request_dict must be a dictionary')

         scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
-        assert isinstance(scrapy_request, Request)  # noqa: S101
+        if not isinstance(scrapy_request, Request):
+            raise TypeError('scrapy_request must be an instance of the Request class')
+
         Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

         # Update the meta field with the meta field from the apify_request
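
The decoding path above is the inverse of how the SDK serializes a Scrapy Request for storage in the Apify request queue: the request's dict form is pickled, base64-encoded, and stored under userData['scrapy_request']. A self-contained round-trip sketch, with a plain dict standing in for the output of Scrapy's Request.to_dict():

    import codecs
    import pickle

    # Hypothetical stand-in for a Scrapy request serialized with to_dict().
    scrapy_request_dict = {'url': 'https://example.com', 'method': 'GET'}

    # Encode: pickle the dict, then base64-encode it into a string.
    encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()

    # Decode: the same steps to_scrapy_request performs above, in reverse.
    decoded = pickle.loads(codecs.decode(encoded.encode(), 'base64'))
    assert decoded == scrapy_request_dict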
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/base_storage.py
@@ -91,7 +91,7 @@ class BaseStorage(ABC, Generic[BaseResourceClientType, BaseResourceCollectionCli

     @classmethod
     @abstractmethod
-    async def open(  # noqa: A003
+    async def open(
         cls: type[BaseStorage],
         *,
         id: str | None = None,  # noqa: A002
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/dataset.py
@@ -463,7 +463,7 @@ class Dataset(BaseStorage):
         self._remove_from_cache()

     @classmethod
-    async def open(  # noqa: A003
+    async def open(
         cls: type[Dataset],
         *,
         id: str | None = None,  # noqa: A002
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/key_value_store.py
@@ -226,7 +226,7 @@ class KeyValueStore(BaseStorage):
         self._remove_from_cache()

     @classmethod
-    async def open(  # noqa: A003
+    async def open(
         cls: type[KeyValueStore],
         *,
         id: str | None = None,  # noqa: A002
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify/storages/request_queue.py
@@ -536,7 +536,7 @@ class RequestQueue(BaseStorage):
         return await self._request_queue_client.get()

     @classmethod
-    async def open(  # noqa: A003
+    async def open(
         cls: type[RequestQueue],
         *,
         id: str | None = None,  # noqa: A002
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.5.1b3
+Version: 1.5.2b2
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
{apify-1.5.1b3 → apify-1.5.2b2}/src/apify.egg-info/SOURCES.txt
@@ -29,12 +29,13 @@ src/apify/_memory_storage/resource_clients/key_value_store_collection.py
 src/apify/_memory_storage/resource_clients/request_queue.py
 src/apify/_memory_storage/resource_clients/request_queue_collection.py
 src/apify/scrapy/__init__.py
-src/apify/scrapy/pipelines.py
 src/apify/scrapy/scheduler.py
 src/apify/scrapy/utils.py
 src/apify/scrapy/middlewares/__init__.py
 src/apify/scrapy/middlewares/apify_proxy.py
 src/apify/scrapy/middlewares/apify_retry.py
+src/apify/scrapy/pipelines/__init__.py
+src/apify/scrapy/pipelines/actor_dataset_push.py
 src/apify/storages/__init__.py
 src/apify/storages/base_storage.py
 src/apify/storages/dataset.py