apify 1.5.2b1__py3-none-any.whl → 1.5.2b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apify might be problematic.

apify/scrapy/__init__.py CHANGED
@@ -1,3 +1,2 @@
-from .pipelines import ActorDatasetPushPipeline
 from .scheduler import ApifyScheduler
 from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
apify/scrapy/middlewares/apify_proxy.py CHANGED
@@ -1,10 +1,11 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
+    from scrapy import Request, Spider  # noqa: TCH002
     from scrapy.core.downloader.handlers.http11 import TunnelError
+    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
@@ -15,10 +16,6 @@ from ...actor import Actor
 from ...proxy_configuration import ProxyConfiguration
 from ..utils import get_basic_auth_header
 
-if TYPE_CHECKING:
-    from scrapy import Request, Spider
-    from scrapy.crawler import Crawler
-
 
 class ApifyHttpProxyMiddleware:
     """Apify HTTP proxy middleware for Scrapy.
apify/scrapy/middlewares/apify_retry.py CHANGED
@@ -4,7 +4,9 @@ import traceback
 from typing import TYPE_CHECKING, Any
 
 try:
+    from scrapy import Spider  # noqa: TCH002
     from scrapy.downloadermiddlewares.retry import RetryMiddleware
+    from scrapy.http import Request, Response  # noqa: TCH002
     from scrapy.utils.response import response_status_message
 except ImportError as exc:
     raise ImportError(
@@ -15,9 +17,6 @@ from ...actor import Actor
 from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
 
 if TYPE_CHECKING:
-    from scrapy import Spider
-    from scrapy.http import Request, Response
-
     from ...storages import RequestQueue
 
 
apify/scrapy/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
+from .actor_dataset_push import ActorDatasetPushPipeline
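
The new pipelines subpackage re-exports the pipeline class, so the pre-existing module path keeps working even though `apify/scrapy/__init__.py` (first hunk above) no longer re-exports it. Both of these imports should resolve to the same class under the new layout:

    from apify.scrapy.pipelines import ActorDatasetPushPipeline                      # unchanged spelling
    from apify.scrapy.pipelines.actor_dataset_push import ActorDatasetPushPipeline   # new module path
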
apify/scrapy/pipelines.py → apify/scrapy/pipelines/actor_dataset_push.py RENAMED
@@ -9,7 +9,7 @@ except ImportError as exc:
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc
 
-from ..actor import Actor
+from ...actor import Actor
 
 
 class ActorDatasetPushPipeline:
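
The only in-file change is the relative import depth (`..actor` → `...actor`), consistent with the module moving one package level deeper. The diff shows nothing else of the file's body; for orientation only, a Scrapy item pipeline of this kind typically pushes each scraped item into the Actor's default dataset, and a rough sketch under that assumption could look like this (an illustrative stand-in, not the file's actual contents):

    from __future__ import annotations

    from itemadapter import ItemAdapter
    from scrapy import Item, Spider

    from apify import Actor


    class DatasetPushPipeline:
        """Illustrative stand-in: push each scraped item to the default dataset."""

        async def process_item(self, item: Item, spider: Spider) -> Item:
            # Actor.push_data appends the item to the Actor run's default dataset.
            await Actor.push_data(ItemAdapter(item).asdict())
            return item
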
apify/scrapy/scheduler.py CHANGED
@@ -55,7 +55,9 @@ class ApifyScheduler(BaseScheduler):
         Returns:
             True if the scheduler has any pending requests, False otherwise.
         """
-        assert isinstance(self._rq, RequestQueue)  # noqa: S101
+        if not isinstance(self._rq, RequestQueue):
+            raise TypeError('self._rq must be an instance of the RequestQueue class')
+
         try:
             is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
         except BaseException:
@@ -76,10 +78,14 @@ class ApifyScheduler(BaseScheduler):
         call_id = crypto_random_object_id(8)
         Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
 
-        assert isinstance(self.spider, Spider)  # noqa: S101
+        if not isinstance(self.spider, Spider):
+            raise TypeError('self.spider must be an instance of the Spider class')
+
         apify_request = to_apify_request(request, spider=self.spider)
         Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
-        assert isinstance(self._rq, RequestQueue)  # noqa: S101
+
+        if not isinstance(self._rq, RequestQueue):
+            raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
             result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
@@ -98,7 +104,9 @@ class ApifyScheduler(BaseScheduler):
         """
         call_id = crypto_random_object_id(8)
         Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
-        assert isinstance(self._rq, RequestQueue)  # noqa: S101
+
+        if not isinstance(self._rq, RequestQueue):
+            raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
             apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
@@ -111,7 +119,9 @@ class ApifyScheduler(BaseScheduler):
         if apify_request is None:
             return None
 
-        assert isinstance(self.spider, Spider)  # noqa: S101
+        if not isinstance(self.spider, Spider):
+            raise TypeError('self.spider must be an instance of the Spider class')
+
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
             f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
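
Every hunk in this file swaps a bandit-flagged `assert isinstance(...)` (rule S101) for an explicit check that raises `TypeError`. The practical difference: assertions are stripped entirely when Python runs with `-O`, so the old guards silently vanished in optimized mode, while the new checks run unconditionally. The pattern in isolation (`_ensure_str` is an illustrative helper, not part of the SDK):

    def _ensure_str(value: object) -> str:
        # Old style: removed under `python -O`, so no protection in optimized runs.
        # assert isinstance(value, str)  # noqa: S101

        # New style from this release: enforced in every run mode.
        if not isinstance(value, str):
            raise TypeError('value must be an instance of the str class')
        return value
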
apify/scrapy/utils.py CHANGED
@@ -6,10 +6,9 @@ import pickle
 from base64 import b64encode
 from urllib.parse import unquote
 
-from scrapy.utils.python import to_bytes
-
 try:
     from scrapy import Request, Spider
+    from scrapy.utils.python import to_bytes
     from scrapy.utils.request import request_from_dict
 except ImportError as exc:
     raise ImportError(
@@ -51,7 +50,8 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
     Returns:
         The converted Apify request.
     """
-    assert isinstance(scrapy_request, Request)  # noqa: S101
+    if not isinstance(scrapy_request, Request):
+        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
 
     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
@@ -91,11 +91,14 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
     Returns:
         The converted Scrapy request.
     """
-    assert isinstance(apify_request, dict)  # noqa: S101
-    assert 'url' in apify_request  # noqa: S101
-    assert 'method' in apify_request  # noqa: S101
-    assert 'id' in apify_request  # noqa: S101
-    assert 'uniqueKey' in apify_request  # noqa: S101
+    if not isinstance(apify_request, dict):
+        raise TypeError('apify_request must be a dictionary')
+
+    required_keys = ['url', 'method', 'id', 'uniqueKey']
+    missing_keys = [key for key in required_keys if key not in apify_request]
+
+    if missing_keys:
+        raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")
 
     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
@@ -106,14 +109,19 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         # - This process involves decoding the base64-encoded request data and reconstructing
         #   the Scrapy Request object from its dictionary representation.
         Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
+
         scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
-        assert isinstance(scrapy_request_dict_encoded, str)  # noqa: S101
+        if not isinstance(scrapy_request_dict_encoded, str):
+            raise TypeError('scrapy_request_dict_encoded must be a string')
 
         scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
-        assert isinstance(scrapy_request_dict, dict)  # noqa: S101
+        if not isinstance(scrapy_request_dict, dict):
+            raise TypeError('scrapy_request_dict must be a dictionary')
 
         scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
-        assert isinstance(scrapy_request, Request)  # noqa: S101
+        if not isinstance(scrapy_request, Request):
+            raise TypeError('scrapy_request must be an instance of the Request class')
+
         Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
 
         # Update the meta field with the meta field from the apify_request
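
The restored lines also document the transport format: `to_scrapy_request` rebuilds a Scrapy `Request` from a pickled, base64-encoded dict stored under `apify_request['userData']['scrapy_request']`, and the `b64encode` import at the top of the file suggests `to_apify_request` produces that encoding. A self-contained round-trip matching the decode line shown in the diff (the example dict is illustrative):

    import codecs
    import pickle

    # Encoding side (presumably what to_apify_request stores in userData):
    scrapy_request_dict = {'url': 'https://example.com', 'method': 'GET'}
    encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()

    # Decoding side, structured exactly as the line shown in the diff:
    decoded = pickle.loads(codecs.decode(encoded.encode(), 'base64'))
    assert decoded == scrapy_request_dict
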
apify-1.5.2b1.dist-info/METADATA → apify-1.5.2b2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 1.5.2b1
+Version: 1.5.2b2
 Summary: Apify SDK for Python
 Author-email: "Apify Technologies s.r.o." <support@apify.com>
 License: Apache Software License
apify-1.5.2b1.dist-info/RECORD → apify-1.5.2b2.dist-info/RECORD RENAMED
@@ -20,21 +20,22 @@ apify/_memory_storage/resource_clients/key_value_store.py,sha256=Fbw2dDsO8WveQBM
 apify/_memory_storage/resource_clients/key_value_store_collection.py,sha256=Oy1jJ_wWd_g6DpLV4OAMBxfrnMxwtGDCG3mzVTL-Aww,1664
 apify/_memory_storage/resource_clients/request_queue.py,sha256=7LS_jrBBJvylFZedZHrgwMPyCsLz8X9-mAvvhOaYzXI,19614
 apify/_memory_storage/resource_clients/request_queue_collection.py,sha256=ydnYy2zu9wp_A58fr4LnENjvOVNEWgJ8vyG5jjHzXIA,1637
-apify/scrapy/__init__.py,sha256=z8uIRyjdp03tm2mmgRXff3LasUwuInnvkFt_L2Q-cGw,222
-apify/scrapy/pipelines.py,sha256=beBC1JwPU-51vEHL4k3GC1iljY74e5XAor8YBE8lE-I,955
-apify/scrapy/scheduler.py,sha256=XUo6_7ZqtoFsnLadG1985AZVfjMNZYOBMPB_UVE0zwc,4562
-apify/scrapy/utils.py,sha256=DcspnpCY7LVPGpFPrZzzn9nRB4p5gWgK-UOb1ChCwMA,6777
+apify/scrapy/__init__.py,sha256=tCnqsdzcCx0Rpx13r1THeEJ6SzEgjmyrmHVKdCgfEfo,174
+apify/scrapy/scheduler.py,sha256=Ogl789PrvY0jAAaK3hgTxjliNQ9Dx0cisE9YM8vm52Y,4926
+apify/scrapy/utils.py,sha256=2qphTfTVHu-wG-1Ibrp0mDG18ONaosZcX0kQlf_nkxY,7162
 apify/scrapy/middlewares/__init__.py,sha256=zzosV8BD8SZQIrVKsSaGFGV9rHinNLKm5GPL3ZNxSZQ,96
-apify/scrapy/middlewares/apify_proxy.py,sha256=f_913T1mqhgb9ca_s3fGy8WETlJGDU1IMHV8OCATGFM,5922
-apify/scrapy/middlewares/apify_retry.py,sha256=RrUMrXgk9FTydBG99VbD7m1nDtWccMsO_Kf-rNivunI,4559
+apify/scrapy/middlewares/apify_proxy.py,sha256=FFNGFq7danSNKl722nZ7zC-VKoMLRk6bVtlFL4cksvg,5902
+apify/scrapy/middlewares/apify_retry.py,sha256=HaQcYxoOFm_CMgTWkbL5HPoEhpMPFKkHn11_j-4JvwE,4590
+apify/scrapy/pipelines/__init__.py,sha256=KBUE3maOWrLfJRmWSsyW6YxxZY4lCGP2GZyMm9Z56VY,57
+apify/scrapy/pipelines/actor_dataset_push.py,sha256=qXtSFIUhIvqC8KGDdft4N_FmSA2qyfZfsjd9uSYTZo4,956
 apify/storages/__init__.py,sha256=rBdwhyZxUMG6m_7uAb4sl5eg_dxiLvYVas5aRcZ6PIE,268
 apify/storages/base_storage.py,sha256=LKkC0W6ndmvVI2huoiIArZUKuMF6aqwqpjGQPWCcx2s,7408
 apify/storages/dataset.py,sha256=x_rte5nVOMPdg3ui_KHpH71UdUT2gcN10bGnrLXB6xk,23291
 apify/storages/key_value_store.py,sha256=HKSF6odZTMWgFF6usS9l9xcCGLyRitRq59LAKbmyYAY,10730
 apify/storages/request_queue.py,sha256=b0Qh2d1BWDtdbf_adAVS68fVkdcR2gtL4KyfxAp1oMY,26915
 apify/storages/storage_client_manager.py,sha256=QAGbu47pwFkHa-AFfolNW3W5hvR7zNz2yxK9Sv0wQbA,2457
-apify-1.5.2b1.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-apify-1.5.2b1.dist-info/METADATA,sha256=OzQoXVVNAQ4DHjrK0qG5mwjvLp2c1dm4dVVKAGufH7M,6235
-apify-1.5.2b1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-apify-1.5.2b1.dist-info/top_level.txt,sha256=2oFNsHggn5m_rCaaP7xijQg_-Va2ByOSYuvKgACsS5w,6
-apify-1.5.2b1.dist-info/RECORD,,
+apify-1.5.2b2.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-1.5.2b2.dist-info/METADATA,sha256=Cw3UknTKsrRzu5DUsxVUUIbIRTyBhzt0IpDcs49UwPk,6235
+apify-1.5.2b2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+apify-1.5.2b2.dist-info/top_level.txt,sha256=2oFNsHggn5m_rCaaP7xijQg_-Va2ByOSYuvKgACsS5w,6
+apify-1.5.2b2.dist-info/RECORD,,