apify 2.2.2b1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

apify/scrapy/scheduler.py CHANGED

@@ -1,41 +1,33 @@
  from __future__ import annotations

  import traceback
+ from logging import getLogger
  from typing import TYPE_CHECKING

- from crawlee.storage_clients import MemoryStorageClient
+ from scrapy import Spider
+ from scrapy.core.scheduler import BaseScheduler
+ from scrapy.utils.reactor import is_asyncio_reactor_installed

- from apify._configuration import Configuration
+ from ._async_thread import AsyncThread
+ from .requests import to_apify_request, to_scrapy_request
+ from apify import Configuration
  from apify.apify_storage_client import ApifyStorageClient
+ from apify.storages import RequestQueue

- try:
-     from scrapy import Spider
-     from scrapy.core.scheduler import BaseScheduler
-     from scrapy.utils.reactor import is_asyncio_reactor_installed
-
-     if TYPE_CHECKING:
-         from scrapy.http.request import Request
- except ImportError as exc:
-     raise ImportError(
-         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-     ) from exc
-
- from crawlee._utils.crypto import crypto_random_object_id
+ if TYPE_CHECKING:
+     from scrapy.http.request import Request
+     from twisted.internet.defer import Deferred

- from apify import Actor
- from apify.scrapy.requests import to_apify_request, to_scrapy_request
- from apify.scrapy.utils import nested_event_loop
- from apify.storages import RequestQueue
+ logger = getLogger(__name__)


  class ApifyScheduler(BaseScheduler):
-     """A Scrapy scheduler that uses the Apify Request Queue to manage requests.
+     """A Scrapy scheduler that uses the Apify `RequestQueue` to manage requests.

      This scheduler requires the asyncio Twisted reactor to be installed.
      """

      def __init__(self) -> None:
-         """Create a new instance."""
          if not is_asyncio_reactor_installed():
              raise ValueError(
                  f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. '

@@ -45,7 +37,10 @@ class ApifyScheduler(BaseScheduler):
          self._rq: RequestQueue | None = None
          self.spider: Spider | None = None

-     def open(self, spider: Spider) -> None:  # this has to be named "open"
+         # A thread with the asyncio event loop to run coroutines on.
+         self._async_thread = AsyncThread()
+
+     def open(self, spider: Spider) -> Deferred[None] | None:
          """Open the scheduler.

          Args:

@@ -53,23 +48,42 @@ class ApifyScheduler(BaseScheduler):
          """
          self.spider = spider

-         async def open_queue() -> RequestQueue:
+         async def open_rq() -> RequestQueue:
              config = Configuration.get_global_configuration()
-
-             # Use the ApifyStorageClient if the Actor is running on the Apify platform,
-             # otherwise use the MemoryStorageClient.
-             storage_client = (
-                 ApifyStorageClient.from_config(config) if config.is_at_home else MemoryStorageClient.from_config(config)
-             )
-
-             return await RequestQueue.open(storage_client=storage_client)
+             if config.is_at_home:
+                 storage_client = ApifyStorageClient.from_config(config)
+                 return await RequestQueue.open(storage_client=storage_client)
+             return await RequestQueue.open()

          try:
-             self._rq = nested_event_loop.run_until_complete(open_queue())
-         except BaseException:
+             self._rq = self._async_thread.run_coro(open_rq())
+         except Exception:
              traceback.print_exc()
              raise

+         return None
+
+     def close(self, reason: str) -> None:
+         """Close the scheduler.
+
+         Shut down the event loop and its thread gracefully.
+
+         Args:
+             reason: The reason for closing the spider.
+         """
+         logger.debug(f'Closing {self.__class__.__name__} due to {reason}...')
+         try:
+             self._async_thread.close()
+
+         except KeyboardInterrupt:
+             logger.warning('Shutdown interrupted by KeyboardInterrupt!')
+
+         except Exception:
+             logger.exception('Exception occurred while shutting down.')
+
+         finally:
+             logger.debug(f'{self.__class__.__name__} closed successfully.')
+
      def has_pending_requests(self) -> bool:
          """Check if the scheduler has any pending requests.

@@ -80,8 +94,8 @@ class ApifyScheduler(BaseScheduler):
              raise TypeError('self._rq must be an instance of the RequestQueue class')

          try:
-             is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
-         except BaseException:
+             is_finished = self._async_thread.run_coro(self._rq.is_finished())
+         except Exception:
              traceback.print_exc()
              raise

@@ -98,29 +112,27 @@ class ApifyScheduler(BaseScheduler):
          Returns:
              True if the request was successfully enqueued, False otherwise.
          """
-         call_id = crypto_random_object_id(8)
-         Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
+         logger.debug(f'ApifyScheduler.enqueue_request was called (scrapy_request={request})...')

          if not isinstance(self.spider, Spider):
              raise TypeError('self.spider must be an instance of the Spider class')

          apify_request = to_apify_request(request, spider=self.spider)
          if apify_request is None:
-             Actor.log.error(f'Request {request} was not enqueued because it could not be converted to Apify request.')
+             logger.error(f'Request {request} could not be converted to Apify request.')
              return False

-         Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
-
+         logger.debug(f'Converted to apify_request: {apify_request}')
          if not isinstance(self._rq, RequestQueue):
              raise TypeError('self._rq must be an instance of the RequestQueue class')

          try:
-             result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
-         except BaseException:
+             result = self._async_thread.run_coro(self._rq.add_request(apify_request))
+         except Exception:
              traceback.print_exc()
              raise

-         Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
+         logger.debug(f'rq.add_request result: {result}')
          return bool(result.was_already_present)

      def next_request(self) -> Request | None:

@@ -129,40 +141,31 @@ class ApifyScheduler(BaseScheduler):
          Returns:
              The next request, or None if there are no more requests.
          """
-         call_id = crypto_random_object_id(8)
-         Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
-
+         logger.debug('next_request called...')
          if not isinstance(self._rq, RequestQueue):
              raise TypeError('self._rq must be an instance of the RequestQueue class')

-         # Fetch the next request from the Request Queue
          try:
-             apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
-         except BaseException:
+             apify_request = self._async_thread.run_coro(self._rq.fetch_next_request())
+         except Exception:
              traceback.print_exc()
              raise

-         Actor.log.debug(
-             f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})'
-         )
-
+         logger.debug(f'Fetched apify_request: {apify_request}')
          if apify_request is None:
              return None

          if not isinstance(self.spider, Spider):
              raise TypeError('self.spider must be an instance of the Spider class')

-         # Let the Request Queue know that the request is being handled. Every request should be marked as handled,
-         # retrying is handled by the Scrapy's RetryMiddleware.
+         # Let the request queue know that the request is being handled. Every request should
+         # be marked as handled, retrying is handled by the Scrapy's RetryMiddleware.
          try:
-             nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
-         except BaseException:
+             self._async_thread.run_coro(self._rq.mark_request_as_handled(apify_request))
+         except Exception:
              traceback.print_exc()
              raise

          scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
-         Actor.log.debug(
-             f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned '
-             f'(scrapy_request={scrapy_request})',
-         )
+         logger.debug(f'Converted to scrapy_request: {scrapy_request}')
          return scrapy_request
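
The recurring edit in this file is mechanical: every nested_event_loop.run_until_complete(...) call becomes self._async_thread.run_coro(...). Instead of spinning a nested asyncio loop on Scrapy's own thread, the scheduler now delegates coroutines to a dedicated background thread that owns its loop, and shuts that thread down in the new close() method. The new apify/scrapy/_async_thread.py module itself is not part of this diff, so the following is only a minimal sketch of how such a helper is commonly built; the run_coro() and close() names come from the call sites above, everything else is an assumption.

# Sketch only: the real apify/scrapy/_async_thread.py is not shown in this diff.
import asyncio
from collections.abc import Coroutine
from threading import Thread
from typing import Any


class AsyncThread:
    """Runs a dedicated asyncio event loop in a background thread."""

    def __init__(self) -> None:
        self._loop = asyncio.new_event_loop()
        # Daemon thread, so a stuck loop cannot keep the process alive forever.
        self._thread = Thread(target=self._loop.run_forever, daemon=True)
        self._thread.start()

    def run_coro(self, coro: Coroutine, timeout: float | None = None) -> Any:
        """Schedule a coroutine on the loop thread and block until it finishes."""
        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        return future.result(timeout=timeout)

    def close(self) -> None:
        """Stop the loop from outside and wait for the thread to exit."""
        self._loop.call_soon_threadsafe(self._loop.stop)
        self._thread.join()
        self._loop.close()

One consequence worth noting: run_coro() blocks the calling Scrapy thread, which is what lets synchronous BaseScheduler hooks such as next_request() drive async RequestQueue methods without an event loop of their own.
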
apify/scrapy/utils.py CHANGED

@@ -1,29 +1,16 @@
  from __future__ import annotations

- import asyncio
  from base64 import b64encode
  from typing import TYPE_CHECKING
  from urllib.parse import unquote

- from apify_shared.utils import ignore_docs
+ from scrapy.utils.project import get_project_settings
+ from scrapy.utils.python import to_bytes

- try:
-     from scrapy.utils.project import get_project_settings
-     from scrapy.utils.python import to_bytes
+ if TYPE_CHECKING:
+     from scrapy.settings import Settings

-     if TYPE_CHECKING:
-         from scrapy.settings import Settings
- except ImportError as exc:
-     raise ImportError(
-         'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
-         '"pip install apify[scrapy]".'
-     ) from exc

-
- nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
-
-
- @ignore_docs
  def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
      """Generate a basic authentication header for the given username and password."""
      string = f'{unquote(username)}:{unquote(password)}'

@@ -31,18 +18,6 @@ def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'la
      return b'Basic ' + b64encode(user_pass)


- @ignore_docs
- def get_running_event_loop_id() -> int:
-     """Get the ID of the currently running event loop.
-
-     It could be useful mainly for debugging purposes.
-
-     Returns:
-         The ID of the event loop.
-     """
-     return id(asyncio.get_running_loop())
-
-
  def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
      """Integrates Apify configuration into a Scrapy project settings.

@@ -65,10 +40,6 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
      # ensuring it is executed as the final step in the pipeline sequence
      settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

-     # Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request
-     # during process_response, but currently we have no way of detecting it and handling it properly.
-     settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None
-
      # Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
      settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
      settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750
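
For context on the function this hunk modifies: apply_apify_settings() folds the Apify scheduler, item pipeline, and proxy middleware into an existing Scrapy Settings object, and as of this release it no longer disables AjaxCrawlMiddleware. A hedged usage sketch follows; the proxy_config value and the spider name are illustrative assumptions, not values taken from this diff.

# Sketch only: a typical way to apply Apify settings before starting a crawl.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from apify.scrapy.utils import apply_apify_settings

settings = apply_apify_settings(
    settings=get_project_settings(),
    proxy_config={'useApifyProxy': True},  # assumed proxy configuration shape
)

process = CrawlerProcess(settings, install_root_handler=False)
# process.crawl(MySpider)  # MySpider is a placeholder for your spider class
# process.start()
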
{apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/METADATA RENAMED

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: apify
- Version: 2.2.2b1
+ Version: 2.3.0
  Summary: Apify SDK for Python
  License: Apache-2.0
  Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping

@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Software Development :: Libraries
  Provides-Extra: scrapy
- Requires-Dist: apify-client (>=1.8.1)
+ Requires-Dist: apify-client (>=1.9.2)
  Requires-Dist: apify-shared (>=1.2.1)
  Requires-Dist: crawlee (>=0.5.1,<0.6.0)
  Requires-Dist: cryptography (>=42.0.0)
{apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/RECORD RENAMED

@@ -1,14 +1,15 @@
  apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
- apify/_actor.py,sha256=Pb7HPHIAodQOiGtyb-At45x8GfDItWCusRtQkoz1Pq4,46138
- apify/_configuration.py,sha256=T3Z_o_W98iSyTbrutfb578yW51aexZ_V0FcLwTxFLjI,10878
+ apify/_actor.py,sha256=EB3gGjASV0PbPJ6BtgOq45HN23vM-9ceNCNRfeh2BkQ,48821
+ apify/_charging.py,sha256=m7hJIQde4M7vS4g_4hsNRP5xHNXjYQ8MyqOEGeNb7VY,12267
+ apify/_configuration.py,sha256=yidcWHsu-IJ2mmLmXStKq_HHcdfQxZq7koYjlZfRnQ8,11128
  apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
  apify/_crypto.py,sha256=e0_aM3l9_5Osk-jszYOOjrAKK60OggSHbiw5c30QnsU,5638
- apify/_models.py,sha256=Btlz-23obKY5tJ75JnUwkVNC2lmU1IEBbdU3HvWaVhg,5748
+ apify/_models.py,sha256=uWazgwDWH3OneftJ3ArhPz9N3oVuam5pdmlOQ1GE0NU,7905
  apify/_platform_event_manager.py,sha256=44xyV0Lpzf4h4VZ0rkyYg_nhbQkEONNor8_Z9gIKO40,7899
  apify/_proxy_configuration.py,sha256=c-O6_PZ9pUD-i4J0RFEKTtfyJPP2rTRJJA1TH8NVsV8,13189
- apify/_utils.py,sha256=CCLkpAsZKp00ykm88Z_Fbck5PNT0j6mJYOuD0RxzZUs,1620
+ apify/_utils.py,sha256=92byxeXTpDFwhBq7ZS-obeXKtKWvVzCZMV0Drg3EjhQ,1634
  apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
- apify/apify_storage_client/_apify_storage_client.py,sha256=0rS75JoRHt7stRYS9-oqm3DmaSIZQN5C11N5MZQUvlA,2616
+ apify/apify_storage_client/_apify_storage_client.py,sha256=jTX5vd-K9mnFTyZu2V2dUg7oyWogvmNIDUlEXnvIlOw,2766
  apify/apify_storage_client/_dataset_client.py,sha256=UUodnR_MQBg5RkURrfegkGJWR5OmdPPgPfGepvkdQoU,5580
  apify/apify_storage_client/_dataset_collection_client.py,sha256=qCcKZlA0bkO-sL7xED0Yose85NlrRa9AKr4oCSrYX6k,1489
  apify/apify_storage_client/_key_value_store_client.py,sha256=MSuoIeqEHLu92WfUU7kyB3Cc_gKUlm8TghnU3_xkPtE,3363

@@ -18,21 +19,24 @@ apify/apify_storage_client/_request_queue_collection_client.py,sha256=MdzgbQb2D8
  apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  apify/log.py,sha256=j-E4t-WeA93bc1NCQRG8sTntehQCiiN8ia-MdQe3_Ts,1291
  apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify/scrapy/__init__.py,sha256=HE5wCN7-DZKPydLCOvjNyLuL3CvN2fUFweXfrDfe1Ss,348
+ apify/scrapy/__init__.py,sha256=m2a0ts_JY9xJkBy4JU5mV8PJqjA3GGKLXBFu4nl-n-A,1048
+ apify/scrapy/_actor_runner.py,sha256=rXWSnlQWGskDUH8PtLCv5SkOIx4AiVa4QbCYeCett5c,938
+ apify/scrapy/_async_thread.py,sha256=AfeH9ZkSRZXxL11wzwrroDNsTzq4tAvURlinUZBtYMA,4753
+ apify/scrapy/_logging_config.py,sha256=hFq90fNtZyjjJA7w2k-mtuEC8xCFiBMTalbwPDcaig4,2022
  apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
- apify/scrapy/middlewares/apify_proxy.py,sha256=H8a3vrA_7S_ucRkE3VDNMU8TY2CdzGTMXbhbJbfLv1c,5755
+ apify/scrapy/middlewares/apify_proxy.py,sha256=CDAOXS3bcVDZHM3B0GvhXbxEikMIadLF_0P73WL_nI4,5550
  apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
- apify/scrapy/pipelines/actor_dataset_push.py,sha256=otggoULfUdCqOPJLb9wMROZ9WylnlL-209930tMS2Rg,971
+ apify/scrapy/pipelines/actor_dataset_push.py,sha256=XUUyznQTD-E3wYUUFt2WAOnWhbnRrY0WuedlfYfYhDI,846
  apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify/scrapy/requests.py,sha256=yZ9hIsz2YyqOoOwzN9F1h76wG4qwawrI6h_6xq0I7Iw,7599
- apify/scrapy/scheduler.py,sha256=blO333BhFDMu3wAvSQONYdhmAmyiysqsv3YF5FKO_20,6281
- apify/scrapy/utils.py,sha256=758DcHCSAgCTProY0QX74uJ1XrzVsQwvCmFanj2f_3Q,2928
+ apify/scrapy/requests.py,sha256=tOiFtG0kyzbBwtmaOisLAcpJENR1eDtpPR1nRH7JJGg,6551
+ apify/scrapy/scheduler.py,sha256=-r1wZjMmeRDPxZKGHO-EYDYpGdDgSPAdNgMFViqUK8E,6019
+ apify/scrapy/utils.py,sha256=5cka33PWc_at14yjhnLkCvY4h-ySUgVVhhDLxTy39ZI,1965
  apify/storages/__init__.py,sha256=FW-z6ubuPnHGM-Wp15T8mR5q6lnpDGrCW-IkgZd5L30,177
  apify/storages/_request_list.py,sha256=-lZJcE5nq69aJhGFJ7Sh2ctqgAWUDyOwYm5_0y1hdAE,5865
  apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify-2.2.2b1.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
- apify-2.2.2b1.dist-info/METADATA,sha256=vHXzlhslB1UvfspGitKBpmPXCeqxeRcBgMJ523JtHD8,8696
- apify-2.2.2b1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
- apify-2.2.2b1.dist-info/RECORD,,
+ apify-2.3.0.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+ apify-2.3.0.dist-info/METADATA,sha256=CZJsIK4N-dhBV72efiPK19wTsFzPh9vd8i12FVebJkE,8694
+ apify-2.3.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+ apify-2.3.0.dist-info/RECORD,,
{apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/WHEEL RENAMED

@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.0.1
+ Generator: poetry-core 2.1.1
  Root-Is-Purelib: true
  Tag: py3-none-any