apify-2.2.2b2-py3-none-any.whl → apify-2.3.0b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

apify/_actor.py CHANGED
@@ -270,8 +270,8 @@ class _ActorType:
             self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython')
         elif os.getenv('PYTEST_CURRENT_TEST', default=False):  # noqa: PLW1508
             self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test')
-        elif hasattr(asyncio, '_nest_patched'):
-            self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in a nested event loop')
+        elif os.getenv('SCRAPY_SETTINGS_MODULE'):
+            self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running with Scrapy')
         else:
             sys.exit(exit_code)
 
apify/apify_storage_client/_apify_storage_client.py CHANGED
@@ -66,3 +66,7 @@ class ApifyStorageClient(BaseStorageClient):
     @override
     async def purge_on_start(self) -> None:
         pass
+
+    @override
+    def get_rate_limit_errors(self) -> dict[int, int]:  # type: ignore[misc]
+        return self._apify_client.stats.rate_limit_errors
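The new override surfaces the rate-limit statistics already collected by the underlying Apify API client. A minimal sketch of reading them, assuming the Actor runs on the Apify platform so ApifyStorageClient.from_config() can authenticate; everything except the new get_rate_limit_errors() call is existing SDK surface:

    # Hedged sketch: read the rate-limit error counters exposed by the new override.
    from apify import Configuration
    from apify.apify_storage_client import ApifyStorageClient

    config = Configuration.get_global_configuration()
    storage_client = ApifyStorageClient.from_config(config)  # assumes Apify credentials are configured

    # Declared as dict[int, int] in the diff above (attempt index -> number of rate-limit errors seen).
    print(storage_client.get_rate_limit_errors())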
apify/scrapy/__init__.py CHANGED
@@ -1,11 +1,32 @@
-from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.scheduler import ApifyScheduler
-from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports use try_import to handle optional dependencies, as they may not always be available.
+
+with _try_import(__name__, 'run_scrapy_actor'):
+    from ._actor_runner import run_scrapy_actor
+
+with _try_import(__name__, 'initialize_logging'):
+    from ._logging_config import initialize_logging
+
+with _try_import(__name__, 'to_apify_request', 'to_scrapy_request'):
+    from .requests import to_apify_request, to_scrapy_request
+
+with _try_import(__name__, 'ApifyScheduler'):
+    from .scheduler import ApifyScheduler
+
+with _try_import(__name__, 'apply_apify_settings', 'get_basic_auth_header'):
+    from .utils import apply_apify_settings, get_basic_auth_header
+
 
 __all__ = [
     'ApifyScheduler',
+    'apply_apify_settings',
     'get_basic_auth_header',
-    'get_running_event_loop_id',
+    'initialize_logging',
+    'run_scrapy_actor',
     'to_apify_request',
     'to_scrapy_request',
 ]
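The module now exports run_scrapy_actor, initialize_logging and apply_apify_settings alongside the existing helpers, with imports guarded by crawlee's try_import hook so a missing "scrapy" extra only fails when a symbol is actually used. A hedged sketch of the process bootstrap these exports are designed for; the SCRAPY_SETTINGS_MODULE value is an assumption, and a fuller main() that actually starts a crawl is sketched after the _actor_runner.py diff below:

    # Hedged bootstrap sketch (e.g. a Scrapy Actor's __main__); 'src.settings' is an assumed layout.
    from __future__ import annotations

    import os

    from twisted.internet import asyncioreactor

    # Install the asyncio reactor before any Twisted/Scrapy networking code gets imported.
    asyncioreactor.install()

    from apify import Actor
    from apify.scrapy import initialize_logging, run_scrapy_actor

    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'src.settings')


    async def main() -> None:
        async with Actor:
            Actor.log.info('Scrapy Actor is running...')


    if __name__ == '__main__':
        initialize_logging()
        run_scrapy_actor(main())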
apify/scrapy/_actor_runner.py ADDED
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING
+
+from twisted.internet.defer import Deferred, ensureDeferred
+from twisted.internet.task import react
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine
+
+
+async def _run_coro_as_deferred(coro: Coroutine) -> None:
+    """Wrap the given asyncio coroutine in a Task and await its result as a Twisted Deferred."""
+    task = asyncio.ensure_future(coro)
+    await Deferred.fromFuture(task)
+
+
+def run_scrapy_actor(coro: Coroutine) -> None:
+    """Start Twisted's reactor and execute the provided Actor coroutine.
+
+    This function initiates the Twisted reactor and runs the given asyncio coroutine (typically the
+    Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted event loops,
+    enabling the Apify and Scrapy integration to work together.
+    """
+    react(lambda _: ensureDeferred(_run_coro_as_deferred(coro)))
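run_scrapy_actor is intended to drive an Actor main() that starts a Scrapy crawl under the asyncio Twisted reactor; note that twisted.internet.task.react blocks and exits the process when the coroutine finishes. A hedged sketch of such a coroutine; ExampleSpider and the proxyConfiguration input field follow common Apify Scrapy templates and are not part of this diff:

    # Hedged sketch of the coroutine handed to run_scrapy_actor(); spider and input names are illustrative.
    from __future__ import annotations

    import scrapy
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.defer import deferred_to_future

    from apify import Actor
    from apify.scrapy import apply_apify_settings


    class ExampleSpider(scrapy.Spider):  # minimal stand-in; a real project imports its own spider
        name = 'example'
        start_urls = ['https://example.com']

        def parse(self, response):
            return


    async def main() -> None:
        async with Actor:
            actor_input = await Actor.get_input() or {}

            # Merge Apify-specific scheduler/pipeline/middleware settings into the project settings.
            settings = apply_apify_settings(proxy_config=actor_input.get('proxyConfiguration'))

            runner = CrawlerRunner(settings)
            await deferred_to_future(runner.crawl(ExampleSpider))

    # Typically invoked from the entry point as run_scrapy_actor(main()), as in the bootstrap sketch above.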
apify/scrapy/_async_thread.py ADDED
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import asyncio
+import threading
+from concurrent import futures
+from datetime import timedelta
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine
+
+logger = getLogger(__name__)
+
+
+class AsyncThread:
+    """Class for running an asyncio event loop in a separate thread.
+
+    This allows running asynchronous coroutines from synchronous code by executing them on an event loop
+    that runs in its own dedicated thread.
+    """
+
+    def __init__(self) -> None:
+        self._eventloop = asyncio.new_event_loop()
+
+        # Start the event loop in a dedicated daemon thread.
+        self._thread = threading.Thread(
+            target=self._start_event_loop,
+            daemon=True,
+        )
+        self._thread.start()
+
+    def run_coro(
+        self,
+        coro: Coroutine,
+        timeout: timedelta = timedelta(seconds=60),
+    ) -> Any:
+        """Run a coroutine on an event loop running in a separate thread.
+
+        This method schedules the coroutine to run on the event loop and blocks until the coroutine completes
+        or the specified timeout is reached.
+
+        Args:
+            coro: The coroutine to run.
+            timeout: The maximum number of seconds to wait for the coroutine to finish.
+
+        Returns:
+            The result returned by the coroutine.
+
+        Raises:
+            RuntimeError: If the event loop is not running.
+            TimeoutError: If the coroutine does not complete within the timeout.
+            Exception: Any exception raised during coroutine execution.
+        """
+        if not self._eventloop.is_running():
+            raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is not running.')
+
+        # Submit the coroutine to the event loop running in the other thread.
+        future = asyncio.run_coroutine_threadsafe(coro, self._eventloop)
+        try:
+            # Wait for the coroutine's result until the specified timeout.
+            return future.result(timeout=timeout.total_seconds())
+        except futures.TimeoutError as exc:
+            logger.exception('Coroutine execution timed out.', exc_info=exc)
+            raise
+        except Exception as exc:
+            logger.exception('Coroutine execution raised an exception.', exc_info=exc)
+            raise
+
+    def close(self, timeout: timedelta = timedelta(seconds=60)) -> None:
+        """Close the event loop and its thread gracefully.
+
+        This method cancels all pending tasks, stops the event loop, and waits for the thread to exit.
+        If the thread does not exit within the given timeout, a forced shutdown is attempted.
+
+        Args:
+            timeout: The maximum number of seconds to wait for the event loop thread to exit.
+        """
+        if self._eventloop.is_running():
+            # Cancel all pending tasks in the event loop.
+            self.run_coro(self._shutdown_tasks())
+
+            # Schedule the event loop to stop.
+            self._eventloop.call_soon_threadsafe(self._eventloop.stop)
+
+        # Wait for the event loop thread to finish execution.
+        self._thread.join(timeout=timeout.total_seconds())
+
+        # If the thread is still running after the timeout, force a shutdown.
+        if self._thread.is_alive():
+            logger.warning('Event loop thread did not exit cleanly! Forcing shutdown...')
+            self._force_exit_event_loop()
+
+    def _start_event_loop(self) -> None:
+        """Set up and run the asyncio event loop in the dedicated thread."""
+        asyncio.set_event_loop(self._eventloop)
+        try:
+            self._eventloop.run_forever()
+        finally:
+            self._eventloop.close()
+            logger.debug('Asyncio event loop has been closed.')
+
+    async def _shutdown_tasks(self) -> None:
+        """Cancel all pending tasks in the event loop."""
+        # Retrieve all tasks for the event loop, excluding the current task.
+        tasks = [task for task in asyncio.all_tasks(self._eventloop) if task is not asyncio.current_task()]
+
+        # Cancel each pending task.
+        for task in tasks:
+            task.cancel()
+
+        # Wait until all tasks have been cancelled or finished.
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    def _force_exit_event_loop(self) -> None:
+        """Forcefully shut down the event loop and its thread."""
+        try:
+            logger.info('Forced shutdown of the event loop and its thread...')
+            self._eventloop.call_soon_threadsafe(self._eventloop.stop)
+            self._thread.join(timeout=5)
+        except Exception as exc:
+            logger.exception('Exception occurred during forced event loop shutdown.', exc_info=exc)
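AsyncThread replaces the previous nested_event_loop helper: synchronous Scrapy callbacks submit coroutines to a background asyncio loop and block on the result. A small standalone sketch of the pattern, directly based on the class above (the module is private, so the import path may change):

    # Standalone usage sketch of AsyncThread as introduced in this release.
    import asyncio
    from datetime import timedelta

    from apify.scrapy._async_thread import AsyncThread  # private module; path taken from the RECORD below


    async def add(a: int, b: int) -> int:
        await asyncio.sleep(0.1)
        return a + b


    thread = AsyncThread()  # the loop starts in a daemon thread almost immediately after construction
    try:
        # Blocks the calling (synchronous) thread until the coroutine finishes on the background loop.
        result = thread.run_coro(add(1, 2), timeout=timedelta(seconds=5))
        print(result)  # 3
    finally:
        thread.close()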
apify/scrapy/_logging_config.py ADDED
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from scrapy.utils import log as scrapy_logging
+from scrapy.utils.project import get_project_settings
+
+from apify.log import ActorLogFormatter
+
+# Define logger names.
+_PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+_ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
+
+
+def _configure_logger(name: str | None, logging_level: str, handler: logging.Handler) -> None:
+    """Clear and reconfigure the logger."""
+    logger = logging.getLogger(name)
+    logger.handlers.clear()
+    logger.setLevel(logging_level)
+
+    if name is None:  # Root logger.
+        logger.addHandler(handler)
+        logger.propagate = False
+    else:
+        logger.propagate = True
+
+
+def initialize_logging() -> None:
+    """Configure logging for Apify Actors and adjust Scrapy's logging settings."""
+    # Retrieve Scrapy project settings and determine the logging level.
+    settings = get_project_settings()
+    logging_level = settings.get('LOG_LEVEL', 'INFO')  # Default to INFO.
+
+    # Create a custom handler with the Apify log formatter.
+    handler = logging.StreamHandler()
+    handler.setFormatter(ActorLogFormatter(include_logger_name=True))
+
+    # Configure the root logger and all other defined loggers.
+    for logger_name in [None, *_ALL_LOGGERS]:
+        _configure_logger(logger_name, logging_level, handler)
+
+    # Set the 'httpx' logger to a less verbose level.
+    logging.getLogger('httpx').setLevel('WARNING')
+
+    # Monkey-patch Scrapy's logging configuration to re-apply our settings.
+    original_configure_logging = scrapy_logging.configure_logging
+
+    def new_configure_logging(*args: Any, **kwargs: Any) -> None:
+        original_configure_logging(*args, **kwargs)
+        for logger_name in [None, *_ALL_LOGGERS]:
+            _configure_logger(logger_name, logging_level, handler)
+
+    scrapy_logging.configure_logging = new_configure_logging
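initialize_logging reads the logging level from the Scrapy project settings (LOG_LEVEL, defaulting to INFO) and re-applies its configuration whenever Scrapy calls configure_logging. A hedged sketch of the knobs it depends on; the settings-module path is an assumption for illustration:

    # Hedged sketch: LOG_LEVEL comes from the Scrapy project settings located via SCRAPY_SETTINGS_MODULE.
    import logging
    import os

    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'src.settings')  # assumed path; set LOG_LEVEL there

    from apify.scrapy import initialize_logging

    initialize_logging()

    # The 'apify', 'apify_client' and 'scrapy' loggers now propagate to the root handler using
    # ActorLogFormatter; 'httpx' is forced to WARNING regardless of LOG_LEVEL.
    logging.getLogger('apify').info('Formatted by ActorLogFormatter.')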
apify/scrapy/middlewares/apify_proxy.py CHANGED
@@ -3,19 +3,15 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
-try:
-    if TYPE_CHECKING:
-        from scrapy import Request, Spider
-        from scrapy.crawler import Crawler
-    from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.exceptions import NotConfigured
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
+from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.exceptions import NotConfigured
 
 from apify import Actor, ProxyConfiguration
-from apify.scrapy.utils import get_basic_auth_header
+from apify.scrapy import get_basic_auth_header
+
+if TYPE_CHECKING:
+    from scrapy import Request, Spider
+    from scrapy.crawler import Crawler
 
 
 class ApifyHttpProxyMiddleware:
@@ -51,7 +47,7 @@ class ApifyHttpProxyMiddleware:
         proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')
 
         if proxy_settings is None:
-            Actor.log.warning(
+            Actor.log.info(
                 'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing '
                 ' in the Actor input.'
            )
@@ -60,7 +56,7 @@ class ApifyHttpProxyMiddleware:
         use_apify_proxy = proxy_settings.get('useApifyProxy', False)
 
         if use_apify_proxy is not True:
-            Actor.log.warning(
+            Actor.log.info(
                 'ApifyHttpProxyMiddleware is not going to be used. Actor input field '
                 '"proxyConfiguration.useApifyProxy" is set to False.'
            )
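The two log-level changes mean that a missing or disabled proxy configuration is now reported as informational rather than as a warning. A hedged sketch of the input shape the middleware looks for; the field names follow the Apify "proxyConfiguration" input convention referenced in the messages above, the group name is purely illustrative:

    # Hedged sketch of the Actor input object that ends up in crawler.settings['APIFY_PROXY_SETTINGS'].
    proxy_configuration = {
        'useApifyProxy': True,                # False -> the middleware now logs an info message and stays off
        'apifyProxyGroups': ['RESIDENTIAL'],  # optional; illustrative group name
    }

    # It is expected to reach the middleware via apply_apify_settings(proxy_config=proxy_configuration);
    # that plumbing is not shown in the visible hunks of this diff.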
apify/scrapy/pipelines/actor_dataset_push.py CHANGED
@@ -1,19 +1,17 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from itemadapter.adapter import ItemAdapter
 
-try:
-    if TYPE_CHECKING:
-        from scrapy import Item, Spider
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
 from apify import Actor
 
+if TYPE_CHECKING:
+    from scrapy import Item, Spider
+
+logger = getLogger(__name__)
+
 
 class ActorDatasetPushPipeline:
     """A Scrapy pipeline for pushing items to an Actor's default dataset.
@@ -28,6 +26,6 @@ class ActorDatasetPushPipeline:
     ) -> Item:
         """Pushes the provided Scrapy item to the Actor's default dataset."""
         item_dict = ItemAdapter(item).asdict()
-        Actor.log.debug(f'Pushing item={item_dict} produced by spider={spider} to the dataset.')
+        logger.debug(f'Pushing item={item_dict} produced by spider={spider} to the dataset.')
         await Actor.push_data(item_dict)
         return item
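Apart from switching to a module-level logger, the pipeline behaves as before: every item a spider yields is pushed to the Actor's default dataset once the pipeline is enabled (apply_apify_settings registers it at priority 1000, per the utils.py hunk further below). Illustrative spider; the class name, URL and fields are assumptions:

    # Illustrative spider whose yielded dicts end up in the default dataset via ActorDatasetPushPipeline.
    from __future__ import annotations

    import scrapy


    class TitleSpider(scrapy.Spider):
        name = 'title'
        start_urls = ['https://example.com']

        def parse(self, response):
            # Each yielded item is passed through the pipeline and pushed with Actor.push_data().
            yield {'url': response.url, 'title': response.css('title::text').get()}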
apify/scrapy/requests.py CHANGED
@@ -2,37 +2,21 @@ from __future__ import annotations
 
 import codecs
 import pickle
+from logging import getLogger
 from typing import Any, cast
 
-from apify_shared.utils import ignore_docs
+from scrapy import Request as ScrapyRequest
+from scrapy import Spider
+from scrapy.http.headers import Headers
+from scrapy.utils.request import request_from_dict
 
-try:
-    from scrapy import Request, Spider
-    from scrapy.http.headers import Headers
-    from scrapy.utils.request import request_from_dict
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
-from crawlee import Request as CrawleeRequest
+from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
-from crawlee._utils.crypto import crypto_random_object_id
-from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
-
-from apify import Actor
-
 
-def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
-    """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
-
-    Works for RetryMiddleware and RedirectMiddleware.
-    """
-    return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
+logger = getLogger(__name__)
 
 
-@ignore_docs
-def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
+def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
     """Convert a Scrapy request to an Apify request.
 
     Args:
@@ -42,54 +26,45 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
     Returns:
         The converted Apify request if the conversion was successful, otherwise None.
     """
-    if not isinstance(scrapy_request, Request):
-        Actor.log.warning(  # type: ignore[unreachable]
-            'Failed to convert to Apify request: Scrapy request must be a Request instance.'
-        )
+    if not isinstance(scrapy_request, ScrapyRequest):
+        logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.')  # type: ignore[unreachable]
         return None
 
-    call_id = crypto_random_object_id(8)
-    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
+    logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...')
+
+    # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
+    request_kwargs: dict[str, Any] = {
+        'url': scrapy_request.url,
+        'method': scrapy_request.method,
+        'payload': scrapy_request.body,
+        'use_extended_unique_key': True,
+        'keep_url_fragment': False,
+    }
 
     try:
-        if _is_request_produced_by_middleware(scrapy_request):
-            unique_key = compute_unique_key(
-                url=scrapy_request.url,
-                method=scrapy_request.method,  # type: ignore[arg-type]  # str vs literal
-                payload=scrapy_request.body,
-                use_extended_unique_key=True,
-            )
-        elif scrapy_request.dont_filter:
-            unique_key = crypto_random_object_id(8)
-        elif scrapy_request.meta.get('apify_request_unique_key'):
-            unique_key = scrapy_request.meta['apify_request_unique_key']
+        if scrapy_request.dont_filter:
+            request_kwargs['always_enqueue'] = True
         else:
-            unique_key = crypto_random_object_id(8)
+            if scrapy_request.meta.get('apify_request_unique_key'):
+                request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
 
-        if scrapy_request.meta.get('apify_request_id'):
-            request_id = scrapy_request.meta['apify_request_id']
-        else:
-            request_id = unique_key_to_request_id(unique_key)
-
-        apify_request = CrawleeRequest(
-            url=scrapy_request.url,
-            method=scrapy_request.method,
-            payload=scrapy_request.body,
-            user_data=scrapy_request.meta.get('userData', {}),
-            unique_key=unique_key,
-            id=request_id,
-        )
+            if scrapy_request.meta.get('apify_request_id'):
+                request_kwargs['id'] = scrapy_request.meta['apify_request_id']
+
+        request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
 
         # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
         if isinstance(scrapy_request.headers, Headers):
-            apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
+            request_kwargs['headers'] = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
         else:
-            Actor.log.warning(  # type: ignore[unreachable]
+            logger.warning(  # type: ignore[unreachable]
                 f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
            )
 
-        # Serialize the Scrapy Request and store it in the apify_request.
-        # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
+        apify_request = ApifyRequest.from_url(**request_kwargs)
+
+        # Serialize the Scrapy ScrapyRequest and store it in the apify_request.
+        # - This process involves converting the Scrapy ScrapyRequest object into a dictionary, encoding it to base64,
         # and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
         # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
         scrapy_request_dict = scrapy_request.to_dict(spider=spider)
@@ -97,15 +72,14 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
         apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded
 
     except Exception as exc:
-        Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
+        logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
         return None
 
-    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
+    logger.debug(f'scrapy_request was converted to the apify_request={apify_request}')
     return apify_request
 
 
-@ignore_docs
-def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
+def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
     """Convert an Apify request to a Scrapy request.
 
     Args:
@@ -113,24 +87,23 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
         spider: The Scrapy spider that the request is associated with.
 
     Raises:
-        TypeError: If the apify_request is not a crawlee request.
-        ValueError: If the apify_request does not contain the required keys.
+        TypeError: If the Apify request is not an instance of the `ApifyRequest` class.
+        ValueError: If the Apify request does not contain the required keys.
 
     Returns:
         The converted Scrapy request.
     """
-    if not isinstance(cast(Any, apify_request), CrawleeRequest):
-        raise TypeError('apify_request must be a crawlee.Request instance')
+    if not isinstance(cast(Any, apify_request), ApifyRequest):
+        raise TypeError('apify_request must be a crawlee.ScrapyRequest instance')
 
-    call_id = crypto_random_object_id(8)
-    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
+    logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')
 
     # If the apify_request comes from the Scrapy
     if 'scrapy_request' in apify_request.user_data:
-        # Deserialize the Scrapy Request from the apify_request.
+        # Deserialize the Scrapy ScrapyRequest from the apify_request.
         # - This process involves decoding the base64-encoded request data and reconstructing
-        #   the Scrapy Request object from its dictionary representation.
-        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
+        #   the Scrapy ScrapyRequest object from its dictionary representation.
+        logger.debug('Restoring the Scrapy ScrapyRequest from the apify_request...')
 
         scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
         if not isinstance(scrapy_request_dict_encoded, str):
@@ -141,10 +114,10 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
            raise TypeError('scrapy_request_dict must be a dictionary')
 
        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
-        if not isinstance(scrapy_request, Request):
-            raise TypeError('scrapy_request must be an instance of the Request class')
+        if not isinstance(scrapy_request, ScrapyRequest):
+            raise TypeError('scrapy_request must be an instance of the ScrapyRequest class')
 
-        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
+        logger.debug(f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={scrapy_request})...')
 
        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
@@ -152,11 +125,11 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
        # scrapy_request.meta is a property, so we have to set it like this
        scrapy_request._meta = meta  # noqa: SLF001
 
-    # If the apify_request comes directly from the Request Queue, typically start URLs
+    # If the apify_request comes directly from the Scrapy, typically start URLs.
    else:
-        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')
+        logger.debug('Gonna create a new Scrapy ScrapyRequest (cannot be restored)')
 
-        scrapy_request = Request(
+        scrapy_request = ScrapyRequest(
            url=apify_request.url,
            method=apify_request.method,
            meta={
@@ -173,5 +146,5 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
    if apify_request.user_data:
        scrapy_request.meta['userData'] = apify_request.user_data
 
-    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
+    logger.debug(f'an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request
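The conversion now builds the Apify request through Request.from_url with use_extended_unique_key=True (mirroring Scrapy's default RFPDupeFilter behaviour) and marks dont_filter requests with always_enqueue=True instead of generating random unique keys. A hedged round-trip sketch; the spider subclass is a stand-in, the two conversion functions come from this diff:

    # Hedged round-trip sketch of to_apify_request / to_scrapy_request.
    from scrapy import Request, Spider

    from apify.scrapy import to_apify_request, to_scrapy_request


    class DummySpider(Spider):
        name = 'dummy'


    spider = DummySpider()
    scrapy_request = Request('https://example.com', meta={'userData': {'label': 'DETAIL'}})

    apify_request = to_apify_request(scrapy_request, spider=spider)  # returns None if conversion fails
    assert apify_request is not None

    restored = to_scrapy_request(apify_request, spider=spider)  # userData travels back in restored.meta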
apify/scrapy/scheduler.py CHANGED
@@ -1,41 +1,33 @@
 from __future__ import annotations
 
 import traceback
+from logging import getLogger
 from typing import TYPE_CHECKING
 
-from crawlee.storage_clients import MemoryStorageClient
+from scrapy import Spider
+from scrapy.core.scheduler import BaseScheduler
+from scrapy.utils.reactor import is_asyncio_reactor_installed
 
-from apify._configuration import Configuration
+from ._async_thread import AsyncThread
+from .requests import to_apify_request, to_scrapy_request
+from apify import Configuration
 from apify.apify_storage_client import ApifyStorageClient
+from apify.storages import RequestQueue
 
-try:
-    from scrapy import Spider
-    from scrapy.core.scheduler import BaseScheduler
-    from scrapy.utils.reactor import is_asyncio_reactor_installed
-
-    if TYPE_CHECKING:
-        from scrapy.http.request import Request
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
-from crawlee._utils.crypto import crypto_random_object_id
+if TYPE_CHECKING:
+    from scrapy.http.request import Request
+    from twisted.internet.defer import Deferred
 
-from apify import Actor
-from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.utils import nested_event_loop
-from apify.storages import RequestQueue
+logger = getLogger(__name__)
 
 
 class ApifyScheduler(BaseScheduler):
-    """A Scrapy scheduler that uses the Apify Request Queue to manage requests.
+    """A Scrapy scheduler that uses the Apify `RequestQueue` to manage requests.
 
     This scheduler requires the asyncio Twisted reactor to be installed.
     """
 
     def __init__(self) -> None:
-        """Create a new instance."""
         if not is_asyncio_reactor_installed():
             raise ValueError(
                 f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. '
@@ -45,7 +37,10 @@ class ApifyScheduler(BaseScheduler):
         self._rq: RequestQueue | None = None
         self.spider: Spider | None = None
 
-    def open(self, spider: Spider) -> None:  # this has to be named "open"
+        # A thread with the asyncio event loop to run coroutines on.
+        self._async_thread = AsyncThread()
+
+    def open(self, spider: Spider) -> Deferred[None] | None:
         """Open the scheduler.
 
         Args:
@@ -53,23 +48,42 @@ class ApifyScheduler(BaseScheduler):
         """
         self.spider = spider
 
-        async def open_queue() -> RequestQueue:
+        async def open_rq() -> RequestQueue:
             config = Configuration.get_global_configuration()
-
-            # Use the ApifyStorageClient if the Actor is running on the Apify platform,
-            # otherwise use the MemoryStorageClient.
-            storage_client = (
-                ApifyStorageClient.from_config(config) if config.is_at_home else MemoryStorageClient.from_config(config)
-            )
-
-            return await RequestQueue.open(storage_client=storage_client)
+            if config.is_at_home:
+                storage_client = ApifyStorageClient.from_config(config)
+                return await RequestQueue.open(storage_client=storage_client)
+            return await RequestQueue.open()
 
         try:
-            self._rq = nested_event_loop.run_until_complete(open_queue())
-        except BaseException:
+            self._rq = self._async_thread.run_coro(open_rq())
+        except Exception:
             traceback.print_exc()
             raise
 
+        return None
+
+    def close(self, reason: str) -> None:
+        """Close the scheduler.
+
+        Shut down the event loop and its thread gracefully.
+
+        Args:
+            reason: The reason for closing the spider.
+        """
+        logger.debug(f'Closing {self.__class__.__name__} due to {reason}...')
+        try:
+            self._async_thread.close()
+
+        except KeyboardInterrupt:
+            logger.warning('Shutdown interrupted by KeyboardInterrupt!')
+
+        except Exception:
+            logger.exception('Exception occurred while shutting down.')
+
+        finally:
+            logger.debug(f'{self.__class__.__name__} closed successfully.')
+
     def has_pending_requests(self) -> bool:
         """Check if the scheduler has any pending requests.
 
@@ -80,8 +94,8 @@ class ApifyScheduler(BaseScheduler):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
-            is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
-        except BaseException:
+            is_finished = self._async_thread.run_coro(self._rq.is_finished())
+        except Exception:
            traceback.print_exc()
            raise
 
@@ -98,29 +112,27 @@ class ApifyScheduler(BaseScheduler):
         Returns:
             True if the request was successfully enqueued, False otherwise.
         """
-        call_id = crypto_random_object_id(8)
-        Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
+        logger.debug(f'ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
 
         if not isinstance(self.spider, Spider):
             raise TypeError('self.spider must be an instance of the Spider class')
 
         apify_request = to_apify_request(request, spider=self.spider)
         if apify_request is None:
-            Actor.log.error(f'Request {request} was not enqueued because it could not be converted to Apify request.')
+            logger.error(f'Request {request} could not be converted to Apify request.')
             return False
 
-        Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
-
+        logger.debug(f'Converted to apify_request: {apify_request}')
         if not isinstance(self._rq, RequestQueue):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
-            result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
-        except BaseException:
+            result = self._async_thread.run_coro(self._rq.add_request(apify_request))
+        except Exception:
             traceback.print_exc()
             raise
 
-        Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
+        logger.debug(f'rq.add_request result: {result}')
         return bool(result.was_already_present)
 
     def next_request(self) -> Request | None:
@@ -129,40 +141,31 @@ class ApifyScheduler(BaseScheduler):
         Returns:
             The next request, or None if there are no more requests.
         """
-        call_id = crypto_random_object_id(8)
-        Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
-
+        logger.debug('next_request called...')
         if not isinstance(self._rq, RequestQueue):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
-        # Fetch the next request from the Request Queue
         try:
-            apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
-        except BaseException:
+            apify_request = self._async_thread.run_coro(self._rq.fetch_next_request())
+        except Exception:
             traceback.print_exc()
             raise
 
-        Actor.log.debug(
-            f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})'
-        )
-
+        logger.debug(f'Fetched apify_request: {apify_request}')
         if apify_request is None:
             return None
 
         if not isinstance(self.spider, Spider):
             raise TypeError('self.spider must be an instance of the Spider class')
 
-        # Let the Request Queue know that the request is being handled. Every request should be marked as handled,
-        # retrying is handled by the Scrapy's RetryMiddleware.
+        # Let the request queue know that the request is being handled. Every request should
+        # be marked as handled, retrying is handled by the Scrapy's RetryMiddleware.
         try:
-            nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
-        except BaseException:
+            self._async_thread.run_coro(self._rq.mark_request_as_handled(apify_request))
+        except Exception:
            traceback.print_exc()
            raise
 
        scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
-        Actor.log.debug(
-            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned '
-            f'(scrapy_request={scrapy_request})',
-        )
+        logger.debug(f'Converted to scrapy_request: {scrapy_request}')
        return scrapy_request
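ApifyScheduler now owns an AsyncThread and runs all RequestQueue calls on it, replacing the shared nested event loop. It is enabled through Scrapy's SCHEDULER setting, which apply_apify_settings is expected to do for you; a hedged manual equivalent for a project's settings.py (the reactor setting is required by the scheduler's asyncio check):

    # Hedged sketch: enabling the scheduler by hand instead of via apply_apify_settings().
    SCHEDULER = 'apify.scrapy.scheduler.ApifyScheduler'
    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'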
apify/scrapy/utils.py CHANGED
@@ -1,29 +1,16 @@
 from __future__ import annotations
 
-import asyncio
 from base64 import b64encode
 from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
-from apify_shared.utils import ignore_docs
+from scrapy.utils.project import get_project_settings
+from scrapy.utils.python import to_bytes
 
-try:
-    from scrapy.utils.project import get_project_settings
-    from scrapy.utils.python import to_bytes
+if TYPE_CHECKING:
+    from scrapy.settings import Settings
 
-    if TYPE_CHECKING:
-        from scrapy.settings import Settings
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
-        '"pip install apify[scrapy]".'
-    ) from exc
 
-
-nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
-
-
-@ignore_docs
 def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
     """Generate a basic authentication header for the given username and password."""
     string = f'{unquote(username)}:{unquote(password)}'
@@ -31,18 +18,6 @@ def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'la
     return b'Basic ' + b64encode(user_pass)
 
 
-@ignore_docs
-def get_running_event_loop_id() -> int:
-    """Get the ID of the currently running event loop.
-
-    It could be useful mainly for debugging purposes.
-
-    Returns:
-        The ID of the event loop.
-    """
-    return id(asyncio.get_running_loop())
-
-
 def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
     """Integrates Apify configuration into a Scrapy project settings.
 
@@ -65,10 +40,6 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     # ensuring it is executed as the final step in the pipeline sequence
     settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
 
-    # Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request
-    # during process_response, but currently we have no way of detecting it and handling it properly.
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None
-
     # Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
     settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
     settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750
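apply_apify_settings starts from the Scrapy project settings, registers the dataset-push pipeline and the proxy middleware, and in this version no longer disables AjaxCrawlMiddleware. A small inspection sketch limited to the entries visible in the hunk above; any other keys the function touches are not shown in this diff:

    # Hedged inspection sketch of the settings produced by apply_apify_settings().
    from apify.scrapy import apply_apify_settings

    settings = apply_apify_settings(proxy_config={'useApifyProxy': True})

    assert settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] == 1000
    assert settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] is None
    assert settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] == 750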
apify-2.2.2b2.dist-info/METADATA → apify-2.3.0b1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: apify
-Version: 2.2.2b2
+Version: 2.3.0b1
 Summary: Apify SDK for Python
 License: Apache-2.0
 Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Provides-Extra: scrapy
-Requires-Dist: apify-client (>=1.8.1)
+Requires-Dist: apify-client (>=1.9.1)
 Requires-Dist: apify-shared (>=1.2.1)
 Requires-Dist: crawlee (>=0.5.1,<0.6.0)
 Requires-Dist: cryptography (>=42.0.0)
apify-2.2.2b2.dist-info/RECORD → apify-2.3.0b1.dist-info/RECORD
@@ -1,5 +1,5 @@
 apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
-apify/_actor.py,sha256=Pb7HPHIAodQOiGtyb-At45x8GfDItWCusRtQkoz1Pq4,46138
+apify/_actor.py,sha256=cN74o-yrxwtrvt6RA9SXzp42fEw-xcZcd9ur8KUhRmM,46129
 apify/_configuration.py,sha256=T3Z_o_W98iSyTbrutfb578yW51aexZ_V0FcLwTxFLjI,10878
 apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
 apify/_crypto.py,sha256=e0_aM3l9_5Osk-jszYOOjrAKK60OggSHbiw5c30QnsU,5638
@@ -8,7 +8,7 @@ apify/_platform_event_manager.py,sha256=44xyV0Lpzf4h4VZ0rkyYg_nhbQkEONNor8_Z9gIK
 apify/_proxy_configuration.py,sha256=c-O6_PZ9pUD-i4J0RFEKTtfyJPP2rTRJJA1TH8NVsV8,13189
 apify/_utils.py,sha256=CCLkpAsZKp00ykm88Z_Fbck5PNT0j6mJYOuD0RxzZUs,1620
 apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
-apify/apify_storage_client/_apify_storage_client.py,sha256=0rS75JoRHt7stRYS9-oqm3DmaSIZQN5C11N5MZQUvlA,2616
+apify/apify_storage_client/_apify_storage_client.py,sha256=jTX5vd-K9mnFTyZu2V2dUg7oyWogvmNIDUlEXnvIlOw,2766
 apify/apify_storage_client/_dataset_client.py,sha256=UUodnR_MQBg5RkURrfegkGJWR5OmdPPgPfGepvkdQoU,5580
 apify/apify_storage_client/_dataset_collection_client.py,sha256=qCcKZlA0bkO-sL7xED0Yose85NlrRa9AKr4oCSrYX6k,1489
 apify/apify_storage_client/_key_value_store_client.py,sha256=MSuoIeqEHLu92WfUU7kyB3Cc_gKUlm8TghnU3_xkPtE,3363
@@ -18,21 +18,24 @@ apify/apify_storage_client/_request_queue_collection_client.py,sha256=MdzgbQb2D8
 apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/log.py,sha256=j-E4t-WeA93bc1NCQRG8sTntehQCiiN8ia-MdQe3_Ts,1291
 apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify/scrapy/__init__.py,sha256=HE5wCN7-DZKPydLCOvjNyLuL3CvN2fUFweXfrDfe1Ss,348
+apify/scrapy/__init__.py,sha256=m2a0ts_JY9xJkBy4JU5mV8PJqjA3GGKLXBFu4nl-n-A,1048
+apify/scrapy/_actor_runner.py,sha256=rXWSnlQWGskDUH8PtLCv5SkOIx4AiVa4QbCYeCett5c,938
+apify/scrapy/_async_thread.py,sha256=AfeH9ZkSRZXxL11wzwrroDNsTzq4tAvURlinUZBtYMA,4753
+apify/scrapy/_logging_config.py,sha256=hFq90fNtZyjjJA7w2k-mtuEC8xCFiBMTalbwPDcaig4,2022
 apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
-apify/scrapy/middlewares/apify_proxy.py,sha256=H8a3vrA_7S_ucRkE3VDNMU8TY2CdzGTMXbhbJbfLv1c,5755
+apify/scrapy/middlewares/apify_proxy.py,sha256=CDAOXS3bcVDZHM3B0GvhXbxEikMIadLF_0P73WL_nI4,5550
 apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
-apify/scrapy/pipelines/actor_dataset_push.py,sha256=otggoULfUdCqOPJLb9wMROZ9WylnlL-209930tMS2Rg,971
+apify/scrapy/pipelines/actor_dataset_push.py,sha256=XUUyznQTD-E3wYUUFt2WAOnWhbnRrY0WuedlfYfYhDI,846
 apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify/scrapy/requests.py,sha256=yZ9hIsz2YyqOoOwzN9F1h76wG4qwawrI6h_6xq0I7Iw,7599
-apify/scrapy/scheduler.py,sha256=blO333BhFDMu3wAvSQONYdhmAmyiysqsv3YF5FKO_20,6281
-apify/scrapy/utils.py,sha256=758DcHCSAgCTProY0QX74uJ1XrzVsQwvCmFanj2f_3Q,2928
+apify/scrapy/requests.py,sha256=tOiFtG0kyzbBwtmaOisLAcpJENR1eDtpPR1nRH7JJGg,6551
+apify/scrapy/scheduler.py,sha256=-r1wZjMmeRDPxZKGHO-EYDYpGdDgSPAdNgMFViqUK8E,6019
+apify/scrapy/utils.py,sha256=5cka33PWc_at14yjhnLkCvY4h-ySUgVVhhDLxTy39ZI,1965
 apify/storages/__init__.py,sha256=FW-z6ubuPnHGM-Wp15T8mR5q6lnpDGrCW-IkgZd5L30,177
 apify/storages/_request_list.py,sha256=-lZJcE5nq69aJhGFJ7Sh2ctqgAWUDyOwYm5_0y1hdAE,5865
 apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify-2.2.2b2.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-apify-2.2.2b2.dist-info/METADATA,sha256=QZkLUfL0yzzoIjc_9Gdst5EG_600Nw044phOKoMhZCw,8696
-apify-2.2.2b2.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-apify-2.2.2b2.dist-info/RECORD,,
+apify-2.3.0b1.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-2.3.0b1.dist-info/METADATA,sha256=IO-6Yvs9P-SeqNnJ04jAWzer0mzqa9hY1AJqKr0X_2w,8696
+apify-2.3.0b1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+apify-2.3.0b1.dist-info/RECORD,,