apify 2.2.2b2__tar.gz → 2.3.0b1__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Potentially problematic release.
This version of apify might be problematic.
- {apify-2.2.2b2 → apify-2.3.0b1}/PKG-INFO +2 -2
- {apify-2.2.2b2 → apify-2.3.0b1}/pyproject.toml +15 -3
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_actor.py +2 -2
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_apify_storage_client.py +4 -0
- apify-2.3.0b1/src/apify/scrapy/__init__.py +32 -0
- apify-2.3.0b1/src/apify/scrapy/_actor_runner.py +26 -0
- apify-2.3.0b1/src/apify/scrapy/_async_thread.py +122 -0
- apify-2.3.0b1/src/apify/scrapy/_logging_config.py +55 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/middlewares/apify_proxy.py +9 -13
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/pipelines/actor_dataset_push.py +7 -9
- apify-2.3.0b1/src/apify/scrapy/requests.py +150 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/scheduler.py +64 -61
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/utils.py +4 -33
- apify-2.2.2b2/src/apify/scrapy/__init__.py +0 -11
- apify-2.2.2b2/src/apify/scrapy/requests.py +0 -177
- {apify-2.2.2b2 → apify-2.3.0b1}/LICENSE +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/README.md +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/__init__.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_configuration.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_consts.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_crypto.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_models.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_platform_event_manager.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_proxy_configuration.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_utils.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/__init__.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_dataset_client.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_dataset_collection_client.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_key_value_store_client.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_key_value_store_collection_client.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_request_queue_client.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/_request_queue_collection_client.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/apify_storage_client/py.typed +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/log.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/py.typed +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/middlewares/py.typed +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/pipelines/py.typed +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/py.typed +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/storages/__init__.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/storages/_request_list.py +0 -0
- {apify-2.2.2b2 → apify-2.3.0b1}/src/apify/storages/py.typed +0 -0

{apify-2.2.2b2 → apify-2.3.0b1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: apify
-Version: 2.2.2b2
+Version: 2.3.0b1
 Summary: Apify SDK for Python
 License: Apache-2.0
 Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries
 Provides-Extra: scrapy
-Requires-Dist: apify-client (>=1.
+Requires-Dist: apify-client (>=1.9.1)
 Requires-Dist: apify-shared (>=1.2.1)
 Requires-Dist: crawlee (>=0.5.1,<0.6.0)
 Requires-Dist: cryptography (>=42.0.0)

{apify-2.2.2b2 → apify-2.3.0b1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify"
-version = "2.2.2b2"
+version = "2.3.0b1"
 description = "Apify SDK for Python"
 authors = ["Apify Technologies s.r.o. <support@apify.com>"]
 license = "Apache-2.0"
@@ -43,7 +43,7 @@ keywords = [
 
 [tool.poetry.dependencies]
 python = "^3.9"
-apify-client = ">=1.
+apify-client = ">=1.9.1"
 apify-shared = ">=1.2.1"
 crawlee = "~0.5.1"
 cryptography = ">=42.0.0"
@@ -60,7 +60,7 @@ websockets = ">=10.0 <14.0.0"
 build = "~1.2.0"
 filelock = "~3.17.0"
 griffe = "~1.5.0"
-mypy = "~1.
+mypy = "~1.15.0"
 pre-commit = "~4.1.0"
 pydoc-markdown = "~4.8.0"
 pytest = "~8.3.0"
@@ -136,6 +136,18 @@ indent-style = "space"
     "TRY301", # Abstract `raise` to an inner function
     "PLW0603", # Using the global statement to update `{name}` is discouraged
 ]
+"**/docs/**/scrapy_project/**/__main__.py" = [
+    # Because of asyncioreactor.install() call.
+    "E402", # Module level import not at top of file
+]
+"**/docs/**/scrapy_project/**" = [
+    # Local imports are mixed up with the Apify SDK.
+    "I001", # Import block is un-sorted or un-formatted
+    # Class variables are common in Scrapy projects.
+    "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar`
+    # Local imports in Scrapy project.
+    "TID252", # Prefer absolute imports over relative imports from parent modules
+]
 
 [tool.ruff.lint.flake8-quotes]
 docstring-quotes = "double"

{apify-2.2.2b2 → apify-2.3.0b1}/src/apify/_actor.py
@@ -270,8 +270,8 @@ class _ActorType:
             self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython')
         elif os.getenv('PYTEST_CURRENT_TEST', default=False):  # noqa: PLW1508
             self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test')
-        elif
-            self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running
+        elif os.getenv('SCRAPY_SETTINGS_MODULE'):
+            self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running with Scrapy')
         else:
             sys.exit(exit_code)
 

apify-2.3.0b1/src/apify/scrapy/__init__.py (new file)
@@ -0,0 +1,32 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports use try_import to handle optional dependencies, as they may not always be available.
+
+with _try_import(__name__, 'run_scrapy_actor'):
+    from ._actor_runner import run_scrapy_actor
+
+with _try_import(__name__, 'initialize_logging'):
+    from ._logging_config import initialize_logging
+
+with _try_import(__name__, 'to_apify_request', 'to_scrapy_request'):
+    from .requests import to_apify_request, to_scrapy_request
+
+with _try_import(__name__, 'ApifyScheduler'):
+    from .scheduler import ApifyScheduler
+
+with _try_import(__name__, 'apply_apify_settings', 'get_basic_auth_header'):
+    from .utils import apply_apify_settings, get_basic_auth_header
+
+
+__all__ = [
+    'ApifyScheduler',
+    'apply_apify_settings',
+    'get_basic_auth_header',
+    'initialize_logging',
+    'run_scrapy_actor',
+    'to_apify_request',
+    'to_scrapy_request',
+]
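
Note for readers tracking the public API: the new package init re-exports the whole Scrapy integration surface through crawlee's import hook (the hook presumably defers the "install the scrapy extra" error until a name is actually used). A minimal illustration of the names listed in `__all__` above:

    from apify.scrapy import (
        ApifyScheduler,
        apply_apify_settings,
        get_basic_auth_header,
        initialize_logging,
        run_scrapy_actor,
        to_apify_request,
        to_scrapy_request,
    )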

apify-2.3.0b1/src/apify/scrapy/_actor_runner.py (new file)
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING
+
+from twisted.internet.defer import Deferred, ensureDeferred
+from twisted.internet.task import react
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine
+
+
+async def _run_coro_as_deferred(coro: Coroutine) -> None:
+    """Wrap the given asyncio coroutine in a Task and await its result as a Twisted Deferred."""
+    task = asyncio.ensure_future(coro)
+    await Deferred.fromFuture(task)
+
+
+def run_scrapy_actor(coro: Coroutine) -> None:
+    """Start Twisted's reactor and execute the provided Actor coroutine.
+
+    This function initiates the Twisted reactor and runs the given asyncio coroutine (typically the
+    Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted event loops,
+    enabling the Apify and Scrapy integration to work together.
+    """
+    react(lambda _: ensureDeferred(_run_coro_as_deferred(coro)))
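
A hedged sketch of how `run_scrapy_actor` might be wired into an Actor's `__main__.py`, assuming a project-local `main()` coroutine (the module name is hypothetical). The early `asyncioreactor.install()` call is why the new Ruff ignore for E402 in the docs' `scrapy_project` appears in pyproject.toml above; this snippet is illustrative and not part of the diff:

    # Hypothetical __main__.py of a Scrapy-based Actor.
    from twisted.internet import asyncioreactor

    # Install the asyncio Twisted reactor before Scrapy gets imported (hence the E402 ignore).
    asyncioreactor.install()

    from apify.scrapy import initialize_logging, run_scrapy_actor

    from .main import main  # hypothetical module containing the Actor's async main()

    if __name__ == '__main__':
        initialize_logging()
        run_scrapy_actor(main())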

apify-2.3.0b1/src/apify/scrapy/_async_thread.py (new file)
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import asyncio
+import threading
+from concurrent import futures
+from datetime import timedelta
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine
+
+logger = getLogger(__name__)
+
+
+class AsyncThread:
+    """Class for running an asyncio event loop in a separate thread.
+
+    This allows running asynchronous coroutines from synchronous code by executing them on an event loop
+    that runs in its own dedicated thread.
+    """
+
+    def __init__(self) -> None:
+        self._eventloop = asyncio.new_event_loop()
+
+        # Start the event loop in a dedicated daemon thread.
+        self._thread = threading.Thread(
+            target=self._start_event_loop,
+            daemon=True,
+        )
+        self._thread.start()
+
+    def run_coro(
+        self,
+        coro: Coroutine,
+        timeout: timedelta = timedelta(seconds=60),
+    ) -> Any:
+        """Run a coroutine on an event loop running in a separate thread.
+
+        This method schedules the coroutine to run on the event loop and blocks until the coroutine completes
+        or the specified timeout is reached.
+
+        Args:
+            coro: The coroutine to run.
+            timeout: The maximum number of seconds to wait for the coroutine to finish.
+
+        Returns:
+            The result returned by the coroutine.
+
+        Raises:
+            RuntimeError: If the event loop is not running.
+            TimeoutError: If the coroutine does not complete within the timeout.
+            Exception: Any exception raised during coroutine execution.
+        """
+        if not self._eventloop.is_running():
+            raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is not running.')
+
+        # Submit the coroutine to the event loop running in the other thread.
+        future = asyncio.run_coroutine_threadsafe(coro, self._eventloop)
+        try:
+            # Wait for the coroutine's result until the specified timeout.
+            return future.result(timeout=timeout.total_seconds())
+        except futures.TimeoutError as exc:
+            logger.exception('Coroutine execution timed out.', exc_info=exc)
+            raise
+        except Exception as exc:
+            logger.exception('Coroutine execution raised an exception.', exc_info=exc)
+            raise
+
+    def close(self, timeout: timedelta = timedelta(seconds=60)) -> None:
+        """Close the event loop and its thread gracefully.
+
+        This method cancels all pending tasks, stops the event loop, and waits for the thread to exit.
+        If the thread does not exit within the given timeout, a forced shutdown is attempted.
+
+        Args:
+            timeout: The maximum number of seconds to wait for the event loop thread to exit.
+        """
+        if self._eventloop.is_running():
+            # Cancel all pending tasks in the event loop.
+            self.run_coro(self._shutdown_tasks())
+
+            # Schedule the event loop to stop.
+            self._eventloop.call_soon_threadsafe(self._eventloop.stop)
+
+        # Wait for the event loop thread to finish execution.
+        self._thread.join(timeout=timeout.total_seconds())
+
+        # If the thread is still running after the timeout, force a shutdown.
+        if self._thread.is_alive():
+            logger.warning('Event loop thread did not exit cleanly! Forcing shutdown...')
+            self._force_exit_event_loop()
+
+    def _start_event_loop(self) -> None:
+        """Set up and run the asyncio event loop in the dedicated thread."""
+        asyncio.set_event_loop(self._eventloop)
+        try:
+            self._eventloop.run_forever()
+        finally:
+            self._eventloop.close()
+            logger.debug('Asyncio event loop has been closed.')
+
+    async def _shutdown_tasks(self) -> None:
+        """Cancel all pending tasks in the event loop."""
+        # Retrieve all tasks for the event loop, excluding the current task.
+        tasks = [task for task in asyncio.all_tasks(self._eventloop) if task is not asyncio.current_task()]
+
+        # Cancel each pending task.
+        for task in tasks:
+            task.cancel()
+
+        # Wait until all tasks have been cancelled or finished.
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    def _force_exit_event_loop(self) -> None:
+        """Forcefully shut down the event loop and its thread."""
+        try:
+            logger.info('Forced shutdown of the event loop and its thread...')
+            self._eventloop.call_soon_threadsafe(self._eventloop.stop)
+            self._thread.join(timeout=5)
+        except Exception as exc:
+            logger.exception('Exception occurred during forced event loop shutdown.', exc_info=exc)
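
Although `AsyncThread` is a private helper (it replaces the `nested_event_loop` approach removed from `utils.py` below), its contract is easy to see in isolation. A small, self-contained sketch of the pattern the new scheduler relies on; the coroutine here is a stand-in, not part of the SDK:

    import asyncio
    from datetime import timedelta

    from apify.scrapy._async_thread import AsyncThread  # private module, imported here only for illustration

    async def fetch_answer() -> int:
        # Stand-in for an awaitable storage call such as RequestQueue.fetch_next_request().
        await asyncio.sleep(0.1)
        return 42

    thread = AsyncThread()
    try:
        # Blocks the calling (synchronous) thread until the coroutine finishes or the timeout expires.
        result = thread.run_coro(fetch_answer(), timeout=timedelta(seconds=10))
        print(result)  # 42
    finally:
        thread.close()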

apify-2.3.0b1/src/apify/scrapy/_logging_config.py (new file)
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from scrapy.utils import log as scrapy_logging
+from scrapy.utils.project import get_project_settings
+
+from apify.log import ActorLogFormatter
+
+# Define logger names.
+_PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+_ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
+
+
+def _configure_logger(name: str | None, logging_level: str, handler: logging.Handler) -> None:
+    """Clear and reconfigure the logger."""
+    logger = logging.getLogger(name)
+    logger.handlers.clear()
+    logger.setLevel(logging_level)
+
+    if name is None:  # Root logger.
+        logger.addHandler(handler)
+        logger.propagate = False
+    else:
+        logger.propagate = True
+
+
+def initialize_logging() -> None:
+    """Configure logging for Apify Actors and adjust Scrapy's logging settings."""
+    # Retrieve Scrapy project settings and determine the logging level.
+    settings = get_project_settings()
+    logging_level = settings.get('LOG_LEVEL', 'INFO')  # Default to INFO.
+
+    # Create a custom handler with the Apify log formatter.
+    handler = logging.StreamHandler()
+    handler.setFormatter(ActorLogFormatter(include_logger_name=True))
+
+    # Configure the root logger and all other defined loggers.
+    for logger_name in [None, *_ALL_LOGGERS]:
+        _configure_logger(logger_name, logging_level, handler)
+
+    # Set the 'httpx' logger to a less verbose level.
+    logging.getLogger('httpx').setLevel('WARNING')
+
+    # Monkey-patch Scrapy's logging configuration to re-apply our settings.
+    original_configure_logging = scrapy_logging.configure_logging
+
+    def new_configure_logging(*args: Any, **kwargs: Any) -> None:
+        original_configure_logging(*args, **kwargs)
+        for logger_name in [None, *_ALL_LOGGERS]:
+            _configure_logger(logger_name, logging_level, handler)
+
+    scrapy_logging.configure_logging = new_configure_logging
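
Because `initialize_logging` reads `LOG_LEVEL` from the Scrapy project settings, the usual Scrapy knob keeps working; a hypothetical snippet showing the interaction (project layout assumed, not taken from the diff):

    # settings.py of a hypothetical Scrapy project running as an Actor.
    LOG_LEVEL = 'DEBUG'  # picked up by initialize_logging() via get_project_settings()

    # Later, before starting the crawl:
    from apify.scrapy import initialize_logging

    initialize_logging()  # applies the Apify formatter and re-applies it if Scrapy reconfigures logging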

{apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/middlewares/apify_proxy.py
@@ -3,19 +3,15 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
-
-
-    from scrapy import Request, Spider
-    from scrapy.crawler import Crawler
-    from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.exceptions import NotConfigured
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
+from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.exceptions import NotConfigured
 
 from apify import Actor, ProxyConfiguration
-from apify.scrapy
+from apify.scrapy import get_basic_auth_header
+
+if TYPE_CHECKING:
+    from scrapy import Request, Spider
+    from scrapy.crawler import Crawler
 
 
 class ApifyHttpProxyMiddleware:
@@ -51,7 +47,7 @@ class ApifyHttpProxyMiddleware:
         proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')
 
         if proxy_settings is None:
-            Actor.log.
+            Actor.log.info(
                 'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing '
                 ' in the Actor input.'
             )
@@ -60,7 +56,7 @@ class ApifyHttpProxyMiddleware:
         use_apify_proxy = proxy_settings.get('useApifyProxy', False)
 
         if use_apify_proxy is not True:
-            Actor.log.
+            Actor.log.info(
                 'ApifyHttpProxyMiddleware is not going to be used. Actor input field '
                 '"proxyConfiguration.useApifyProxy" is set to False.'
            )
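
The middleware is configured through the `APIFY_PROXY_SETTINGS` crawler setting read in `from_crawler` above; with the two skip messages now logged at info level, a missing or disabled proxy configuration is no longer reported more loudly than necessary. A hypothetical settings snippet that enables it (the exact value normally comes from the Actor input's `proxyConfiguration`):

    # Hypothetical entry in a Scrapy project's settings.py (or passed via apply_apify_settings).
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True}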

{apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -1,19 +1,17 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from itemadapter.adapter import ItemAdapter
 
-try:
-    if TYPE_CHECKING:
-        from scrapy import Item, Spider
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
 from apify import Actor
 
+if TYPE_CHECKING:
+    from scrapy import Item, Spider
+
+logger = getLogger(__name__)
+
 
 class ActorDatasetPushPipeline:
     """A Scrapy pipeline for pushing items to an Actor's default dataset.
@@ -28,6 +26,6 @@ class ActorDatasetPushPipeline:
     ) -> Item:
         """Pushes the provided Scrapy item to the Actor's default dataset."""
         item_dict = ItemAdapter(item).asdict()
-
+        logger.debug(f'Pushing item={item_dict} produced by spider={spider} to the dataset.')
         await Actor.push_data(item_dict)
         return item

apify-2.3.0b1/src/apify/scrapy/requests.py (new file)
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import codecs
+import pickle
+from logging import getLogger
+from typing import Any, cast
+
+from scrapy import Request as ScrapyRequest
+from scrapy import Spider
+from scrapy.http.headers import Headers
+from scrapy.utils.request import request_from_dict
+
+from crawlee import Request as ApifyRequest
+from crawlee._types import HttpHeaders
+
+logger = getLogger(__name__)
+
+
+def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
+    """Convert a Scrapy request to an Apify request.
+
+    Args:
+        scrapy_request: The Scrapy request to be converted.
+        spider: The Scrapy spider that the request is associated with.
+
+    Returns:
+        The converted Apify request if the conversion was successful, otherwise None.
+    """
+    if not isinstance(scrapy_request, ScrapyRequest):
+        logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.')  # type: ignore[unreachable]
+        return None
+
+    logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...')
+
+    # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
+    request_kwargs: dict[str, Any] = {
+        'url': scrapy_request.url,
+        'method': scrapy_request.method,
+        'payload': scrapy_request.body,
+        'use_extended_unique_key': True,
+        'keep_url_fragment': False,
+    }
+
+    try:
+        if scrapy_request.dont_filter:
+            request_kwargs['always_enqueue'] = True
+        else:
+            if scrapy_request.meta.get('apify_request_unique_key'):
+                request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
+
+            if scrapy_request.meta.get('apify_request_id'):
+                request_kwargs['id'] = scrapy_request.meta['apify_request_id']
+
+        request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
+
+        # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
+        if isinstance(scrapy_request.headers, Headers):
+            request_kwargs['headers'] = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
+        else:
+            logger.warning(  # type: ignore[unreachable]
+                f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
+            )
+
+        apify_request = ApifyRequest.from_url(**request_kwargs)
+
+        # Serialize the Scrapy ScrapyRequest and store it in the apify_request.
+        # - This process involves converting the Scrapy ScrapyRequest object into a dictionary, encoding it to base64,
+        #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
+        # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
+        scrapy_request_dict = scrapy_request.to_dict(spider=spider)
+        scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
+        apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded
+
+    except Exception as exc:
+        logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
+        return None
+
+    logger.debug(f'scrapy_request was converted to the apify_request={apify_request}')
+    return apify_request
+
+
+def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
+    """Convert an Apify request to a Scrapy request.
+
+    Args:
+        apify_request: The Apify request to be converted.
+        spider: The Scrapy spider that the request is associated with.
+
+    Raises:
+        TypeError: If the Apify request is not an instance of the `ApifyRequest` class.
+        ValueError: If the Apify request does not contain the required keys.
+
+    Returns:
+        The converted Scrapy request.
+    """
+    if not isinstance(cast(Any, apify_request), ApifyRequest):
+        raise TypeError('apify_request must be a crawlee.ScrapyRequest instance')
+
+    logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')
+
+    # If the apify_request comes from the Scrapy
+    if 'scrapy_request' in apify_request.user_data:
+        # Deserialize the Scrapy ScrapyRequest from the apify_request.
+        # - This process involves decoding the base64-encoded request data and reconstructing
+        #   the Scrapy ScrapyRequest object from its dictionary representation.
+        logger.debug('Restoring the Scrapy ScrapyRequest from the apify_request...')
+
+        scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
+        if not isinstance(scrapy_request_dict_encoded, str):
+            raise TypeError('scrapy_request_dict_encoded must be a string')
+
+        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
+        if not isinstance(scrapy_request_dict, dict):
+            raise TypeError('scrapy_request_dict must be a dictionary')
+
+        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
+        if not isinstance(scrapy_request, ScrapyRequest):
+            raise TypeError('scrapy_request must be an instance of the ScrapyRequest class')
+
+        logger.debug(f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={scrapy_request})...')
+
+        # Update the meta field with the meta field from the apify_request
+        meta = scrapy_request.meta or {}
+        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        # scrapy_request.meta is a property, so we have to set it like this
+        scrapy_request._meta = meta  # noqa: SLF001
+
+    # If the apify_request comes directly from the Scrapy, typically start URLs.
+    else:
+        logger.debug('Gonna create a new Scrapy ScrapyRequest (cannot be restored)')
+
+        scrapy_request = ScrapyRequest(
+            url=apify_request.url,
+            method=apify_request.method,
+            meta={
+                'apify_request_id': apify_request.id,
+                'apify_request_unique_key': apify_request.unique_key,
+            },
+        )
+
+    # Add optional 'headers' field
+    if apify_request.headers:
+        scrapy_request.headers |= Headers(apify_request.headers)
+
+    # Add optional 'userData' field
+    if apify_request.user_data:
+        scrapy_request.meta['userData'] = apify_request.user_data
+
+    logger.debug(f'an apify_request was converted to the scrapy_request={scrapy_request}')
+    return scrapy_request
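
To make the conversion contract concrete, here is a small round-trip sketch using the two helpers above; the spider name and URL are made up, and the snippet is illustrative rather than part of the package:

    from scrapy import Request, Spider

    from apify.scrapy import to_apify_request, to_scrapy_request

    class ExampleSpider(Spider):  # hypothetical spider
        name = 'example'

    spider = ExampleSpider()
    original = Request('https://example.com', method='GET')

    apify_request = to_apify_request(original, spider=spider)   # crawlee Request, or None on failure
    assert apify_request is not None
    restored = to_scrapy_request(apify_request, spider=spider)  # unpickled back into a scrapy Request
    assert restored.url == original.url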

{apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/scheduler.py
@@ -1,41 +1,33 @@
 from __future__ import annotations
 
 import traceback
+from logging import getLogger
 from typing import TYPE_CHECKING
 
-from
+from scrapy import Spider
+from scrapy.core.scheduler import BaseScheduler
+from scrapy.utils.reactor import is_asyncio_reactor_installed
 
-from
+from ._async_thread import AsyncThread
+from .requests import to_apify_request, to_scrapy_request
+from apify import Configuration
 from apify.apify_storage_client import ApifyStorageClient
+from apify.storages import RequestQueue
 
-
-    from scrapy import
-    from
-    from scrapy.utils.reactor import is_asyncio_reactor_installed
-
-    if TYPE_CHECKING:
-        from scrapy.http.request import Request
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
-from crawlee._utils.crypto import crypto_random_object_id
+if TYPE_CHECKING:
+    from scrapy.http.request import Request
+    from twisted.internet.defer import Deferred
 
-
-from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.utils import nested_event_loop
-from apify.storages import RequestQueue
+logger = getLogger(__name__)
 
 
 class ApifyScheduler(BaseScheduler):
-    """A Scrapy scheduler that uses the Apify
+    """A Scrapy scheduler that uses the Apify `RequestQueue` to manage requests.
 
     This scheduler requires the asyncio Twisted reactor to be installed.
     """
 
     def __init__(self) -> None:
-        """Create a new instance."""
         if not is_asyncio_reactor_installed():
             raise ValueError(
                 f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. '
@@ -45,7 +37,10 @@ class ApifyScheduler(BaseScheduler):
         self._rq: RequestQueue | None = None
         self.spider: Spider | None = None
 
-
+        # A thread with the asyncio event loop to run coroutines on.
+        self._async_thread = AsyncThread()
+
+    def open(self, spider: Spider) -> Deferred[None] | None:
         """Open the scheduler.
 
         Args:
@@ -53,23 +48,42 @@ class ApifyScheduler(BaseScheduler):
         """
         self.spider = spider
 
-        async def
+        async def open_rq() -> RequestQueue:
             config = Configuration.get_global_configuration()
-
-
-
-
-                ApifyStorageClient.from_config(config) if config.is_at_home else MemoryStorageClient.from_config(config)
-            )
-
-            return await RequestQueue.open(storage_client=storage_client)
+            if config.is_at_home:
+                storage_client = ApifyStorageClient.from_config(config)
+                return await RequestQueue.open(storage_client=storage_client)
+            return await RequestQueue.open()
 
         try:
-            self._rq =
-        except
+            self._rq = self._async_thread.run_coro(open_rq())
+        except Exception:
             traceback.print_exc()
             raise
 
+        return None
+
+    def close(self, reason: str) -> None:
+        """Close the scheduler.
+
+        Shut down the event loop and its thread gracefully.
+
+        Args:
+            reason: The reason for closing the spider.
+        """
+        logger.debug(f'Closing {self.__class__.__name__} due to {reason}...')
+        try:
+            self._async_thread.close()
+
+        except KeyboardInterrupt:
+            logger.warning('Shutdown interrupted by KeyboardInterrupt!')
+
+        except Exception:
+            logger.exception('Exception occurred while shutting down.')
+
+        finally:
+            logger.debug(f'{self.__class__.__name__} closed successfully.')
+
     def has_pending_requests(self) -> bool:
         """Check if the scheduler has any pending requests.
 
@@ -80,8 +94,8 @@ class ApifyScheduler(BaseScheduler):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
-            is_finished =
-        except
+            is_finished = self._async_thread.run_coro(self._rq.is_finished())
+        except Exception:
             traceback.print_exc()
             raise
 
@@ -98,29 +112,27 @@
         Returns:
             True if the request was successfully enqueued, False otherwise.
         """
-
-        Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
+        logger.debug(f'ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
 
         if not isinstance(self.spider, Spider):
             raise TypeError('self.spider must be an instance of the Spider class')
 
         apify_request = to_apify_request(request, spider=self.spider)
         if apify_request is None:
-
+            logger.error(f'Request {request} could not be converted to Apify request.')
             return False
 
-
-
+        logger.debug(f'Converted to apify_request: {apify_request}')
         if not isinstance(self._rq, RequestQueue):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
-            result =
-        except
+            result = self._async_thread.run_coro(self._rq.add_request(apify_request))
+        except Exception:
             traceback.print_exc()
             raise
 
-
+        logger.debug(f'rq.add_request result: {result}')
         return bool(result.was_already_present)
 
     def next_request(self) -> Request | None:
@@ -129,40 +141,31 @@
         Returns:
             The next request, or None if there are no more requests.
         """
-
-        Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
-
+        logger.debug('next_request called...')
         if not isinstance(self._rq, RequestQueue):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
-        # Fetch the next request from the Request Queue
         try:
-            apify_request =
-        except
+            apify_request = self._async_thread.run_coro(self._rq.fetch_next_request())
+        except Exception:
            traceback.print_exc()
            raise
 
-
-            f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})'
-        )
-
+        logger.debug(f'Fetched apify_request: {apify_request}')
         if apify_request is None:
             return None
 
         if not isinstance(self.spider, Spider):
             raise TypeError('self.spider must be an instance of the Spider class')
 
-        # Let the
-        # retrying is handled by the Scrapy's RetryMiddleware.
+        # Let the request queue know that the request is being handled. Every request should
+        # be marked as handled, retrying is handled by the Scrapy's RetryMiddleware.
         try:
-
-        except
+            self._async_thread.run_coro(self._rq.mark_request_as_handled(apify_request))
+        except Exception:
             traceback.print_exc()
             raise
 
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
-
-            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned '
-            f'(scrapy_request={scrapy_request})',
-        )
+        logger.debug(f'Converted to scrapy_request: {scrapy_request}')
         return scrapy_request
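
The scheduler is opted into through standard Scrapy settings; since it refuses to start without the asyncio reactor (see `__init__` above), a project would typically pin the reactor as well. A hypothetical excerpt, shown for orientation only:

    # settings.py (hypothetical Scrapy project)
    SCHEDULER = 'apify.scrapy.scheduler.ApifyScheduler'
    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'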

{apify-2.2.2b2 → apify-2.3.0b1}/src/apify/scrapy/utils.py
@@ -1,29 +1,16 @@
 from __future__ import annotations
 
-import asyncio
 from base64 import b64encode
 from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
-from
+from scrapy.utils.project import get_project_settings
+from scrapy.utils.python import to_bytes
 
-
-    from scrapy.
-    from scrapy.utils.python import to_bytes
+if TYPE_CHECKING:
+    from scrapy.settings import Settings
 
-    if TYPE_CHECKING:
-        from scrapy.settings import Settings
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
-        '"pip install apify[scrapy]".'
-    ) from exc
 
-
-nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
-
-
-@ignore_docs
 def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
     """Generate a basic authentication header for the given username and password."""
     string = f'{unquote(username)}:{unquote(password)}'
@@ -31,18 +18,6 @@ def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'la
     return b'Basic ' + b64encode(user_pass)
 
 
-@ignore_docs
-def get_running_event_loop_id() -> int:
-    """Get the ID of the currently running event loop.
-
-    It could be useful mainly for debugging purposes.
-
-    Returns:
-        The ID of the event loop.
-    """
-    return id(asyncio.get_running_loop())
-
-
 def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
     """Integrates Apify configuration into a Scrapy project settings.
 
@@ -65,10 +40,6 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     # ensuring it is executed as the final step in the pipeline sequence
     settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
 
-    # Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request
-    # during process_response, but currently we have no way of detecting it and handling it properly.
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None
-
     # Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
     settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
     settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750
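
With `nested_event_loop` and `get_running_event_loop_id` gone, the module is reduced to the two helpers above. A short, illustrative sketch of `apply_apify_settings` in use; the crawl wiring is an assumption, only the pipeline and middleware registration is taken from the hunks above, and note that AjaxCrawlMiddleware is no longer disabled:

    from scrapy.crawler import CrawlerRunner

    from apify.scrapy import apply_apify_settings

    # Registers ActorDatasetPushPipeline at priority 1000 and swaps
    # HttpProxyMiddleware for ApifyHttpProxyMiddleware at priority 750.
    settings = apply_apify_settings(proxy_config={'useApifyProxy': True})
    runner = CrawlerRunner(settings)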

apify-2.2.2b2/src/apify/scrapy/__init__.py (removed)
@@ -1,11 +0,0 @@
-from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.scheduler import ApifyScheduler
-from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
-
-__all__ = [
-    'ApifyScheduler',
-    'get_basic_auth_header',
-    'get_running_event_loop_id',
-    'to_apify_request',
-    'to_scrapy_request',
-]

apify-2.2.2b2/src/apify/scrapy/requests.py (removed)
@@ -1,177 +0,0 @@
-from __future__ import annotations
-
-import codecs
-import pickle
-from typing import Any, cast
-
-from apify_shared.utils import ignore_docs
-
-try:
-    from scrapy import Request, Spider
-    from scrapy.http.headers import Headers
-    from scrapy.utils.request import request_from_dict
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
-from crawlee import Request as CrawleeRequest
-from crawlee._types import HttpHeaders
-from crawlee._utils.crypto import crypto_random_object_id
-from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
-
-from apify import Actor
-
-
-def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
-    """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
-
-    Works for RetryMiddleware and RedirectMiddleware.
-    """
-    return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
-
-
-@ignore_docs
-def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None:
-    """Convert a Scrapy request to an Apify request.
-
-    Args:
-        scrapy_request: The Scrapy request to be converted.
-        spider: The Scrapy spider that the request is associated with.
-
-    Returns:
-        The converted Apify request if the conversion was successful, otherwise None.
-    """
-    if not isinstance(scrapy_request, Request):
-        Actor.log.warning(  # type: ignore[unreachable]
-            'Failed to convert to Apify request: Scrapy request must be a Request instance.'
-        )
-        return None
-
-    call_id = crypto_random_object_id(8)
-    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
-
-    try:
-        if _is_request_produced_by_middleware(scrapy_request):
-            unique_key = compute_unique_key(
-                url=scrapy_request.url,
-                method=scrapy_request.method,  # type: ignore[arg-type] # str vs literal
-                payload=scrapy_request.body,
-                use_extended_unique_key=True,
-            )
-        elif scrapy_request.dont_filter:
-            unique_key = crypto_random_object_id(8)
-        elif scrapy_request.meta.get('apify_request_unique_key'):
-            unique_key = scrapy_request.meta['apify_request_unique_key']
-        else:
-            unique_key = crypto_random_object_id(8)
-
-        if scrapy_request.meta.get('apify_request_id'):
-            request_id = scrapy_request.meta['apify_request_id']
-        else:
-            request_id = unique_key_to_request_id(unique_key)
-
-        apify_request = CrawleeRequest(
-            url=scrapy_request.url,
-            method=scrapy_request.method,
-            payload=scrapy_request.body,
-            user_data=scrapy_request.meta.get('userData', {}),
-            unique_key=unique_key,
-            id=request_id,
-        )
-
-        # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
-        if isinstance(scrapy_request.headers, Headers):
-            apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
-        else:
-            Actor.log.warning(  # type: ignore[unreachable]
-                f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
-            )
-
-        # Serialize the Scrapy Request and store it in the apify_request.
-        # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
-        #   and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
-        # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
-        scrapy_request_dict = scrapy_request.to_dict(spider=spider)
-        scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
-        apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded
-
-    except Exception as exc:
-        Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
-        return None
-
-    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
-    return apify_request
-
-
-@ignore_docs
-def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
-    """Convert an Apify request to a Scrapy request.
-
-    Args:
-        apify_request: The Apify request to be converted.
-        spider: The Scrapy spider that the request is associated with.
-
-    Raises:
-        TypeError: If the apify_request is not a crawlee request.
-        ValueError: If the apify_request does not contain the required keys.
-
-    Returns:
-        The converted Scrapy request.
-    """
-    if not isinstance(cast(Any, apify_request), CrawleeRequest):
-        raise TypeError('apify_request must be a crawlee.Request instance')
-
-    call_id = crypto_random_object_id(8)
-    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
-
-    # If the apify_request comes from the Scrapy
-    if 'scrapy_request' in apify_request.user_data:
-        # Deserialize the Scrapy Request from the apify_request.
-        # - This process involves decoding the base64-encoded request data and reconstructing
-        #   the Scrapy Request object from its dictionary representation.
-        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
-
-        scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
-        if not isinstance(scrapy_request_dict_encoded, str):
-            raise TypeError('scrapy_request_dict_encoded must be a string')
-
-        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
-        if not isinstance(scrapy_request_dict, dict):
-            raise TypeError('scrapy_request_dict must be a dictionary')
-
-        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
-        if not isinstance(scrapy_request, Request):
-            raise TypeError('scrapy_request must be an instance of the Request class')
-
-        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
-
-        # Update the meta field with the meta field from the apify_request
-        meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
-        # scrapy_request.meta is a property, so we have to set it like this
-        scrapy_request._meta = meta  # noqa: SLF001
-
-    # If the apify_request comes directly from the Request Queue, typically start URLs
-    else:
-        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')
-
-        scrapy_request = Request(
-            url=apify_request.url,
-            method=apify_request.method,
-            meta={
-                'apify_request_id': apify_request.id,
-                'apify_request_unique_key': apify_request.unique_key,
-            },
-        )
-
-    # Add optional 'headers' field
-    if apify_request.headers:
-        scrapy_request.headers |= Headers(apify_request.headers)
-
-    # Add optional 'userData' field
-    if apify_request.user_data:
-        scrapy_request.meta['userData'] = apify_request.user_data
-
-    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
-    return scrapy_request

All remaining files listed above with +0 -0 were renamed from the apify-2.2.2b2 prefix to apify-2.3.0b1 without any content changes.