apify 1.5.2b2__tar.gz → 1.5.2b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of apify might be problematic.
- {apify-1.5.2b2 → apify-1.5.2b3}/PKG-INFO +1 -1
- {apify-1.5.2b2 → apify-1.5.2b3}/pyproject.toml +1 -1
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/middlewares/apify_proxy.py +5 -2
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/middlewares/apify_retry.py +3 -2
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/scheduler.py +5 -15
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/utils.py +11 -19
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify.egg-info/PKG-INFO +1 -1
- {apify-1.5.2b2 → apify-1.5.2b3}/LICENSE +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/README.md +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/setup.cfg +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_crypto.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/file_storage_utils.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/memory_storage_client.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/base_resource_client.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/dataset.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/dataset_collection.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/key_value_store.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/request_queue.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_memory_storage/resource_clients/request_queue_collection.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/_utils.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/actor.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/config.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/consts.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/event_manager.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/log.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/proxy_configuration.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/py.typed +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/storages/__init__.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/storages/base_storage.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/storages/dataset.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/storages/key_value_store.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/storages/request_queue.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify/storages/storage_client_manager.py +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify.egg-info/SOURCES.txt +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify.egg-info/dependency_links.txt +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify.egg-info/requires.txt +0 -0
- {apify-1.5.2b2 → apify-1.5.2b3}/src/apify.egg-info/top_level.txt +0 -0
src/apify/scrapy/middlewares/apify_proxy.py
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
-    from scrapy import Request, Spider  # noqa: TCH002
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
@@ -16,6 +15,10 @@ from ...actor import Actor
 from ...proxy_configuration import ProxyConfiguration
 from ..utils import get_basic_auth_header
 
+if TYPE_CHECKING:
+    from scrapy import Request, Spider
+    from scrapy.crawler import Crawler
+
 
 class ApifyHttpProxyMiddleware:
     """Apify HTTP proxy middleware for Scrapy.
src/apify/scrapy/middlewares/apify_retry.py
@@ -4,9 +4,7 @@ import traceback
 from typing import TYPE_CHECKING, Any
 
 try:
-    from scrapy import Spider  # noqa: TCH002
     from scrapy.downloadermiddlewares.retry import RetryMiddleware
-    from scrapy.http import Request, Response  # noqa: TCH002
     from scrapy.utils.response import response_status_message
 except ImportError as exc:
     raise ImportError(
@@ -17,6 +15,9 @@ from ...actor import Actor
 from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
 
 if TYPE_CHECKING:
+    from scrapy import Spider
+    from scrapy.http import Request, Response
+
     from ...storages import RequestQueue
 
 
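In both middleware files the change is the same: Scrapy names that are only used in type annotations (Request, Spider, Crawler, Response) move out of the runtime try/except ImportError block and into an "if TYPE_CHECKING:" block, which also removes the need for the "# noqa: TCH002" suppressions (TCH002 is the ruff flake8-type-checking rule that flags third-party imports which could live in a type-checking block). A minimal, self-contained sketch of the pattern follows; ExampleMiddleware and its method are illustrative stand-ins, not part of the apify package:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers; Scrapy is not imported at runtime here.
    from scrapy import Request, Spider


class ExampleMiddleware:
    def process_request(self, request: Request, spider: Spider) -> None:
        # With `from __future__ import annotations`, the annotations above are
        # plain strings at runtime, so the names need not exist when this runs.
        spider.logger.debug(f'Processing {request.url}')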
src/apify/scrapy/scheduler.py
@@ -55,9 +55,7 @@ class ApifyScheduler(BaseScheduler):
         Returns:
             True if the scheduler has any pending requests, False otherwise.
         """
-        if not isinstance(self._rq, RequestQueue):
-            raise TypeError('self._rq must be an instance of the RequestQueue class')
-
+        assert isinstance(self._rq, RequestQueue)  # noqa: S101
         try:
             is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
         except BaseException:
@@ -78,14 +76,10 @@ class ApifyScheduler(BaseScheduler):
         call_id = crypto_random_object_id(8)
         Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
 
-        if not isinstance(self.spider, Spider):
-            raise TypeError('self.spider must be an instance of the Spider class')
-
+        assert isinstance(self.spider, Spider)  # noqa: S101
         apify_request = to_apify_request(request, spider=self.spider)
         Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
-
-        if not isinstance(self._rq, RequestQueue):
-            raise TypeError('self._rq must be an instance of the RequestQueue class')
+        assert isinstance(self._rq, RequestQueue)  # noqa: S101
 
         try:
             result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
@@ -104,9 +98,7 @@ class ApifyScheduler(BaseScheduler):
         """
         call_id = crypto_random_object_id(8)
         Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
-
-        if not isinstance(self._rq, RequestQueue):
-            raise TypeError('self._rq must be an instance of the RequestQueue class')
+        assert isinstance(self._rq, RequestQueue)  # noqa: S101
 
         try:
             apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
@@ -119,9 +111,7 @@ class ApifyScheduler(BaseScheduler):
         if apify_request is None:
             return None
 
-        if not isinstance(self.spider, Spider):
-            raise TypeError('self.spider must be an instance of the Spider class')
-
+        assert isinstance(self.spider, Spider)  # noqa: S101
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
             f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
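Throughout scheduler.py (and similarly in utils.py below), explicit isinstance checks that raised TypeError are replaced by assert statements, with "# noqa: S101" silencing the ruff/bandit rule against asserts. One practical difference, not stated in the diff itself: assert statements are stripped when Python runs with the -O flag, so the new guards vanish in optimized mode, whereas the old raise TypeError always executed. A short sketch of the two styles, using a simplified stand-in class rather than the real ApifyScheduler:

from __future__ import annotations


class RequestQueue:
    """Stand-in for apify.storages.RequestQueue, for illustration only."""


class SchedulerSketch:
    def __init__(self) -> None:
        self._rq: RequestQueue | None = None

    def guard_old_style(self) -> None:
        # 1.5.2b2 style: always enforced, raises a descriptive TypeError.
        if not isinstance(self._rq, RequestQueue):
            raise TypeError('self._rq must be an instance of the RequestQueue class')

    def guard_new_style(self) -> None:
        # 1.5.2b3 style: shorter, but removed entirely under `python -O`.
        assert isinstance(self._rq, RequestQueue)  # noqa: S101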
src/apify/scrapy/utils.py
@@ -6,9 +6,10 @@ import pickle
 from base64 import b64encode
 from urllib.parse import unquote
 
+from scrapy.utils.python import to_bytes
+
 try:
     from scrapy import Request, Spider
-    from scrapy.utils.python import to_bytes
     from scrapy.utils.request import request_from_dict
 except ImportError as exc:
     raise ImportError(
@@ -50,8 +51,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
     Returns:
         The converted Apify request.
     """
-    if not isinstance(scrapy_request, Request):
-        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
+    assert isinstance(scrapy_request, Request)  # noqa: S101
 
     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
@@ -91,14 +91,11 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
     Returns:
         The converted Scrapy request.
     """
-    if not isinstance(apify_request, dict):
-        raise TypeError('apify_request must be a dictionary')
-
-    required_keys = ['url', 'method', 'id', 'uniqueKey']
-    missing_keys = [key for key in required_keys if key not in apify_request]
-
-    if missing_keys:
-        raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")
+    assert isinstance(apify_request, dict)  # noqa: S101
+    assert 'url' in apify_request  # noqa: S101
+    assert 'method' in apify_request  # noqa: S101
+    assert 'id' in apify_request  # noqa: S101
+    assert 'uniqueKey' in apify_request  # noqa: S101
 
     call_id = crypto_random_object_id(8)
     Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
@@ -109,19 +106,14 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
         # - This process involves decoding the base64-encoded request data and reconstructing
         #   the Scrapy Request object from its dictionary representation.
         Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
-
         scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
-        if not isinstance(scrapy_request_dict_encoded, str):
-            raise TypeError('scrapy_request_dict_encoded must be a string')
+        assert isinstance(scrapy_request_dict_encoded, str)  # noqa: S101
 
         scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
-        if not isinstance(scrapy_request_dict, dict):
-            raise TypeError('scrapy_request_dict must be a dictionary')
+        assert isinstance(scrapy_request_dict, dict)  # noqa: S101
 
         scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
-        if not isinstance(scrapy_request, Request):
-            raise TypeError('scrapy_request must be an instance of the Request class')
-
+        assert isinstance(scrapy_request, Request)  # noqa: S101
         Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
 
         # Update the meta field with the meta field from the apify_request
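For context on the last hunk: to_apify_request and to_scrapy_request shuttle the original Scrapy request through the Apify request's userData field as a pickled, base64-encoded string, which is what the codecs.decode(..., 'base64') and pickle.loads calls unpack before the new asserts validate the result. A minimal sketch of that encode/decode round trip on a plain dictionary; the helper names are illustrative and not the package's API:

import codecs
import pickle
from base64 import b64encode


def encode_request_dict(request_dict: dict) -> str:
    # Pickle the dict, then base64-encode it so it can be stored as a string
    # (e.g. inside a JSON-serializable userData field).
    return b64encode(pickle.dumps(request_dict)).decode()


def decode_request_dict(encoded: str) -> dict:
    # Reverse of encode_request_dict: base64-decode, then unpickle.
    return pickle.loads(codecs.decode(encoded.encode(), 'base64'))


if __name__ == '__main__':
    original = {'url': 'https://example.com', 'method': 'GET', 'uniqueKey': 'https://example.com'}
    assert decode_request_dict(encode_request_dict(original)) == original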