apify 1.5.2b1__tar.gz → 1.5.2b2__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. See the release details on the package registry for more information.
- {apify-1.5.2b1 → apify-1.5.2b2}/PKG-INFO +1 -1
- {apify-1.5.2b1 → apify-1.5.2b2}/pyproject.toml +1 -1
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/scrapy/__init__.py +0 -1
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/scrapy/middlewares/apify_proxy.py +2 -5
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/scrapy/middlewares/apify_retry.py +2 -3
- apify-1.5.2b2/src/apify/scrapy/pipelines/__init__.py +1 -0
- apify-1.5.2b1/src/apify/scrapy/pipelines.py → apify-1.5.2b2/src/apify/scrapy/pipelines/actor_dataset_push.py +1 -1
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/scrapy/scheduler.py +15 -5
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/scrapy/utils.py +19 -11
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify.egg-info/PKG-INFO +1 -1
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify.egg-info/SOURCES.txt +2 -1
- {apify-1.5.2b1 → apify-1.5.2b2}/LICENSE +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/README.md +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/setup.cfg +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/__init__.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_crypto.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/__init__.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/file_storage_utils.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/memory_storage_client.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/__init__.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/base_resource_client.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset_collection.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/key_value_store.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/request_queue.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/request_queue_collection.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_utils.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/actor.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/config.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/consts.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/event_manager.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/log.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/proxy_configuration.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/py.typed +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/storages/__init__.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/storages/base_storage.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/storages/dataset.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/storages/key_value_store.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/storages/request_queue.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify/storages/storage_client_manager.py +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify.egg-info/dependency_links.txt +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify.egg-info/requires.txt +0 -0
- {apify-1.5.2b1 → apify-1.5.2b2}/src/apify.egg-info/top_level.txt +0 -0
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
4
3
|
from urllib.parse import ParseResult, urlparse
|
|
5
4
|
|
|
6
5
|
try:
|
|
6
|
+
from scrapy import Request, Spider # noqa: TCH002
|
|
7
7
|
from scrapy.core.downloader.handlers.http11 import TunnelError
|
|
8
|
+
from scrapy.crawler import Crawler # noqa: TCH002
|
|
8
9
|
from scrapy.exceptions import NotConfigured
|
|
9
10
|
except ImportError as exc:
|
|
10
11
|
raise ImportError(
|
|
@@ -15,10 +16,6 @@ from ...actor import Actor
|
|
|
15
16
|
from ...proxy_configuration import ProxyConfiguration
|
|
16
17
|
from ..utils import get_basic_auth_header
|
|
17
18
|
|
|
18
|
-
if TYPE_CHECKING:
|
|
19
|
-
from scrapy import Request, Spider
|
|
20
|
-
from scrapy.crawler import Crawler
|
|
21
|
-
|
|
22
19
|
|
|
23
20
|
class ApifyHttpProxyMiddleware:
|
|
24
21
|
"""Apify HTTP proxy middleware for Scrapy.
|
|
@@ -4,7 +4,9 @@ import traceback
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any
|
|
5
5
|
|
|
6
6
|
try:
|
|
7
|
+
from scrapy import Spider # noqa: TCH002
|
|
7
8
|
from scrapy.downloadermiddlewares.retry import RetryMiddleware
|
|
9
|
+
from scrapy.http import Request, Response # noqa: TCH002
|
|
8
10
|
from scrapy.utils.response import response_status_message
|
|
9
11
|
except ImportError as exc:
|
|
10
12
|
raise ImportError(
|
|
@@ -15,9 +17,6 @@ from ...actor import Actor
|
|
|
15
17
|
from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
|
|
16
18
|
|
|
17
19
|
if TYPE_CHECKING:
|
|
18
|
-
from scrapy import Spider
|
|
19
|
-
from scrapy.http import Request, Response
|
|
20
|
-
|
|
21
20
|
from ...storages import RequestQueue
|
|
22
21
|
|
|
23
22
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .actor_dataset_push import ActorDatasetPushPipeline
|
|
@@ -55,7 +55,9 @@ class ApifyScheduler(BaseScheduler):
|
|
|
55
55
|
Returns:
|
|
56
56
|
True if the scheduler has any pending requests, False otherwise.
|
|
57
57
|
"""
|
|
58
|
-
|
|
58
|
+
if not isinstance(self._rq, RequestQueue):
|
|
59
|
+
raise TypeError('self._rq must be an instance of the RequestQueue class')
|
|
60
|
+
|
|
59
61
|
try:
|
|
60
62
|
is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
|
|
61
63
|
except BaseException:
|
|
@@ -76,10 +78,14 @@ class ApifyScheduler(BaseScheduler):
|
|
|
76
78
|
call_id = crypto_random_object_id(8)
|
|
77
79
|
Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
|
|
78
80
|
|
|
79
|
-
|
|
81
|
+
if not isinstance(self.spider, Spider):
|
|
82
|
+
raise TypeError('self.spider must be an instance of the Spider class')
|
|
83
|
+
|
|
80
84
|
apify_request = to_apify_request(request, spider=self.spider)
|
|
81
85
|
Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
|
|
82
|
-
|
|
86
|
+
|
|
87
|
+
if not isinstance(self._rq, RequestQueue):
|
|
88
|
+
raise TypeError('self._rq must be an instance of the RequestQueue class')
|
|
83
89
|
|
|
84
90
|
try:
|
|
85
91
|
result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
|
|
@@ -98,7 +104,9 @@ class ApifyScheduler(BaseScheduler):
|
|
|
98
104
|
"""
|
|
99
105
|
call_id = crypto_random_object_id(8)
|
|
100
106
|
Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
|
|
101
|
-
|
|
107
|
+
|
|
108
|
+
if not isinstance(self._rq, RequestQueue):
|
|
109
|
+
raise TypeError('self._rq must be an instance of the RequestQueue class')
|
|
102
110
|
|
|
103
111
|
try:
|
|
104
112
|
apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
|
|
@@ -111,7 +119,9 @@ class ApifyScheduler(BaseScheduler):
|
|
|
111
119
|
if apify_request is None:
|
|
112
120
|
return None
|
|
113
121
|
|
|
114
|
-
|
|
122
|
+
if not isinstance(self.spider, Spider):
|
|
123
|
+
raise TypeError('self.spider must be an instance of the Spider class')
|
|
124
|
+
|
|
115
125
|
scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
|
|
116
126
|
Actor.log.debug(
|
|
117
127
|
f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
|
|
@@ -6,10 +6,9 @@ import pickle
|
|
|
6
6
|
from base64 import b64encode
|
|
7
7
|
from urllib.parse import unquote
|
|
8
8
|
|
|
9
|
-
from scrapy.utils.python import to_bytes
|
|
10
|
-
|
|
11
9
|
try:
|
|
12
10
|
from scrapy import Request, Spider
|
|
11
|
+
from scrapy.utils.python import to_bytes
|
|
13
12
|
from scrapy.utils.request import request_from_dict
|
|
14
13
|
except ImportError as exc:
|
|
15
14
|
raise ImportError(
|
|
@@ -51,7 +50,8 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
|
|
|
51
50
|
Returns:
|
|
52
51
|
The converted Apify request.
|
|
53
52
|
"""
|
|
54
|
-
|
|
53
|
+
if not isinstance(scrapy_request, Request):
|
|
54
|
+
raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
|
|
55
55
|
|
|
56
56
|
call_id = crypto_random_object_id(8)
|
|
57
57
|
Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
|
|
@@ -91,11 +91,14 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
|
|
|
91
91
|
Returns:
|
|
92
92
|
The converted Scrapy request.
|
|
93
93
|
"""
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
94
|
+
if not isinstance(apify_request, dict):
|
|
95
|
+
raise TypeError('apify_request must be a dictionary')
|
|
96
|
+
|
|
97
|
+
required_keys = ['url', 'method', 'id', 'uniqueKey']
|
|
98
|
+
missing_keys = [key for key in required_keys if key not in apify_request]
|
|
99
|
+
|
|
100
|
+
if missing_keys:
|
|
101
|
+
raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")
|
|
99
102
|
|
|
100
103
|
call_id = crypto_random_object_id(8)
|
|
101
104
|
Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
|
|
@@ -106,14 +109,19 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
|
|
|
106
109
|
# - This process involves decoding the base64-encoded request data and reconstructing
|
|
107
110
|
# the Scrapy Request object from its dictionary representation.
|
|
108
111
|
Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
|
|
112
|
+
|
|
109
113
|
scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
|
|
110
|
-
|
|
114
|
+
if not isinstance(scrapy_request_dict_encoded, str):
|
|
115
|
+
raise TypeError('scrapy_request_dict_encoded must be a string')
|
|
111
116
|
|
|
112
117
|
scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
|
|
113
|
-
|
|
118
|
+
if not isinstance(scrapy_request_dict, dict):
|
|
119
|
+
raise TypeError('scrapy_request_dict must be a dictionary')
|
|
114
120
|
|
|
115
121
|
scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
|
|
116
|
-
|
|
122
|
+
if not isinstance(scrapy_request, Request):
|
|
123
|
+
raise TypeError('scrapy_request must be an instance of the Request class')
|
|
124
|
+
|
|
117
125
|
Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
|
|
118
126
|
|
|
119
127
|
# Update the meta field with the meta field from the apify_request
|
|
@@ -29,12 +29,13 @@ src/apify/_memory_storage/resource_clients/key_value_store_collection.py
|
|
|
29
29
|
src/apify/_memory_storage/resource_clients/request_queue.py
|
|
30
30
|
src/apify/_memory_storage/resource_clients/request_queue_collection.py
|
|
31
31
|
src/apify/scrapy/__init__.py
|
|
32
|
-
src/apify/scrapy/pipelines.py
|
|
33
32
|
src/apify/scrapy/scheduler.py
|
|
34
33
|
src/apify/scrapy/utils.py
|
|
35
34
|
src/apify/scrapy/middlewares/__init__.py
|
|
36
35
|
src/apify/scrapy/middlewares/apify_proxy.py
|
|
37
36
|
src/apify/scrapy/middlewares/apify_retry.py
|
|
37
|
+
src/apify/scrapy/pipelines/__init__.py
|
|
38
|
+
src/apify/scrapy/pipelines/actor_dataset_push.py
|
|
38
39
|
src/apify/storages/__init__.py
|
|
39
40
|
src/apify/storages/base_storage.py
|
|
40
41
|
src/apify/storages/dataset.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/base_resource_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/dataset_collection.py
RENAMED
|
File without changes
|
{apify-1.5.2b1 → apify-1.5.2b2}/src/apify/_memory_storage/resource_clients/key_value_store.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|