apify 1.7.0b1__py3-none-any.whl → 2.2.0b14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of apify might be problematic.
- apify/__init__.py +19 -4
- apify/_actor.py +1030 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.0b14.dist-info/METADATA +211 -0
- apify-2.2.0b14.dist-info/RECORD +38 -0
- {apify-1.7.0b1.dist-info → apify-2.2.0b14.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.0b1.dist-info/METADATA +0 -149
- apify-1.7.0b1.dist-info/RECORD +0 -41
- apify-1.7.0b1.dist-info/top_level.txt +0 -1
- {apify-1.7.0b1.dist-info → apify-2.2.0b14.dist-info}/LICENSE +0 -0
apify/scrapy/scheduler.py
CHANGED
```diff
@@ -1,21 +1,28 @@
 from __future__ import annotations
 
 import traceback
+from typing import TYPE_CHECKING
+
+from apify._configuration import Configuration
+from apify.apify_storage_client import ApifyStorageClient
 
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
-    from scrapy.http.request import Request  # noqa: TCH002
     from scrapy.utils.reactor import is_asyncio_reactor_installed
+
+    if TYPE_CHECKING:
+        from scrapy.http.request import Request
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc
 
-from
-
+from crawlee._utils.crypto import crypto_random_object_id
+
+from apify import Actor
 from apify.scrapy.requests import to_apify_request, to_scrapy_request
-from apify.scrapy.utils import nested_event_loop
+from apify.scrapy.utils import nested_event_loop
 from apify.storages import RequestQueue
 
 
@@ -25,7 +32,7 @@ class ApifyScheduler(BaseScheduler):
     This scheduler requires the asyncio Twisted reactor to be installed.
     """
 
-    def __init__(self
+    def __init__(self) -> None:
         """Create a new instance."""
         if not is_asyncio_reactor_installed():
             raise ValueError(
@@ -36,7 +43,7 @@ class ApifyScheduler(BaseScheduler):
         self._rq: RequestQueue | None = None
         self.spider: Spider | None = None
 
-    def open(self
+    def open(self, spider: Spider) -> None:  # this has to be named "open"
         """Open the scheduler.
 
         Args:
@@ -44,13 +51,17 @@ class ApifyScheduler(BaseScheduler):
         """
         self.spider = spider
 
+        async def open_queue() -> RequestQueue:
+            custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration())
+            return await RequestQueue.open(storage_client=custom_loop_apify_client)
+
         try:
-            self._rq = nested_event_loop.run_until_complete(
+            self._rq = nested_event_loop.run_until_complete(open_queue())
         except BaseException:
             traceback.print_exc()
             raise
 
-    def has_pending_requests(self
+    def has_pending_requests(self) -> bool:
         """Check if the scheduler has any pending requests.
 
         Returns:
@@ -67,7 +78,7 @@ class ApifyScheduler(BaseScheduler):
 
         return not is_finished
 
-    def enqueue_request(self
+    def enqueue_request(self, request: Request) -> bool:
         """Add a request to the scheduler.
 
         This could be called from either from a spider or a downloader middleware (e.g. redirect, retry, ...).
@@ -95,20 +106,15 @@ class ApifyScheduler(BaseScheduler):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
         try:
-            result = nested_event_loop.run_until_complete(
-                self._rq.add_request(
-                    apify_request,
-                    use_extended_unique_key=True,
-                )
-            )
+            result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
         except BaseException:
             traceback.print_exc()
             raise
 
         Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
-        return bool(result
+        return bool(result.was_already_present)
 
-    def next_request(self
+    def next_request(self) -> Request | None:
         """Fetch the next request from the scheduler.
 
         Returns:
@@ -127,7 +133,9 @@ class ApifyScheduler(BaseScheduler):
             traceback.print_exc()
             raise
 
-        Actor.log.debug(
+        Actor.log.debug(
+            f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})'
+        )
 
         if apify_request is None:
             return None
@@ -145,6 +153,7 @@ class ApifyScheduler(BaseScheduler):
 
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
-            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned
+            f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned '
+            f'(scrapy_request={scrapy_request})',
         )
         return scrapy_request
```
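The scheduler bridges Scrapy's synchronous scheduler API with the SDK's asynchronous storage calls by driving coroutines on a dedicated event loop, as the `open()` and `enqueue_request()` hunks above show. The following is a minimal sketch of that pattern in isolation, using only the standard library; the resource names are illustrative and not taken from this diff:

```python
import asyncio

# Dedicated loop created once at import time (mirrors apify.scrapy.utils.nested_event_loop).
nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()


async def open_resource() -> str:
    # Stand-in for an async call such as RequestQueue.open(); purely illustrative.
    await asyncio.sleep(0)
    return 'request queue handle'


def open_scheduler() -> str:
    # Synchronous entry point (like ApifyScheduler.open) that blocks on the coroutine.
    return nested_event_loop.run_until_complete(open_resource())


print(open_scheduler())
```

Keeping one long-lived loop avoids creating and tearing down a loop on every scheduler call while staying compatible with Scrapy's blocking callback signatures.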
apify/scrapy/utils.py
CHANGED
```diff
@@ -2,23 +2,28 @@ from __future__ import annotations
 
 import asyncio
 from base64 import b64encode
+from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
+from apify_shared.utils import ignore_docs
+
 try:
-    from scrapy.settings import Settings  # noqa: TCH002
     from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
+
+    if TYPE_CHECKING:
+        from scrapy.settings import Settings
 except ImportError as exc:
     raise ImportError(
-        'To use this module, you need to install the "scrapy" extra.
+        'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
+        '"pip install apify[scrapy]".'
     ) from exc
 
-from apify.actor import Actor
-from apify.storages import RequestQueue, StorageClientManager
 
 nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
 
 
+@ignore_docs
 def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
     """Generate a basic authentication header for the given username and password."""
     string = f'{unquote(username)}:{unquote(password)}'
@@ -26,6 +31,7 @@ def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'la
     return b'Basic ' + b64encode(user_pass)
 
 
+@ignore_docs
 def get_running_event_loop_id() -> int:
     """Get the ID of the currently running event loop.
 
@@ -71,31 +77,3 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     settings['APIFY_PROXY_SETTINGS'] = proxy_config
 
     return settings
-
-
-async def open_queue_with_custom_client() -> RequestQueue:
-    """Open a Request Queue with custom Apify Client.
-
-    TODO: add support for custom client to Actor.open_request_queue(), so that
-    we don't have to do this hacky workaround
-    """
-    # Create a new Apify Client with its httpx client in the custom event loop
-    custom_loop_apify_client = Actor.new_client()
-
-    # Set the new Apify Client as the default client, back up the old client
-    old_client = Actor.apify_client
-    StorageClientManager.set_cloud_client(custom_loop_apify_client)
-
-    # Create a new Request Queue in the custom event loop,
-    # replace its Apify client with the custom loop's Apify client
-    rq = await Actor.open_request_queue()
-
-    if Actor.config.is_at_home:
-        rq._request_queue_client = custom_loop_apify_client.request_queue(
-            rq._id,
-            client_key=rq._client_key,
-        )
-
-    # Restore the old Apify Client as the default client
-    StorageClientManager.set_cloud_client(old_client)
-    return rq
```
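With `open_queue_with_custom_client()` removed, the remaining public helper in this module is `apply_apify_settings()`. Below is a hedged sketch of how a project might call it to build Scrapy settings before starting a crawl. The spider is hypothetical, and the assumption that the returned settings also enable the Apify scheduler and middlewares is not taken from this diff; what the hunk above does show is that the proxy configuration is stored under `APIFY_PROXY_SETTINGS`.

```python
from scrapy import Spider
from scrapy.crawler import CrawlerProcess

from apify.scrapy.utils import apply_apify_settings


class TitleSpider(Spider):
    # Hypothetical spider, used only for illustration.
    name = 'title_spider'
    start_urls = ['https://apify.com']

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}


def main() -> None:
    # apply_apify_settings() returns Scrapy settings with the proxy configuration
    # stored under APIFY_PROXY_SETTINGS (see the hunk above); that it also wires in
    # the Apify scheduler and middlewares is an assumption of this sketch.
    settings = apply_apify_settings(proxy_config={'useApifyProxy': True})
    process = CrawlerProcess(settings, install_root_handler=False)
    process.crawl(TitleSpider)
    process.start()


if __name__ == '__main__':
    main()
```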
apify/storages/__init__.py
CHANGED
```diff
@@ -1,11 +1,5 @@
-from .
-from .key_value_store import KeyValueStore
-from .request_queue import RequestQueue
-from .storage_client_manager import StorageClientManager
+from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
-
-
-
-    'RequestQueue',
-    'StorageClientManager',
-]
+from ._request_list import RequestList
+
+__all__ = ['Dataset', 'KeyValueStore', 'RequestList', 'RequestQueue']
```
apify/storages/_request_list.py
ADDED
````python
from __future__ import annotations

import asyncio
import re
from asyncio import Task
from functools import partial
from typing import Annotated, Any, Union

from pydantic import BaseModel, Field, TypeAdapter

from crawlee import Request
from crawlee._types import HttpMethod
from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
from crawlee.request_loaders import RequestList as CrawleeRequestList

from apify._utils import docs_group

URL_NO_COMMAS_REGEX = re.compile(
    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
)


class _RequestDetails(BaseModel):
    method: HttpMethod = 'GET'
    payload: str = ''
    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}


class _RequestsFromUrlInput(_RequestDetails):
    requests_from_url: str = Field(alias='requestsFromUrl')


class _SimpleUrlInput(_RequestDetails):
    url: str


url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])


@docs_group('Classes')
class RequestList(CrawleeRequestList):
    """Extends crawlee RequestList.

    Method open is used to create RequestList from actor's requestListSources input.
    """

    @staticmethod
    async def open(
        name: str | None = None,
        request_list_sources_input: list[dict[str, Any]] | None = None,
        http_client: BaseHttpClient | None = None,
    ) -> RequestList:
        """Creates RequestList from Actor input requestListSources.

        Args:
            name: Name of the returned RequestList.
            request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
            http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys.

        Returns:
            RequestList created from request_list_sources_input.

        ### Usage

        ```python
        example_input = [
            # Gather urls from response body.
            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
            # Directly include this url.
            {'url': 'https://crawlee.dev', 'method': 'GET'}
        ]
        request_list = await RequestList.open(request_list_sources_input=example_input)
        ```
        """
        request_list_sources_input = request_list_sources_input or []
        return await RequestList._create_request_list(name, request_list_sources_input, http_client)

    @staticmethod
    async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
    ) -> RequestList:
        if not http_client:
            http_client = HttpxHttpClient()

        url_inputs = url_input_adapter.validate_python(request_list_sources_input)

        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]

        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)

        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)

    @staticmethod
    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
        return [
            Request.from_url(
                method=request_input.method,
                url=request_input.url,
                payload=request_input.payload.encode('utf-8'),
                headers=request_input.headers,
                user_data=request_input.user_data,
            )
            for request_input in simple_url_inputs
        ]

    @staticmethod
    async def _fetch_requests_from_url(
        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
    ) -> list[Request]:
        """Crete list of requests from url.

        Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting
        callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from
        collected links and additional inputs stored in other attributes of each remote_url_requests_inputs.
        """
        created_requests: list[Request] = []

        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Callback to scrape response body with regexp and create Requests from matches."""
            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
            created_requests.extend(
                [
                    Request.from_url(
                        match.group(0),
                        method=request_input.method,
                        payload=request_input.payload.encode('utf-8'),
                        headers=request_input.headers,
                        user_data=request_input.user_data,
                    )
                    for match in matches
                ]
            )

        remote_url_requests = []
        for remote_url_requests_input in remote_url_requests_inputs:
            get_response_task = asyncio.create_task(
                http_client.send_request(
                    method='GET',
                    url=remote_url_requests_input.requests_from_url,
                )
            )

            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
            remote_url_requests.append(get_response_task)

        await asyncio.gather(*remote_url_requests)
        return created_requests
````
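RequestList instances produced by `open()` follow the crawlee request-loader interface, so they can be drained directly or handed to a crawler. The sketch below drains one inside an Actor run; the `requestListSources` input key matches the docstring above, while the `fetch_next_request()` / `mark_request_as_handled()` calls assume the crawlee loader API that `CrawleeRequestList` provides.

```python
from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Build the list from the Actor's requestListSources input (see RequestList.open above).
        request_list = await RequestList.open(
            request_list_sources_input=actor_input.get('requestListSources', []),
        )

        # Drain the loader; fetch_next_request() / mark_request_as_handled() come from
        # the crawlee request-loader interface that RequestList extends (assumed here).
        while request := await request_list.fetch_next_request():
            Actor.log.info(f'Handling {request.url}')
            await request_list.mark_request_as_handled(request)
```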
apify/storages/py.typed
ADDED
File without changes
apify-2.2.0b14.dist-info/METADATA
ADDED
````
Metadata-Version: 2.3
Name: apify
Version: 2.2.0b14
Summary: Apify SDK for Python
License: Apache-2.0
Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
Author: Apify Technologies s.r.o.
Author-email: support@apify.com
Requires-Python: >=3.9,<4.0
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Software Development :: Libraries
Provides-Extra: scrapy
Requires-Dist: apify-client (>=1.8.1)
Requires-Dist: apify-shared (>=1.2.1)
Requires-Dist: crawlee (>=0.5.1,<0.6.0)
Requires-Dist: cryptography (>=42.0.0)
Requires-Dist: httpx (>=0.27.0)
Requires-Dist: lazy-object-proxy (>=1.10.0)
Requires-Dist: more_itertools (>=10.2.0)
Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
Requires-Dist: typing-extensions (>=4.1.0)
Requires-Dist: websockets (>=10.0,<14.0.0)
Project-URL: Apify Homepage, https://apify.com
Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
Project-URL: Documentation, https://docs.apify.com/sdk/python/
Project-URL: Homepage, https://docs.apify.com/sdk/python/
Project-URL: Issue Tracker, https://github.com/apify/apify-sdk-python/issues
Project-URL: Repository, https://github.com/apify/apify-sdk-python
Description-Content-Type: text/markdown

# Apify SDK for Python

The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
event handling.

If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.

## Installation

The Apify SDK for Python is available on PyPI as the `apify` package.
For default installation, using Pip, run the following:

```bash
pip install apify
```

For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
To install Apify with the `scrapy` extra, use the following command:

```bash
pip install apify[scrapy]
```

## Documentation

For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).

## Examples

Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.

### Apify SDK with HTTPX and BeautifulSoup

This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.

```python
from apify import Actor
from bs4 import BeautifulSoup
from httpx import AsyncClient


async def main() -> None:
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])

        # Open the default request queue for handling URLs to be processed.
        request_queue = await Actor.open_request_queue()

        # Enqueue the start URLs.
        for start_url in start_urls:
            url = start_url.get('url')
            await request_queue.add_request(url)

        # Process the URLs from the request queue.
        while request := await request_queue.fetch_next_request():
            Actor.log.info(f'Scraping {request.url} ...')

            # Fetch the HTTP response from the specified URL using HTTPX.
            async with AsyncClient() as client:
                response = await client.get(request.url)

            # Parse the HTML content using Beautiful Soup.
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the desired data.
            data = {
                'url': actor_input['url'],
                'title': soup.title.string,
                'h1s': [h1.text for h1 in soup.find_all('h1')],
                'h2s': [h2.text for h2 in soup.find_all('h2')],
                'h3s': [h3.text for h3 in soup.find_all('h3')],
            }

            # Store the extracted data to the default dataset.
            await Actor.push_data(data)
```

### Apify SDK with PlaywrightCrawler from Crawlee

This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.

```python
from apify import Actor, Request
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]

        # Exit if no start URLs are provided.
        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Create a crawler.
        crawler = PlaywrightCrawler(
            # Limit the crawl to max requests. Remove or increase it for crawling all links.
            max_requests_per_crawl=50,
            headless=True,
        )

        # Define a request handler, which will be called for every request.
        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            url = context.request.url
            Actor.log.info(f'Scraping {url}...')

            # Extract the desired data.
            data = {
                'url': context.request.url,
                'title': await context.page.title(),
                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
            }

            # Store the extracted data to the default dataset.
            await context.push_data(data)

            # Enqueue additional links found on the current page.
            await context.enqueue_links()

        # Run the crawler with the starting URLs.
        await crawler.run(start_urls)
```

## What are Actors?

Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
They can do anything from small tasks such as filling in forms or unsubscribing from online services,
all the way up to scraping and processing vast numbers of web pages.

They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
where you can run them at scale, monitor them, schedule them, or publish and monetize them.

If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about)
in the Apify platform documentation.

## Creating Actors

To create and run Actors through Apify Console,
see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).

To create and run Python Actors locally, check the documentation for
[how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).

## Guides

To see how you can use the Apify SDK with other popular libraries used for web scraping,
check out our guides for using
[Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
[Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
[Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
[Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).

## Usage concepts

To learn more about the features of the Apify SDK and how to use them,
check out the Usage Concepts section in the sidebar,
particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
[working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
[handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
````
apify-2.2.0b14.dist-info/RECORD
ADDED
```text
apify/__init__.py,sha256=99ynaDWBLEcCjdLq7R0Exy_iACsXiXoQ8VUZKmbzTeM,550
apify/_actor.py,sha256=X8UCTYVJmsq0Nus7aru1ayEgrbo8QE__0jKaN8aWMZ0,44313
apify/_configuration.py,sha256=T3Z_o_W98iSyTbrutfb578yW51aexZ_V0FcLwTxFLjI,10878
apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
apify/_crypto.py,sha256=e0_aM3l9_5Osk-jszYOOjrAKK60OggSHbiw5c30QnsU,5638
apify/_models.py,sha256=Btlz-23obKY5tJ75JnUwkVNC2lmU1IEBbdU3HvWaVhg,5748
apify/_platform_event_manager.py,sha256=44xyV0Lpzf4h4VZ0rkyYg_nhbQkEONNor8_Z9gIKO40,7899
apify/_proxy_configuration.py,sha256=c-O6_PZ9pUD-i4J0RFEKTtfyJPP2rTRJJA1TH8NVsV8,13189
apify/_utils.py,sha256=CCLkpAsZKp00ykm88Z_Fbck5PNT0j6mJYOuD0RxzZUs,1620
apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
apify/apify_storage_client/_apify_storage_client.py,sha256=0rS75JoRHt7stRYS9-oqm3DmaSIZQN5C11N5MZQUvlA,2616
apify/apify_storage_client/_dataset_client.py,sha256=UUodnR_MQBg5RkURrfegkGJWR5OmdPPgPfGepvkdQoU,5580
apify/apify_storage_client/_dataset_collection_client.py,sha256=qCcKZlA0bkO-sL7xED0Yose85NlrRa9AKr4oCSrYX6k,1489
apify/apify_storage_client/_key_value_store_client.py,sha256=MSuoIeqEHLu92WfUU7kyB3Cc_gKUlm8TghnU3_xkPtE,3363
apify/apify_storage_client/_key_value_store_collection_client.py,sha256=NxD-3XDJP6JGMDyIa6ib0gl8op7rQjSQ0vlToCiV190,1563
apify/apify_storage_client/_request_queue_client.py,sha256=n-CR-hA5LM6_8IwiMwQ9tT2juavq7X2zC3ZNlrtv-2s,5156
apify/apify_storage_client/_request_queue_collection_client.py,sha256=MdzgbQb2D8rHWpUlPCrQSHRlAi0fI0PSZ9bYagr-MhY,1571
apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
apify/log.py,sha256=j-E4t-WeA93bc1NCQRG8sTntehQCiiN8ia-MdQe3_Ts,1291
apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
apify/scrapy/__init__.py,sha256=HE5wCN7-DZKPydLCOvjNyLuL3CvN2fUFweXfrDfe1Ss,348
apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
apify/scrapy/middlewares/apify_proxy.py,sha256=9_-hJqTwQ4yVMjvN9zkJ_GXJADzrrYu8QoHZ6IX6fDs,5764
apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
apify/scrapy/pipelines/actor_dataset_push.py,sha256=otggoULfUdCqOPJLb9wMROZ9WylnlL-209930tMS2Rg,971
apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
apify/scrapy/requests.py,sha256=yZ9hIsz2YyqOoOwzN9F1h76wG4qwawrI6h_6xq0I7Iw,7599
apify/scrapy/scheduler.py,sha256=03kZxejWWb-TofJ-vpSZuQ28rT-qNjhhpC-QeO2OzoU,5977
apify/scrapy/utils.py,sha256=758DcHCSAgCTProY0QX74uJ1XrzVsQwvCmFanj2f_3Q,2928
apify/storages/__init__.py,sha256=FW-z6ubuPnHGM-Wp15T8mR5q6lnpDGrCW-IkgZd5L30,177
apify/storages/_request_list.py,sha256=-lZJcE5nq69aJhGFJ7Sh2ctqgAWUDyOwYm5_0y1hdAE,5865
apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
apify-2.2.0b14.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
apify-2.2.0b14.dist-info/METADATA,sha256=vagvl8FfixTKDEiYx3WI7D541NifN21SGjWuFVCuOHE,8714
apify-2.2.0b14.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
apify-2.2.0b14.dist-info/RECORD,,
```