apify 2.0.0__tar.gz → 2.0.0b1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic.
- {apify-2.0.0 → apify-2.0.0b1}/PKG-INFO +29 -104
- apify-2.0.0b1/README.md +90 -0
- {apify-2.0.0 → apify-2.0.0b1}/pyproject.toml +35 -26
- apify-2.0.0b1/src/apify/__init__.py +11 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_actor.py +39 -81
- apify-2.0.0b1/src/apify/_log.py +15 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_platform_event_manager.py +2 -11
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_proxy_configuration.py +1 -1
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_dataset_client.py +12 -14
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_request_queue_client.py +3 -15
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/middlewares/apify_proxy.py +6 -4
- apify-2.0.0/README.md +0 -171
- apify-2.0.0/src/apify/__init__.py +0 -24
- apify-2.0.0/src/apify/_models.py +0 -110
- apify-2.0.0/src/apify/log.py +0 -43
- apify-2.0.0/src/apify/py.typed +0 -0
- apify-2.0.0/src/apify/scrapy/middlewares/py.typed +0 -0
- apify-2.0.0/src/apify/scrapy/pipelines/py.typed +0 -0
- apify-2.0.0/src/apify/scrapy/py.typed +0 -0
- apify-2.0.0/src/apify/storages/py.typed +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/LICENSE +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_configuration.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_consts.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_crypto.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/_utils.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/__init__.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_apify_storage_client.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_dataset_collection_client.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_key_value_store_client.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_key_value_store_collection_client.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_request_queue_collection_client.py +0 -0
- {apify-2.0.0/src/apify/apify_storage_client → apify-2.0.0b1/src/apify}/py.typed +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/__init__.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/requests.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/scheduler.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/utils.py +0 -0
- {apify-2.0.0 → apify-2.0.0b1}/src/apify/storages/__init__.py +0 -0
{apify-2.0.0 → apify-2.0.0b1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 2.0.0
+Version: 2.0.0b1
 Summary: Apify SDK for Python
 License: Apache-2.0
 Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping

@@ -18,15 +18,21 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries
 Provides-Extra: scrapy
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: aiofiles (>=22.1.0,<23.0.0)
+Requires-Dist: aioshutil (>=1.0,<2.0)
+Requires-Dist: apify-client (>=1.7.1,<2.0.0)
+Requires-Dist: apify-shared (>=1.1.2,<2.0.0)
+Requires-Dist: colorama (>=0.4.6,<0.5.0)
+Requires-Dist: crawlee (>=0.3.0,<0.4.0)
+Requires-Dist: cryptography (>=39.0.0,<40.0.0)
+Requires-Dist: httpx (>=0.27.0,<0.28.0)
+Requires-Dist: lazy-object-proxy (>=1.10.0,<2.0.0)
+Requires-Dist: psutil (>=6.0.0,<7.0.0)
+Requires-Dist: pyee (>=11.0.0,<12.0.0)
+Requires-Dist: scrapy (>=2.11.0,<3.0.0) ; extra == "scrapy"
+Requires-Dist: sortedcollections (>=2.0.0,<3.0.0)
+Requires-Dist: typing-extensions (>=4.1.0,<5.0.0)
+Requires-Dist: websockets (>=10.1,<11.0)
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
 Project-URL: Documentation, https://docs.apify.com/sdk/python/

@@ -64,108 +70,27 @@ pip install apify[scrapy]
 
 For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
 
-## Examples
-
-Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
-
-### Apify SDK with HTTPX and BeautifulSoup
-
-This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
+## Example
 
 ```python
 from apify import Actor
 from bs4 import BeautifulSoup
 from httpx import AsyncClient
 
-
-async def main() -> None:
-    async with Actor:
-        # Retrieve the Actor input, and use default values if not provided.
-        actor_input = await Actor.get_input() or {}
-        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
-
-        # Open the default request queue for handling URLs to be processed.
-        request_queue = await Actor.open_request_queue()
-
-        # Enqueue the start URLs.
-        for start_url in start_urls:
-            url = start_url.get('url')
-            await request_queue.add_request(url)
-
-        # Process the URLs from the request queue.
-        while request := await request_queue.fetch_next_request():
-            Actor.log.info(f'Scraping {request.url} ...')
-
-            # Fetch the HTTP response from the specified URL using HTTPX.
-            async with AsyncClient() as client:
-                response = await client.get(request.url)
-
-            # Parse the HTML content using Beautiful Soup.
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Extract the desired data.
-            data = {
-                'url': actor_input['url'],
-                'title': soup.title.string,
-                'h1s': [h1.text for h1 in soup.find_all('h1')],
-                'h2s': [h2.text for h2 in soup.find_all('h2')],
-                'h3s': [h3.text for h3 in soup.find_all('h3')],
-            }
-
-            # Store the extracted data to the default dataset.
-            await Actor.push_data(data)
-```
-
-### Apify SDK with PlaywrightCrawler from Crawlee
-
-This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
-
-```python
-from apify import Actor, Request
-from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
-
-
 async def main() -> None:
     async with Actor:
-        # Retrieve the Actor input, and use default values if not provided.
-        actor_input = await Actor.get_input() or {}
-        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
-
-        # Exit if no start URLs are provided.
-        if not start_urls:
-            Actor.log.info('No start URLs specified in Actor input, exiting...')
-            await Actor.exit()
-
-        # Create a crawler.
-        crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-            headless=True,
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: PlaywrightCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': await context.page.title(),
-                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
-        # Run the crawler with the starting URLs.
-        await crawler.run(start_urls)
+        # Read the input parameters from the Actor input
+        actor_input = await Actor.get_input()
+        # Fetch the HTTP response from the specified URL
+        async with AsyncClient() as client:
+            response = await client.get(actor_input['url'])
+        # Process the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Push the extracted data
+        await Actor.push_data({
+            'url': actor_input['url'],
+            'title': soup.title.string,
+        })
 ```
 
 ## What are Actors?
apify-2.0.0b1/README.md ADDED

@@ -0,0 +1,90 @@
+# Apify SDK for Python
+
+The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
+in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
+event handling.
+
+If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
+check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
+
+## Installation
+
+The Apify SDK for Python is available on PyPI as the `apify` package.
+For default installation, using Pip, run the following:
+
+```bash
+pip install apify
+```
+
+For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
+To install Apify with the `scrapy` extra, use the following command:
+
+```bash
+pip install apify[scrapy]
+```
+
+## Documentation
+
+For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
+
+## Example
+
+```python
+from apify import Actor
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+
+async def main() -> None:
+    async with Actor:
+        # Read the input parameters from the Actor input
+        actor_input = await Actor.get_input()
+        # Fetch the HTTP response from the specified URL
+        async with AsyncClient() as client:
+            response = await client.get(actor_input['url'])
+        # Process the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Push the extracted data
+        await Actor.push_data({
+            'url': actor_input['url'],
+            'title': soup.title.string,
+        })
+```
+
+## What are Actors?
+
+Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
+They can do anything from small tasks such as filling in forms or unsubscribing from online services,
+all the way up to scraping and processing vast numbers of web pages.
+
+They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
+where you can run them at scale, monitor them, schedule them, or publish and monetize them.
+
+If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about)
+in the Apify platform documentation.
+
+## Creating Actors
+
+To create and run Actors through Apify Console,
+see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).
+
+To create and run Python Actors locally, check the documentation for
+[how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).
+
+## Guides
+
+To see how you can use the Apify SDK with other popular libraries used for web scraping,
+check out our guides for using
+[Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
+[Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
+[Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
+[Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
+or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).
+
+## Usage concepts
+
+To learn more about the features of the Apify SDK and how to use them,
+check out the Usage Concepts section in the sidebar,
+particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
+[working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
+[handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
+or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
{apify-2.0.0 → apify-2.0.0b1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify"
-version = "2.0.0"
+version = "2.0.0b1"
 description = "Apify SDK for Python"
 authors = ["Apify Technologies s.r.o. <support@apify.com>"]
 license = "Apache-2.0"

@@ -46,32 +46,41 @@ keywords = [
 # https://github.com/apify/apify-sdk-python/pull/154.
 [tool.poetry.dependencies]
 python = "^3.9"
-
-
-
-
-
-
-
-
-
+aiofiles = "^22.1.0"
+aioshutil = "^1.0"
+apify-client = "^1.7.1"
+apify-shared = "^1.1.2"
+colorama = "^0.4.6"
+crawlee = "^0.3.0"
+cryptography = "^39.0.0"
+httpx = "^0.27.0"
+lazy-object-proxy = "^1.10.0"
+psutil = "^6.0.0"
+pyee = "^11.0.0"
+scrapy = { version = "^2.11.0", optional = true }
+sortedcollections = "^2.0.0"
+typing-extensions = "^4.1.0"
+websockets = "^10.1"
 
 [tool.poetry.group.dev.dependencies]
-build = "
-filelock = "
-
-
-
-
-pytest = "
-pytest-
-pytest-
-pytest-
-pytest-
-
-
-
-
+build = "^1.2.0"
+filelock = "^3.15.0"
+mypy = "^1.11.0"
+pre-commit = "^3.8.0"
+pydoc-markdown = "^4.8.0"
+pytest = "^8.3.0"
+pytest-asyncio = "^0.24.0"
+pytest-cov = "^5.0.0"
+pytest-only = "^2.1.0"
+pytest-timeout = "^2.3.0"
+pytest-xdist = "^3.6.0"
+respx = "^0.21.0"
+ruff = "^0.6.0"
+setuptools = "^74.0.0" # setuptools are used by pytest but not explicitly required
+twine = "^5.1.0"
+types-aiofiles = "^24.1.0.20240626"
+types-colorama = "^0.4.15.20240311"
+types-psutil = "^6.0.0.20240621"
 
 [tool.poetry.extras]
 scrapy = ["scrapy"]

@@ -182,7 +191,7 @@ warn_unused_ignores = true
 exclude = []
 
 [[tool.mypy.overrides]]
-module = ['scrapy', 'scrapy.*', 'lazy_object_proxy']
+module = ['scrapy', 'scrapy.*', 'sortedcollections', 'lazy_object_proxy']
 ignore_missing_imports = true
 
 [tool.coverage.report]
apify-2.0.0b1/src/apify/__init__.py ADDED

@@ -0,0 +1,11 @@
+from importlib import metadata
+
+from crawlee.events._types import Event
+
+from apify._actor import Actor
+from apify._configuration import Configuration
+from apify._proxy_configuration import ProxyConfiguration, ProxyInfo
+
+__version__ = metadata.version('apify')
+
+__all__ = ['Actor', 'Event', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__']
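The new package root is noticeably slimmer. A small sketch of what it still re-exports, based on the `__all__` above; note that `Request`, `Webhook`, and `WebhookEventType` are no longer importable from `apify` directly:

```python
# Names re-exported by apify/__init__.py in 2.0.0b1.
from apify import Actor, Configuration, Event, ProxyConfiguration, ProxyInfo, __version__

print(__version__)  # resolved from the installed package metadata at import time
```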
{apify-2.0.0 → apify-2.0.0b1}/src/apify/_actor.py

@@ -11,7 +11,7 @@ from pydantic import AliasChoices
 from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
-from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
+from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType
 from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
 from crawlee import service_container
 from crawlee.events._types import Event, EventPersistStateData

@@ -19,12 +19,11 @@ from crawlee.events._types import Event, EventPersistStateData
 from apify._configuration import Configuration
 from apify._consts import EVENT_LISTENERS_TIMEOUT
 from apify._crypto import decrypt_input_secrets, load_private_key
-from apify._models import ActorRun
+from apify._log import logger
 from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
 from apify._proxy_configuration import ProxyConfiguration
 from apify._utils import get_system_info, is_running_in_ipython
 from apify.apify_storage_client import ApifyStorageClient
-from apify.log import _configure_logging, logger
 from apify.storages import Dataset, KeyValueStore, RequestQueue
 
 if TYPE_CHECKING:

@@ -33,8 +32,6 @@ if TYPE_CHECKING:
 
     from crawlee.proxy_configuration import _NewUrlFunction
 
-    from apify._models import Webhook
-
 
 MainReturnType = TypeVar('MainReturnType')
 
@@ -46,24 +43,16 @@ class _ActorType:
     _configuration: Configuration
     _is_exiting = False
 
-    def __init__(
-        self,
-        configuration: Configuration | None = None,
-        *,
-        configure_logging: bool = True,
-    ) -> None:
+    def __init__(self, config: Configuration | None = None) -> None:
         """Create an Actor instance.
 
         Note that you don't have to do this, all the functionality is accessible using the default instance
        (e.g. `Actor.open_dataset()`).
 
         Args:
-            configuration: The Actor configuration to be used. If not passed, a new Configuration instance will
-                be created.
-            configure_logging: Should the default logging configuration be configured?
+            config: The Actor configuration to be used. If not passed, a new Configuration instance will be created.
         """
-        self._configuration = configuration or Configuration.get_global_configuration()
-        self._configure_logging = configure_logging
+        self._configuration = config or Configuration.get_global_configuration()
         self._apify_client = self.new_client()
 
         self._event_manager: EventManager
@@ -89,9 +78,6 @@
         When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
         executing the block code, the `Actor.fail` method is called.
         """
-        if self._configure_logging:
-            _configure_logging(self._configuration)
-
         await self.init()
         return self
 

@@ -122,20 +108,15 @@
 
         return super().__repr__()
 
-    def __call__(self,
+    def __call__(self, config: Configuration) -> Self:
         """Make a new Actor instance with a non-default configuration."""
-        return self.__class__(
+        return self.__class__(config=config)
 
     @property
     def apify_client(self) -> ApifyClientAsync:
         """The ApifyClientAsync instance the Actor instance uses."""
         return self._apify_client
 
-    @property
-    def configuration(self) -> Configuration:
-        """The Configuration instance the Actor instance uses."""
-        return self._configuration
-
     @property
     def config(self) -> Configuration:
         """The Configuration instance the Actor instance uses."""
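The constructor parameter is renamed from `configuration` to `config`, the `configure_logging` flag is gone, and only the `config` property remains for reading it back. A minimal usage sketch, assuming `Configuration()` can be built with its defaults:

```python
from apify import Actor, Configuration

async def main() -> None:
    # Sketch: pass a custom configuration through Actor's __call__, which forwards it to __init__(config=...).
    config = Configuration()  # assumes default construction is enough for the use case
    async with Actor(config):
        Actor.log.info('Running with an explicitly provided configuration')
```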
@@ -552,8 +533,8 @@
         memory_mbytes: int | None = None,
         timeout: timedelta | None = None,
         wait_for_finish: int | None = None,
-        webhooks: list[Webhook] | None = None,
-    ) -> ActorRun:
+        webhooks: list[dict] | None = None,
+    ) -> dict:
         """Run an Actor on the Apify platform.
 
         Unlike `Actor.call`, this method just starts the run without waiting for finish.

@@ -574,6 +555,10 @@
             webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with
                 the Actor run which can be used to receive a notification, e.g. when the Actor finished or failed.
                 If you already have a webhook set up for the Actor or task, you do not have to add it again here.
+                Each webhook is represented by a dictionary containing these items:
+                * `event_types`: list of `WebhookEventType` values which trigger the webhook
+                * `request_url`: URL to which to send the webhook HTTP request
+                * `payload_template` (optional): Optional template for the request payload
 
         Returns:
             Info about the started Actor run
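With the `Webhook` model removed, `Actor.start()` (and `Actor.call()` below) now take plain dictionaries with the keys listed in the docstring, and return the raw run info as a dict. An illustrative sketch; the Actor ID, webhook URL, and the `id` field of the response are assumptions about typical usage rather than anything guaranteed by this diff:

```python
from apify import Actor
from apify_shared.consts import WebhookEventType

async def start_with_webhook() -> None:
    async with Actor:
        # Webhooks are now plain dicts instead of Webhook model instances.
        run_info = await Actor.start(
            actor_id='apify/hello-world',  # illustrative Actor ID
            run_input={'message': 'Hi'},
            webhooks=[
                {
                    'event_types': [WebhookEventType.ACTOR_RUN_SUCCEEDED],
                    'request_url': 'https://example.com/webhook',  # placeholder endpoint
                    # 'payload_template' is optional
                }
            ],
        )
        Actor.log.info(f'Started run: {run_info}')
```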
@@ -582,25 +567,16 @@
 
         client = self.new_client(token=token) if token else self._apify_client
 
-        if webhooks:
-            serialized_webhooks = [
-                hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
-            ]
-        else:
-            serialized_webhooks = None
-
-        api_result = await client.actor(actor_id).start(
+        return await client.actor(actor_id).start(
             run_input=run_input,
             content_type=content_type,
             build=build,
             memory_mbytes=memory_mbytes,
             timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
             wait_for_finish=wait_for_finish,
-            webhooks=serialized_webhooks,
+            webhooks=webhooks,
         )
 
-        return ActorRun.model_validate(api_result)
-
     async def abort(
         self,
         run_id: str,

@@ -608,7 +584,7 @@
         token: str | None = None,
         status_message: str | None = None,
         gracefully: bool | None = None,
-    ) -> ActorRun:
+    ) -> dict:
         """Abort given Actor run on the Apify platform using the current user account.
 
         The user account is determined by the `APIFY_TOKEN` environment variable.

@@ -631,9 +607,7 @@
         if status_message:
             await client.run(run_id).update(status_message=status_message)
 
-        api_result = await client.run(run_id).abort(gracefully=gracefully)
-
-        return ActorRun.model_validate(api_result)
+        return await client.run(run_id).abort(gracefully=gracefully)
 
     async def call(
         self,
@@ -645,9 +619,9 @@
         build: str | None = None,
         memory_mbytes: int | None = None,
         timeout: timedelta | None = None,
-        webhooks: list[Webhook] | None = None,
+        webhooks: list[dict] | None = None,
         wait: timedelta | None = None,
-    ) -> ActorRun | None:
+    ) -> dict | None:
         """Start an Actor on the Apify Platform and wait for it to finish before returning.
 
         It waits indefinitely, unless the wait argument is provided.

@@ -676,25 +650,16 @@
 
         client = self.new_client(token=token) if token else self._apify_client
 
-        if webhooks:
-            serialized_webhooks = [
-                hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
-            ]
-        else:
-            serialized_webhooks = None
-
-        api_result = await client.actor(actor_id).call(
+        return await client.actor(actor_id).call(
             run_input=run_input,
             content_type=content_type,
             build=build,
             memory_mbytes=memory_mbytes,
             timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
-            webhooks=serialized_webhooks,
+            webhooks=webhooks,
             wait_secs=int(wait.total_seconds()) if wait is not None else None,
         )
 
-        return ActorRun.model_validate(api_result)
-
     async def call_task(
         self,
         task_id: str,

@@ -703,10 +668,10 @@
         build: str | None = None,
         memory_mbytes: int | None = None,
         timeout: timedelta | None = None,
-        webhooks: list[Webhook] | None = None,
+        webhooks: list[dict] | None = None,
         wait: timedelta | None = None,
         token: str | None = None,
-    ) -> ActorRun | None:
+    ) -> dict | None:
         """Start an Actor task on the Apify Platform and wait for it to finish before returning.
 
         It waits indefinitely, unless the wait argument is provided.

@@ -738,24 +703,15 @@
 
         client = self.new_client(token=token) if token else self._apify_client
 
-        if webhooks:
-            serialized_webhooks = [
-                hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
-            ]
-        else:
-            serialized_webhooks = None
-
-        api_result = await client.task(task_id).call(
+        return await client.task(task_id).call(
             task_input=task_input,
             build=build,
             memory_mbytes=memory_mbytes,
             timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
-            webhooks=serialized_webhooks,
+            webhooks=webhooks,
             wait_secs=int(wait.total_seconds()) if wait is not None else None,
         )
 
-        return ActorRun.model_validate(api_result)
-
     async def metamorph(
         self,
         target_actor_id: str,
@@ -840,12 +796,14 @@
 
     async def add_webhook(
         self,
-        webhook: Webhook,
         *,
+        event_types: list[WebhookEventType],
+        request_url: str,
+        payload_template: str | None = None,
         ignore_ssl_errors: bool | None = None,
         do_not_retry: bool | None = None,
         idempotency_key: str | None = None,
-    ) -> None:
+    ) -> dict | None:
         """Create an ad-hoc webhook for the current Actor run.
 
         This webhook lets you receive a notification when the Actor run finished or failed.

@@ -856,7 +814,9 @@
         For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks).
 
         Args:
-
+            event_types: List of event types that should trigger the webhook. At least one is required.
+            request_url: URL that will be invoked once the webhook is triggered.
+            payload_template: Specification of the payload that will be sent to request_url
             ignore_ssl_errors: Whether the webhook should ignore SSL errors returned by request_url
             do_not_retry: Whether the webhook should retry sending the payload to request_url upon failure.
             idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create

@@ -869,17 +829,17 @@
 
         if not self.is_at_home():
             self.log.error('Actor.add_webhook() is only supported when running on the Apify platform.')
-            return
+            return None
 
         # If is_at_home() is True, config.actor_run_id is always set
         if not self._configuration.actor_run_id:
             raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
 
-        await self._apify_client.webhooks().create(
+        return await self._apify_client.webhooks().create(
             actor_run_id=self._configuration.actor_run_id,
-            event_types=webhook.event_types,
-            request_url=webhook.request_url,
-            payload_template=webhook.payload_template,
+            event_types=event_types,
+            request_url=request_url,
+            payload_template=payload_template,
             ignore_ssl_errors=ignore_ssl_errors,
             do_not_retry=do_not_retry,
             idempotency_key=idempotency_key,
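A sketch of the new keyword-only `Actor.add_webhook()` call; the endpoint URL and idempotency key are placeholders:

```python
from apify import Actor
from apify_shared.consts import WebhookEventType

async def main() -> None:
    async with Actor:
        # The webhook is now described by keyword arguments instead of a Webhook model instance.
        created = await Actor.add_webhook(
            event_types=[WebhookEventType.ACTOR_RUN_FAILED],
            request_url='https://example.com/run-failed',  # placeholder endpoint
            idempotency_key='my-unique-webhook-key',        # placeholder key
        )
        # Returns the API response as a dict, or None when not running on the Apify platform.
        Actor.log.info(f'Webhook created: {created}')
```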
@@ -890,7 +850,7 @@
         status_message: str,
         *,
         is_terminal: bool | None = None,
-    ) -> ActorRun | None:
+    ) -> dict | None:
         """Set the status message for the current Actor run.
 
         Args:

@@ -911,12 +871,10 @@
         if not self._configuration.actor_run_id:
             raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
 
-        api_result = await self._apify_client.run(self._configuration.actor_run_id).update(
+        return await self._apify_client.run(self._configuration.actor_run_id).update(
             status_message=status_message, is_status_message_terminal=is_terminal
         )
 
-        return ActorRun.model_validate(api_result)
-
     async def create_proxy_configuration(
         self,
         *,
apify-2.0.0b1/src/apify/_log.py ADDED

@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+import logging
+
+from crawlee._log_config import CrawleeLogFormatter
+
+# Name of the logger used throughout the library (resolves to 'apify')
+logger_name = __name__.split('.')[0]
+
+# Logger used throughout the library
+logger = logging.getLogger(logger_name)
+
+
+class ActorLogFormatter(CrawleeLogFormatter):  # Inherited from parent class
+    pass
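The formatter and the library logger now live in the private `apify._log` module, and the automatic logging setup was dropped from `Actor.__aenter__` (see the removed `_configure_logging` call above). A rough sketch of wiring up a handler manually, assuming the formatter can be constructed with its defaults:

```python
import logging

from apify._log import ActorLogFormatter, logger  # note: private module in 2.0.0b1

# Sketch: attach a stream handler with the Crawlee-style formatter by hand.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info('apify logger configured manually')
```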
{apify-2.0.0 → apify-2.0.0b1}/src/apify/_platform_event_manager.py

@@ -20,7 +20,7 @@ from crawlee.events._types import (
     EventSystemInfoData,
 )
 
-from apify.log import logger
+from apify._log import logger
 
 if TYPE_CHECKING:
     from types import TracebackType

@@ -94,11 +94,6 @@ class EventWithoutData(BaseModel):
     data: Any = None
 
 
-class DeprecatedEvent(BaseModel):
-    name: Literal['cpuInfo']
-    data: Annotated[dict[str, Any], Field(default_factory=dict)]
-
-
 class UnknownEvent(BaseModel):
     name: str
     data: Annotated[dict[str, Any], Field(default_factory=dict)]

@@ -114,13 +109,12 @@ EventMessage = Union[
 ]
 
 
-event_data_adapter: TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent] = TypeAdapter(
+event_data_adapter: TypeAdapter[EventMessage | UnknownEvent] = TypeAdapter(
     Union[
         Annotated[
             EventMessage,
             Discriminator('name'),
         ],
-        DeprecatedEvent,
         UnknownEvent,
     ]
 )

@@ -195,9 +189,6 @@ class PlatformEventManager(EventManager):
                 try:
                     parsed_message = event_data_adapter.validate_json(message)
 
-                    if isinstance(parsed_message, DeprecatedEvent):
-                        continue
-
                     if isinstance(parsed_message, UnknownEvent):
                         logger.info(
                             f'Unknown message received: event_name={parsed_message.name}, '
{apify-2.0.0 → apify-2.0.0b1}/src/apify/_proxy_configuration.py

@@ -16,7 +16,7 @@ from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
 from crawlee.proxy_configuration import _NewUrlFunction
 
 from apify._configuration import Configuration
-from apify.log import logger
+from apify._log import logger
 
 if TYPE_CHECKING:
     from apify_client import ApifyClientAsync
{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_dataset_client.py

@@ -60,20 +60,18 @@ class DatasetClient(BaseDatasetClient):
         view: str | None = None,
     ) -> DatasetItemsListPage:
         return DatasetItemsListPage.model_validate(
-
-
-
-
-
-
-
-
-
-
-
-
-                view=view,
-            )
+            await self._client.list_items(
+                offset=offset,
+                limit=limit,
+                clean=clean,
+                desc=desc,
+                fields=fields,
+                omit=omit,
+                unwind=unwind,
+                skip_empty=skip_empty,
+                skip_hidden=skip_hidden,
+                flatten=flatten,
+                view=view,
             )
         )
 
{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_request_queue_client.py

@@ -2,7 +2,6 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-from more_itertools import chunked
 from typing_extensions import override
 
 from crawlee import Request

@@ -158,11 +157,8 @@ class RequestQueueClient(BaseRequestQueueClient):
         *,
         forefront: bool = False,
     ) -> BatchRequestsOperationResponse:
-
-
-
-        for chunk in chunked(requests, 25):  # The API endpoint won't accept more than 25 requests at once
-            response = await self._client.batch_add_requests(
+        return BatchRequestsOperationResponse.model_validate(
+            await self._client.batch_add_requests(
                 requests=[
                     r.model_dump(
                         by_alias=True,

@@ -174,18 +170,10 @@
                         'data',
                         },
                    )
-                    for r in chunk
+                    for r in requests
                 ],
                 forefront=forefront,
             )
-            processed.extend(response['processedRequests'])
-            unprocessed.extend(response['unprocessedRequests'])
-
-        return BatchRequestsOperationResponse.model_validate(
-            {
-                'processedRequests': processed,
-                'unprocessedRequests': unprocessed,
-            }
         )
 
     @override
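The removed loop batched requests into chunks of 25 because, per the deleted comment, the API endpoint will not accept more than 25 requests at once. With that loop gone, callers adding large lists may want to chunk on their own side; a generic sketch, where the helper name and callback shape are hypothetical:

```python
from collections.abc import Awaitable, Callable, Sequence

from crawlee import Request

async def add_in_batches(
    add_batch: Callable[[Sequence[Request]], Awaitable[object]],
    requests: Sequence[Request],
    batch_size: int = 25,
) -> None:
    # Hypothetical caller-side helper: slice the request list so each call stays
    # within the 25-request limit mentioned in the removed comment.
    for start in range(0, len(requests), batch_size):
        await add_batch(requests[start:start + batch_size])
```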
{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/middlewares/apify_proxy.py

@@ -93,7 +93,7 @@ class ApifyHttpProxyMiddleware:
         request: Request,
         exception: Exception,
         spider: Spider,
-    ) -> None:
+    ) -> None | Request:
         """Process an exception that occurs during request processing.
 
         Args:

@@ -102,9 +102,8 @@
             spider: Scrapy Spider object.
 
         Returns:
-
-
-                exception handling kicks in.
+            If a TunnelError occurs, return the request object to halt its processing in the middleware pipeline.
+            Return None otherwise to allow the continuation of request processing.
         """
         Actor.log.debug(
             f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',

@@ -115,6 +114,9 @@
                 f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", '
                 'reason="{exception}", skipping...'
             )
+            return request
+
+        return None
 
     async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
         """Get a new proxy URL.
apify-2.0.0/README.md DELETED

@@ -1,171 +0,0 @@
-# Apify SDK for Python
-
-The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
-in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
-event handling.
-
-If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
-check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
-
-## Installation
-
-The Apify SDK for Python is available on PyPI as the `apify` package.
-For default installation, using Pip, run the following:
-
-```bash
-pip install apify
-```
-
-For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
-To install Apify with the `scrapy` extra, use the following command:
-
-```bash
-pip install apify[scrapy]
-```
-
-## Documentation
-
-For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
-
-## Examples
-
-Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
-
-### Apify SDK with HTTPX and BeautifulSoup
-
-This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
-
-```python
-from apify import Actor
-from bs4 import BeautifulSoup
-from httpx import AsyncClient
-
-
-async def main() -> None:
-    async with Actor:
-        # Retrieve the Actor input, and use default values if not provided.
-        actor_input = await Actor.get_input() or {}
-        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
-
-        # Open the default request queue for handling URLs to be processed.
-        request_queue = await Actor.open_request_queue()
-
-        # Enqueue the start URLs.
-        for start_url in start_urls:
-            url = start_url.get('url')
-            await request_queue.add_request(url)
-
-        # Process the URLs from the request queue.
-        while request := await request_queue.fetch_next_request():
-            Actor.log.info(f'Scraping {request.url} ...')
-
-            # Fetch the HTTP response from the specified URL using HTTPX.
-            async with AsyncClient() as client:
-                response = await client.get(request.url)
-
-            # Parse the HTML content using Beautiful Soup.
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            # Extract the desired data.
-            data = {
-                'url': actor_input['url'],
-                'title': soup.title.string,
-                'h1s': [h1.text for h1 in soup.find_all('h1')],
-                'h2s': [h2.text for h2 in soup.find_all('h2')],
-                'h3s': [h3.text for h3 in soup.find_all('h3')],
-            }
-
-            # Store the extracted data to the default dataset.
-            await Actor.push_data(data)
-```
-
-### Apify SDK with PlaywrightCrawler from Crawlee
-
-This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
-
-```python
-from apify import Actor, Request
-from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
-
-
-async def main() -> None:
-    async with Actor:
-        # Retrieve the Actor input, and use default values if not provided.
-        actor_input = await Actor.get_input() or {}
-        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
-
-        # Exit if no start URLs are provided.
-        if not start_urls:
-            Actor.log.info('No start URLs specified in Actor input, exiting...')
-            await Actor.exit()
-
-        # Create a crawler.
-        crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-            headless=True,
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: PlaywrightCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': await context.page.title(),
-                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
-        # Run the crawler with the starting URLs.
-        await crawler.run(start_urls)
-```
-
-## What are Actors?
-
-Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
-They can do anything from small tasks such as filling in forms or unsubscribing from online services,
-all the way up to scraping and processing vast numbers of web pages.
-
-They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
-where you can run them at scale, monitor them, schedule them, or publish and monetize them.
-
-If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about)
-in the Apify platform documentation.
-
-## Creating Actors
-
-To create and run Actors through Apify Console,
-see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).
-
-To create and run Python Actors locally, check the documentation for
-[how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).
-
-## Guides
-
-To see how you can use the Apify SDK with other popular libraries used for web scraping,
-check out our guides for using
-[Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
-[Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
-[Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
-[Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
-or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).
-
-## Usage concepts
-
-To learn more about the features of the Apify SDK and how to use them,
-check out the Usage Concepts section in the sidebar,
-particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
-[working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
-[handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
-or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
apify-2.0.0/src/apify/__init__.py DELETED

@@ -1,24 +0,0 @@
-from importlib import metadata
-
-from apify_shared.consts import WebhookEventType
-from crawlee import Request
-from crawlee.events._types import Event
-
-from apify._actor import Actor
-from apify._configuration import Configuration
-from apify._models import Webhook
-from apify._proxy_configuration import ProxyConfiguration, ProxyInfo
-
-__version__ = metadata.version('apify')
-
-__all__ = [
-    'Actor',
-    'Event',
-    'Configuration',
-    'ProxyConfiguration',
-    'ProxyInfo',
-    'Request',
-    'Webhook',
-    'WebhookEventType',
-    '__version__',
-]
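Two of the removed root re-exports still exist in their source packages, as the deleted imports above show; a sketch of the replacement imports:

```python
# Sketch: imports that replace the removed re-exports from the apify package root.
from crawlee import Request                       # was `from apify import Request`
from apify_shared.consts import WebhookEventType  # was `from apify import WebhookEventType`

# `Webhook` has no direct replacement in 2.0.0b1; webhooks are passed as plain dicts instead.
```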
apify-2.0.0/src/apify/_models.py DELETED

@@ -1,110 +0,0 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
-from __future__ import annotations
-
-from datetime import datetime, timedelta
-from typing import Annotated
-
-from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
-
-from apify_shared.consts import ActorJobStatus, MetaOrigin, WebhookEventType
-from crawlee._utils.models import timedelta_ms
-from crawlee._utils.urls import validate_http_url
-
-
-class Webhook(BaseModel):
-    __model_config__ = ConfigDict(populate_by_name=True)
-
-    event_types: Annotated[
-        list[WebhookEventType],
-        Field(description='Event types that should trigger the webhook'),
-    ]
-    request_url: Annotated[
-        str,
-        Field(description='URL that the webhook should call'),
-        BeforeValidator(validate_http_url),
-    ]
-    payload_template: Annotated[
-        str | None,
-        Field(description='Template for the payload sent by the webook'),
-    ] = None
-
-
-class ActorRunMeta(BaseModel):
-    __model_config__ = ConfigDict(populate_by_name=True)
-
-    origin: Annotated[MetaOrigin, Field()]
-
-
-class ActorRunStats(BaseModel):
-    __model_config__ = ConfigDict(populate_by_name=True)
-
-    input_body_len: Annotated[int, Field(alias='inputBodyLen')]
-    restart_count: Annotated[int, Field(alias='restartCount')]
-    resurrect_count: Annotated[int, Field(alias='resurrectCount')]
-    mem_avg_bytes: Annotated[float | None, Field(alias='memAvgBytes')] = None
-    mem_max_bytes: Annotated[int | None, Field(alias='memMaxBytes')] = None
-    mem_current_bytes: Annotated[int | None, Field(alias='memCurrentBytes')] = None
-    cpu_avg_usage: Annotated[float | None, Field(alias='cpuAvgUsage')] = None
-    cpu_max_usage: Annotated[float | None, Field(alias='cpuMaxUsage')] = None
-    cpu_current_usage: Annotated[float | None, Field(alias='cpuCurrentUsage')] = None
-    net_rx_bytes: Annotated[int | None, Field(alias='netRxBytes')] = None
-    net_tx_bytes: Annotated[int | None, Field(alias='netTxBytes')] = None
-    duration: Annotated[timedelta_ms | None, Field(alias='durationMillis')] = None
-    run_time: Annotated[timedelta | None, Field(alias='runTimeSecs')] = None
-    metamorph: Annotated[int | None, Field(alias='metamorph')] = None
-    compute_units: Annotated[float, Field(alias='computeUnits')]
-
-
-class ActorRunOptions(BaseModel):
-    __model_config__ = ConfigDict(populate_by_name=True)
-
-    build: str
-    timeout: Annotated[timedelta, Field(alias='timeoutSecs')]
-    memory_mbytes: Annotated[int, Field(alias='memoryMbytes')]
-    disk_mbytes: Annotated[int, Field(alias='diskMbytes')]
-
-
-class ActorRunUsage(BaseModel):
-    __model_config__ = ConfigDict(populate_by_name=True)
-
-    actor_compute_units: Annotated[float | None, Field(alias='ACTOR_COMPUTE_UNITS')] = None
-    dataset_reads: Annotated[float | None, Field(alias='DATASET_READS')] = None
-    dataset_writes: Annotated[float | None, Field(alias='DATASET_WRITES')] = None
-    key_value_store_reads: Annotated[float | None, Field(alias='KEY_VALUE_STORE_READS')] = None
-    key_value_store_writes: Annotated[float | None, Field(alias='KEY_VALUE_STORE_WRITES')] = None
-    key_value_store_lists: Annotated[float | None, Field(alias='KEY_VALUE_STORE_LISTS')] = None
-    request_queue_reads: Annotated[float | None, Field(alias='REQUEST_QUEUE_READS')] = None
-    request_queue_writes: Annotated[float | None, Field(alias='REQUEST_QUEUE_WRITES')] = None
-    data_transfer_internal_gbytes: Annotated[float | None, Field(alias='DATA_TRANSFER_INTERNAL_GBYTES')] = None
-    data_transfer_external_gbytes: Annotated[float | None, Field(alias='DATA_TRANSFER_EXTERNAL_GBYTES')] = None
-    proxy_residential_transfer_gbytes: Annotated[float | None, Field(alias='PROXY_RESIDENTIAL_TRANSFER_GBYTES')] = None
-    proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
-
-
-class ActorRun(BaseModel):
-    __model_config__ = ConfigDict(populate_by_name=True)
-
-    id: Annotated[str, Field(alias='id')]
-    act_id: Annotated[str, Field(alias='actId')]
-    user_id: Annotated[str, Field(alias='userId')]
-    actor_task_id: Annotated[str | None, Field(alias='actorTaskId')] = None
-    started_at: Annotated[datetime, Field(alias='startedAt')]
-    finished_at: Annotated[datetime | None, Field(alias='finishedAt')] = None
-    status: Annotated[ActorJobStatus, Field(alias='status')]
-    status_message: Annotated[str | None, Field(alias='statusMessage')] = None
-    is_status_message_terminal: Annotated[bool | None, Field(alias='isStatusMessageTerminal')] = None
-    meta: Annotated[ActorRunMeta, Field(alias='meta')]
-    stats: Annotated[ActorRunStats, Field(alias='stats')]
-    options: Annotated[ActorRunOptions, Field(alias='options')]
-    build_id: Annotated[str, Field(alias='buildId')]
-    exit_code: Annotated[int | None, Field(alias='exitCode')] = None
-    default_key_value_store_id: Annotated[str, Field(alias='defaultKeyValueStoreId')]
-    default_dataset_id: Annotated[str, Field(alias='defaultDatasetId')]
-    default_request_queue_id: Annotated[str, Field(alias='defaultRequestQueueId')]
-    build_number: Annotated[str | None, Field(alias='buildNumber')] = None
-    container_url: Annotated[str, Field(alias='containerUrl')]
-    is_container_server_ready: Annotated[bool | None, Field(alias='isContainerServerReady')] = None
-    git_branch_name: Annotated[str | None, Field(alias='gitBranchName')] = None
-    usage: Annotated[ActorRunUsage | None, Field(alias='usage')] = None
-    usage_total_usd: Annotated[float | None, Field(alias='usageTotalUsd')] = None
-    usage_usd: Annotated[ActorRunUsage | None, Field(alias='usageUsd')] = None
apify-2.0.0/src/apify/log.py DELETED

@@ -1,43 +0,0 @@
-from __future__ import annotations
-
-import logging
-from typing import TYPE_CHECKING
-
-from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
-
-if TYPE_CHECKING:
-    from apify import Configuration
-
-# Name of the logger used throughout the library (resolves to 'apify')
-logger_name = __name__.split('.')[0]
-
-# Logger used throughout the library
-logger = logging.getLogger(logger_name)
-
-
-class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
-    pass
-
-
-def _configure_logging(configuration: Configuration) -> None:
-    apify_client_logger = logging.getLogger('apify_client')
-    configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
-
-    level = get_configured_log_level(configuration)
-
-    # Keep apify_client logger quiet unless debug logging is requested
-    if level > logging.DEBUG:
-        apify_client_logger.setLevel(logging.INFO)
-    else:
-        apify_client_logger.setLevel(level)
-
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
-    # Use configured log level for apify logger
-    apify_logger = logging.getLogger('apify')
-    configure_logger(apify_logger, configuration, remove_old_handlers=True)
apify-2.0.0/src/apify/py.typed DELETED
File without changes

apify-2.0.0/src/apify/scrapy/middlewares/py.typed DELETED
File without changes

apify-2.0.0/src/apify/scrapy/pipelines/py.typed DELETED
File without changes

apify-2.0.0/src/apify/scrapy/py.typed DELETED
File without changes

apify-2.0.0/src/apify/storages/py.typed DELETED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/LICENSE RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/_configuration.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/_consts.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/_crypto.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/_utils.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/__init__.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_apify_storage_client.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_dataset_collection_client.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_key_value_store_client.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_key_value_store_collection_client.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/apify_storage_client/_request_queue_collection_client.py RENAMED
File without changes

{apify-2.0.0/src/apify/apify_storage_client → apify-2.0.0b1/src/apify}/py.typed RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/__init__.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/middlewares/__init__.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/pipelines/__init__.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/pipelines/actor_dataset_push.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/requests.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/scheduler.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/scrapy/utils.py RENAMED
File without changes

{apify-2.0.0 → apify-2.0.0b1}/src/apify/storages/__init__.py RENAMED
File without changes