apify: 1.7.3b4 → 2.0.0 (py3-none-any.whl)
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of apify has been flagged by the registry as potentially problematic.
- apify/__init__.py +19 -4
- apify/_actor.py +979 -0
- apify/_configuration.py +310 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +29 -27
- apify/_models.py +110 -0
- apify/_platform_event_manager.py +222 -0
- apify/_proxy_configuration.py +316 -0
- apify/_utils.py +0 -497
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +56 -0
- apify/apify_storage_client/_dataset_client.py +188 -0
- apify/apify_storage_client/_dataset_collection_client.py +50 -0
- apify/apify_storage_client/_key_value_store_client.py +98 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +50 -0
- apify/apify_storage_client/_request_queue_client.py +208 -0
- apify/apify_storage_client/_request_queue_collection_client.py +50 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +24 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +21 -21
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +1 -1
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +55 -54
- apify/scrapy/scheduler.py +19 -13
- apify/scrapy/utils.py +2 -31
- apify/storages/__init__.py +2 -10
- apify/storages/py.typed +0 -0
- apify-2.0.0.dist-info/METADATA +209 -0
- apify-2.0.0.dist-info/RECORD +37 -0
- {apify-1.7.3b4.dist-info → apify-2.0.0.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1357
- apify/config.py +0 -130
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.3b4.dist-info/METADATA +0 -150
- apify-1.7.3b4.dist-info/RECORD +0 -41
- apify-1.7.3b4.dist-info/top_level.txt +0 -1
- {apify-1.7.3b4.dist-info → apify-2.0.0.dist-info}/LICENSE +0 -0
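
The file listing above amounts to a full restructure: the 1.x public modules (`apify/actor.py`, `apify/config.py`, `apify/consts.py`, `apify/storages/*`) and the bundled `apify/_memory_storage/*` emulation are removed, replaced by underscore-prefixed internals plus crawlee-backed storage clients under `apify/apify_storage_client/`. Code that only touches the public `Actor` facade should keep working across the upgrade. A minimal sketch of such code (the `Actor` methods used here come from the SDK's public API and the README diff below; treat exact signatures as assumptions, not something this diff guarantees):

```python
import asyncio

from apify import Actor


async def main() -> None:
    # The `async with Actor` block handles Actor init/exit in both 1.x and 2.0.
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Storages are opened through the same facade; in 2.0 they are
        # backed by crawlee instead of the removed apify._memory_storage.
        dataset = await Actor.open_dataset()
        await dataset.push_data({'echo': actor_input})


if __name__ == '__main__':
    asyncio.run(main())
```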
--- /dev/null
+++ apify-2.0.0.dist-info/METADATA
@@ -0,0 +1,209 @@
+Metadata-Version: 2.1
+Name: apify
+Version: 2.0.0
+Summary: Apify SDK for Python
+License: Apache-2.0
+Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
+Author: Apify Technologies s.r.o.
+Author-email: support@apify.com
+Requires-Python: >=3.9,<4.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries
+Provides-Extra: scrapy
+Requires-Dist: apify-client (>=1.7.1)
+Requires-Dist: apify-shared (>=1.1.2)
+Requires-Dist: crawlee (>=0.3.5)
+Requires-Dist: cryptography (>=42.0.0)
+Requires-Dist: httpx (>=0.27.0)
+Requires-Dist: lazy-object-proxy (>=1.10.0)
+Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
+Requires-Dist: typing-extensions (>=4.1.0)
+Requires-Dist: websockets (>=10.0)
+Project-URL: Apify Homepage, https://apify.com
+Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
+Project-URL: Documentation, https://docs.apify.com/sdk/python/
+Project-URL: Homepage, https://docs.apify.com/sdk/python/
+Project-URL: Issue Tracker, https://github.com/apify/apify-sdk-python/issues
+Project-URL: Repository, https://github.com/apify/apify-sdk-python
+Description-Content-Type: text/markdown
+
+# Apify SDK for Python
+
+The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
+in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
+event handling.
+
+If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
+check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
+
+## Installation
+
+The Apify SDK for Python is available on PyPI as the `apify` package.
+For a default installation using pip, run the following:
+
+```bash
+pip install apify
+```
+
+For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
+To install Apify with the `scrapy` extra, use the following command:
+
+```bash
+pip install apify[scrapy]
+```
+
+## Documentation
+
+For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
+
+## Examples
+
+Below are a few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
+
+### Apify SDK with HTTPX and BeautifulSoup
+
+This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
+
+```python
+from apify import Actor
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+
+
+async def main() -> None:
+    async with Actor:
+        # Retrieve the Actor input, and use default values if not provided.
+        actor_input = await Actor.get_input() or {}
+        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
+
+        # Open the default request queue for handling URLs to be processed.
+        request_queue = await Actor.open_request_queue()
+
+        # Enqueue the start URLs.
+        for start_url in start_urls:
+            url = start_url.get('url')
+            await request_queue.add_request(url)
+
+        # Process the URLs from the request queue.
+        while request := await request_queue.fetch_next_request():
+            Actor.log.info(f'Scraping {request.url} ...')
+
+            # Fetch the HTTP response from the specified URL using HTTPX.
+            async with AsyncClient() as client:
+                response = await client.get(request.url)
+
+            # Parse the HTML content using Beautiful Soup.
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract the desired data.
+            data = {
+                'url': request.url,
+                'title': soup.title.string,
+                'h1s': [h1.text for h1 in soup.find_all('h1')],
+                'h2s': [h2.text for h2 in soup.find_all('h2')],
+                'h3s': [h3.text for h3 in soup.find_all('h3')],
+            }
+
+            # Store the extracted data to the default dataset.
+            await Actor.push_data(data)
+
+            # Mark the request as handled so it is not processed again.
+            await request_queue.mark_request_as_handled(request)
+```
+
+### Apify SDK with PlaywrightCrawler from Crawlee
+
+This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
+
+```python
+from apify import Actor, Request
+from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+
+
+async def main() -> None:
+    async with Actor:
+        # Retrieve the Actor input, and use default values if not provided.
+        actor_input = await Actor.get_input() or {}
+        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
+
+        # Exit if no start URLs are provided.
+        if not start_urls:
+            Actor.log.info('No start URLs specified in Actor input, exiting...')
+            await Actor.exit()
+
+        # Create a crawler.
+        crawler = PlaywrightCrawler(
+            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            max_requests_per_crawl=50,
+            headless=True,
+        )
+
+        # Define a request handler, which will be called for every request.
+        @crawler.router.default_handler
+        async def request_handler(context: PlaywrightCrawlingContext) -> None:
+            url = context.request.url
+            Actor.log.info(f'Scraping {url}...')
+
+            # Extract the desired data.
+            data = {
+                'url': context.request.url,
+                'title': await context.page.title(),
+                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
+                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
+                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+            }
+
+            # Store the extracted data to the default dataset.
+            await context.push_data(data)
+
+            # Enqueue additional links found on the current page.
+            await context.enqueue_links()
+
+        # Run the crawler with the starting URLs.
+        await crawler.run(start_urls)
+```
+
+## What are Actors?
+
+Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
+They can do anything from small tasks such as filling in forms or unsubscribing from online services,
+all the way up to scraping and processing vast numbers of web pages.
+
+They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
+where you can run them at scale, monitor them, schedule them, or publish and monetize them.
+
+If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about)
+in the Apify platform documentation.
+
+## Creating Actors
+
+To create and run Actors through Apify Console,
+see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).
+
+To create and run Python Actors locally, check the documentation for
+[how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).
+
+## Guides
+
+To see how you can use the Apify SDK with other popular libraries used for web scraping,
+check out our guides for using
+[Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
+[Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
+[Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
+[Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
+or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).
+
+## Usage concepts
+
+To learn more about the features of the Apify SDK and how to use them,
+check out the Usage Concepts section in the sidebar,
+particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
+[working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
+[handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
+or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
+
--- /dev/null
+++ apify-2.0.0.dist-info/RECORD
@@ -0,0 +1,37 @@
+apify/__init__.py,sha256=ikoi2EpDYl6y-XSVtlU8UsdQdMEyOiIJCRRAaZFDOP8,550
+apify/_actor.py,sha256=oPgQ3rxxIEzVcZ9XtI3lf1a_6gwIMgxihNuYGjJpGww,41816
+apify/_configuration.py,sha256=gf7YOun32Whc9DamhoWDLmcUeNwtWVmmBPrl4oq6s4I,8997
+apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
+apify/_crypto.py,sha256=b4Czs1NLPkaNzkPjovObjSIbsKnRrgtBkM9JvOysUMA,5612
+apify/_models.py,sha256=oYlTEr-DyQAE-V2rrYD5PhUxTXVPdAig7QV-u6CJw3E,5571
+apify/_platform_event_manager.py,sha256=h5fBmXtKD4t-yCdOSiLM1-DnCrIbGEmYmz2mOU3A8bA,7627
+apify/_proxy_configuration.py,sha256=VdKh_AyCwaCUlpCyaCe30L2S9OZ-vL1SN1g8oLwSeYA,13074
+apify/_utils.py,sha256=x4lnR9RNulySiEQTft-GeQqUcJsRr0k8p0Sv9NTeWFg,638
+apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
+apify/apify_storage_client/_apify_storage_client.py,sha256=xi4OFchxhe-1-sykanH6Zcya4OcBhn2uf7OQ1pV4Ins,2338
+apify/apify_storage_client/_dataset_client.py,sha256=j9seF2OKvbSMD9R9XF9fpa1vtr_1w4JcRV--WCmvU4E,5501
+apify/apify_storage_client/_dataset_collection_client.py,sha256=fkYvYGQCigHD2CDzpWk0swNAkfvAinAhMGpYqllle3E,1445
+apify/apify_storage_client/_key_value_store_client.py,sha256=uyeQgb75sGFsqIS4sq4hEZ3QP81COLfS3tmTqHc0tso,3340
+apify/apify_storage_client/_key_value_store_collection_client.py,sha256=vCtMTI-jx89Qp5WHILDNkCthwLuv0MAwm1J_5E4aypU,1519
+apify/apify_storage_client/_request_queue_client.py,sha256=P8ws8jEzi2PWpp-cvYfV7kwuKbgH813BpNQ_wMSVtTA,6278
+apify/apify_storage_client/_request_queue_collection_client.py,sha256=NnO73UJ9ZrjV8xoudo30wfaM-SojRkG0guhxDyB-K1g,1527
+apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+apify/log.py,sha256=pX6ppIvds8OKqjFpIcshqG4zp_5DiOUU31ksyfSExto,1392
+apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+apify/scrapy/__init__.py,sha256=qDPV_zTRFaUqoFOyS5g4uBfz-UCkmWYJ82VXQ_3Cw6k,348
+apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
+apify/scrapy/middlewares/apify_proxy.py,sha256=_1WO7NKHxIcPf8mSNjsqANTEsx7ygMTuRQW9fbwKMO8,5837
+apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
+apify/scrapy/pipelines/actor_dataset_push.py,sha256=QERmmExQOGIKQ70-p-lCj5qyE-c-fnYplEqd4mgaB1Q,953
+apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+apify/scrapy/requests.py,sha256=pmm2M-cwrTXyI3t1nRBo9pS6nHfc4zkzS25-NXxzd9I,7637
+apify/scrapy/scheduler.py,sha256=AAIKY5i1QxkC1mtmix6n3M2eQaOw-d1T56Noue9xToc,6013
+apify/scrapy/utils.py,sha256=tz_Y8CTqe6KbyMMhLF3m7qqR46jtNH5U7Ty7e19roPU,2814
+apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
+apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+apify-2.0.0.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-2.0.0.dist-info/METADATA,sha256=DhojQDiiwKEwS7VcAufA7ERVHYHKk5mqHFtddWXL4Qk,8604
+apify-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+apify-2.0.0.dist-info/RECORD,,
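
Each RECORD row above has the form `path,sha256=<digest>,<size>`, where the digest is the URL-safe, unpadded Base64 encoding of the file's SHA-256 hash (per the wheel RECORD format). The repeated `47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU` entries are simply the hash of an empty file (the zero-byte `py.typed` markers), which is easy to verify:

```python
import base64
import hashlib

# Digest of an empty file, encoded the way wheel RECORD files encode it.
digest = hashlib.sha256(b'').digest()
encoded = base64.urlsafe_b64encode(digest).rstrip(b'=').decode()
print(encoded)  # -> 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
```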
--- apify/_memory_storage/file_storage_utils.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from __future__ import annotations
-
-import os
-
-import aiofiles
-from aiofiles.os import makedirs
-from apify_shared.utils import json_dumps
-
-from apify._utils import force_remove
-
-
-async def update_metadata(*, data: dict, entity_directory: str, write_metadata: bool) -> None:
-    # Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present
-    if not write_metadata:
-        return
-
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    # Write the metadata to the file
-    file_path = os.path.join(entity_directory, '__metadata__.json')
-    async with aiofiles.open(file_path, mode='wb') as f:
-        await f.write(json_dumps(data).encode('utf-8'))
-
-
-async def _update_dataset_items(
-    *,
-    data: list[tuple[str, dict]],
-    entity_directory: str,
-    persist_storage: bool,
-) -> None:
-    # Skip writing files to the disk if the client has the option set to false
-    if not persist_storage:
-        return
-
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    # Save all the new items to the disk
-    for idx, item in data:
-        file_path = os.path.join(entity_directory, f'{idx}.json')
-        async with aiofiles.open(file_path, mode='wb') as f:
-            await f.write(json_dumps(item).encode('utf-8'))
-
-
-async def update_request_queue_item(
-    *,
-    request_id: str,
-    request: dict,
-    entity_directory: str,
-    persist_storage: bool,
-) -> None:
-    # Skip writing files to the disk if the client has the option set to false
-    if not persist_storage:
-        return
-
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    # Write the request to the file
-    file_path = os.path.join(entity_directory, f'{request_id}.json')
-    async with aiofiles.open(file_path, mode='wb') as f:
-        await f.write(json_dumps(request).encode('utf-8'))
-
-
-async def delete_request(*, request_id: str, entity_directory: str) -> None:
-    # Ensure the directory for the entity exists
-    await makedirs(entity_directory, exist_ok=True)
-
-    file_path = os.path.join(entity_directory, f'{request_id}.json')
-    await force_remove(file_path)
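
The removed helpers above all share one pattern: ensure the entity directory exists, then write (or remove) a JSON record with `aiofiles`. A self-contained sketch of that pattern, with hypothetical names (only `aiofiles`, `aiofiles.os.makedirs`, and the stdlib are real APIs here; the removed code used `apify_shared.utils.json_dumps` where this sketch uses `json.dumps`):

```python
import asyncio
import json
import os

import aiofiles
from aiofiles.os import makedirs


async def write_json_record(entity_directory: str, record_id: str, data: dict) -> None:
    # Ensure the directory for the entity exists, as the removed helpers did.
    await makedirs(entity_directory, exist_ok=True)

    # Persist the record as UTF-8 encoded JSON, one file per record.
    file_path = os.path.join(entity_directory, f'{record_id}.json')
    async with aiofiles.open(file_path, mode='wb') as f:
        await f.write(json.dumps(data, ensure_ascii=False, indent=2).encode('utf-8'))


asyncio.run(write_json_record('./storage/datasets/default', '000000001', {'hello': 'world'}))
```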
--- apify/_memory_storage/memory_storage_client.py
+++ /dev/null
@@ -1,219 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import contextlib
-import os
-from pathlib import Path
-
-import aioshutil
-from aiofiles import ospath
-from aiofiles.os import rename, scandir
-from apify_shared.consts import ApifyEnvVars
-from apify_shared.utils import ignore_docs
-
-from apify._memory_storage.resource_clients.dataset import DatasetClient
-from apify._memory_storage.resource_clients.dataset_collection import DatasetCollectionClient
-from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient
-from apify._memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient
-from apify._memory_storage.resource_clients.request_queue import RequestQueueClient
-from apify._memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient
-from apify._utils import maybe_parse_bool
-
-"""
-Memory storage emulates data storages that are available on the Apify platform.
-Specifically, it emulates clients for datasets, key-value stores and request queues.
-The data are held in-memory and persisted locally if `persist_storage` is True.
-The metadata of the storages is also persisted if `write_metadata` is True.
-"""
-
-
-@ignore_docs
-class MemoryStorageClient:
-    """Class representing an in-memory storage."""
-
-    _local_data_directory: str
-    _datasets_directory: str
-    _key_value_stores_directory: str
-    _request_queues_directory: str
-    _write_metadata: bool
-    _persist_storage: bool
-    _datasets_handled: list[DatasetClient]
-    _key_value_stores_handled: list[KeyValueStoreClient]
-    _request_queues_handled: list[RequestQueueClient]
-
-    _purged_on_start: bool = False
-    _purge_lock: asyncio.Lock
-
-    """Indicates whether a purge was already performed on this instance"""
-
-    def __init__(
-        self: MemoryStorageClient,
-        *,
-        local_data_directory: str | None = None,
-        write_metadata: bool | None = None,
-        persist_storage: bool | None = None,
-    ) -> None:
-        """Initialize the MemoryStorageClient.
-
-        Args:
-            local_data_directory (str, optional): A local directory where all data will be persisted
-            persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory
-            write_metadata (bool, optional): Whether to persist metadata of the storages as well
-        """
-        self._local_data_directory = local_data_directory or os.getenv(ApifyEnvVars.LOCAL_STORAGE_DIR) or './storage'
-        self._datasets_directory = os.path.join(self._local_data_directory, 'datasets')
-        self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores')
-        self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues')
-        self._write_metadata = write_metadata if write_metadata is not None else '*' in os.getenv('DEBUG', '')
-        self._persist_storage = persist_storage if persist_storage is not None else maybe_parse_bool(os.getenv(ApifyEnvVars.PERSIST_STORAGE, 'true'))
-        self._datasets_handled = []
-        self._key_value_stores_handled = []
-        self._request_queues_handled = []
-        self._purge_lock = asyncio.Lock()
-
-    def datasets(self: MemoryStorageClient) -> DatasetCollectionClient:
-        """Retrieve the sub-client for manipulating datasets."""
-        return DatasetCollectionClient(base_storage_directory=self._datasets_directory, memory_storage_client=self)
-
-    def dataset(self: MemoryStorageClient, dataset_id: str) -> DatasetClient:
-        """Retrieve the sub-client for manipulating a single dataset.
-
-        Args:
-            dataset_id (str): ID of the dataset to be manipulated
-        """
-        return DatasetClient(base_storage_directory=self._datasets_directory, memory_storage_client=self, id=dataset_id)
-
-    def key_value_stores(self: MemoryStorageClient) -> KeyValueStoreCollectionClient:
-        """Retrieve the sub-client for manipulating key-value stores."""
-        return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self)
-
-    def key_value_store(self: MemoryStorageClient, key_value_store_id: str) -> KeyValueStoreClient:
-        """Retrieve the sub-client for manipulating a single key-value store.
-
-        Args:
-            key_value_store_id (str): ID of the key-value store to be manipulated
-        """
-        return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self, id=key_value_store_id)
-
-    def request_queues(self: MemoryStorageClient) -> RequestQueueCollectionClient:
-        """Retrieve the sub-client for manipulating request queues."""
-        return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self)
-
-    def request_queue(
-        self: MemoryStorageClient,
-        request_queue_id: str,
-        *,
-        client_key: str | None = None,  # noqa: ARG002
-    ) -> RequestQueueClient:
-        """Retrieve the sub-client for manipulating a single request queue.
-
-        Args:
-            request_queue_id (str): ID of the request queue to be manipulated
-            client_key (str): A unique identifier of the client accessing the request queue
-        """
-        return RequestQueueClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self, id=request_queue_id)
-
-    async def _purge_on_start(self: MemoryStorageClient) -> None:
-        # Optimistic, non-blocking check
-        if self._purged_on_start is True:
-            return
-
-        async with self._purge_lock:
-            # Another check under the lock just to be sure
-            if self._purged_on_start is True:
-                return  # type: ignore[unreachable] # Mypy doesn't understand that the _purged_on_start can change while we're getting the async lock
-
-            await self._purge()
-            self._purged_on_start = True
-
-    async def _purge(self: MemoryStorageClient) -> None:
-        """Clean up the default storage directories before the run starts.
-
-        Specifically, `purge` cleans up:
-         - local directory containing the default dataset;
-         - all records from the default key-value store in the local directory, except for the "INPUT" key;
-         - local directory containing the default request queue.
-        """
-        # Key-value stores
-        if await ospath.exists(self._key_value_stores_directory):
-            key_value_store_folders = await scandir(self._key_value_stores_directory)
-            for key_value_store_folder in key_value_store_folders:
-                if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'):
-                    await self._batch_remove_files(key_value_store_folder.path)
-                elif key_value_store_folder.name == 'default':
-                    await self._handle_default_key_value_store(key_value_store_folder.path)
-
-        # Datasets
-        if await ospath.exists(self._datasets_directory):
-            dataset_folders = await scandir(self._datasets_directory)
-            for dataset_folder in dataset_folders:
-                if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'):
-                    await self._batch_remove_files(dataset_folder.path)
-        # Request queues
-        if await ospath.exists(self._request_queues_directory):
-            request_queue_folders = await scandir(self._request_queues_directory)
-            for request_queue_folder in request_queue_folders:
-                if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'):
-                    await self._batch_remove_files(request_queue_folder.path)
-
-    async def _handle_default_key_value_store(self: MemoryStorageClient, folder: str) -> None:
-        """Remove everything from the default key-value store folder except `possible_input_keys`."""
-        folder_exists = await ospath.exists(folder)
-        temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__'))
-
-        # For optimization, we want to only attempt to copy a few files from the default key-value store
-        possible_input_keys = [
-            'INPUT',
-            'INPUT.json',
-            'INPUT.bin',
-            'INPUT.txt',
-        ]
-
-        if folder_exists:
-            # Create a temporary folder to save important files in
-            Path(temporary_path).mkdir(parents=True, exist_ok=True)
-
-            # Go through each file and save the ones that are important
-            for entity in possible_input_keys:
-                original_file_path = os.path.join(folder, entity)
-                temp_file_path = os.path.join(temporary_path, entity)
-                with contextlib.suppress(Exception):
-                    await rename(original_file_path, temp_file_path)
-
-            # Remove the original folder and all its content
-            counter = 0
-            temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__'))
-            done = False
-            try:
-                while not done:
-                    await rename(folder, temp_path_for_old_folder)
-                    done = True
-            except Exception:
-                counter += 1
-                temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__'))
-
-            # Replace the temporary folder with the original folder
-            await rename(temporary_path, folder)
-
-            # Remove the old folder
-            await self._batch_remove_files(temp_path_for_old_folder)
-
-    async def _batch_remove_files(self: MemoryStorageClient, folder: str, counter: int = 0) -> None:
-        folder_exists = await ospath.exists(folder)
-
-        if folder_exists:
-            temporary_folder = (
-                folder
-                if os.path.basename(folder).startswith('__APIFY_TEMPORARY_')
-                else os.path.normpath(os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__'))
-            )
-
-            try:
-                # Rename the old folder to the new one to allow background deletions
-                await rename(folder, temporary_folder)
-            except Exception:
-                # Folder exists already, try again with an incremented counter
-                return await self._batch_remove_files(folder, counter + 1)
-
-            await aioshutil.rmtree(temporary_folder, ignore_errors=True)
-            return None
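
The removed `_purge_on_start` above is a textbook asyncio double-checked lock: an optimistic fast path, then a re-check under the lock so concurrent callers trigger the purge at most once. A standalone sketch of the same pattern, with hypothetical names (not part of either package version):

```python
import asyncio


class AsyncOnce:
    """Runs an async initializer at most once, even under concurrent callers."""

    def __init__(self) -> None:
        self._done = False
        self._lock = asyncio.Lock()

    async def ensure(self, initializer) -> None:
        # Optimistic, non-blocking check (the fast path).
        if self._done:
            return
        async with self._lock:
            # Re-check under the lock: another task may have won the race
            # while this one was waiting to acquire it.
            if self._done:
                return
            await initializer()
            self._done = True


async def demo() -> None:
    once = AsyncOnce()

    async def purge() -> None:
        print('purging default storages...')

    # Three concurrent callers, exactly one purge.
    await asyncio.gather(*(once.ensure(purge) for _ in range(3)))


asyncio.run(demo())
```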
--- apify/_memory_storage/resource_clients/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from .base_resource_client import BaseResourceClient
-from .base_resource_collection_client import BaseResourceCollectionClient
-from .dataset import DatasetClient
-from .dataset_collection import DatasetCollectionClient
-from .key_value_store import KeyValueStoreClient
-from .key_value_store_collection import KeyValueStoreCollectionClient
-from .request_queue import RequestQueueClient
-from .request_queue_collection import RequestQueueCollectionClient
-
-__all__ = [
-    'BaseResourceClient',
-    'BaseResourceCollectionClient',
-    'DatasetClient',
-    'DatasetCollectionClient',
-    'KeyValueStoreClient',
-    'KeyValueStoreCollectionClient',
-    'RequestQueueClient',
-    'RequestQueueCollectionClient',
-]