apify 1.7.3b3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (61)
  1. apify/__init__.py +19 -4
  2. apify/_actor.py +979 -0
  3. apify/_configuration.py +310 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +29 -27
  6. apify/_models.py +110 -0
  7. apify/_platform_event_manager.py +222 -0
  8. apify/_proxy_configuration.py +316 -0
  9. apify/_utils.py +0 -497
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +56 -0
  12. apify/apify_storage_client/_dataset_client.py +188 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +50 -0
  14. apify/apify_storage_client/_key_value_store_client.py +98 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +50 -0
  16. apify/apify_storage_client/_request_queue_client.py +208 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +50 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +24 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +21 -21
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +1 -1
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +55 -54
  29. apify/scrapy/scheduler.py +19 -13
  30. apify/scrapy/utils.py +2 -31
  31. apify/storages/__init__.py +2 -10
  32. apify/storages/py.typed +0 -0
  33. apify-2.0.0.dist-info/METADATA +209 -0
  34. apify-2.0.0.dist-info/RECORD +37 -0
  35. {apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/WHEEL +1 -2
  36. apify/_memory_storage/__init__.py +0 -3
  37. apify/_memory_storage/file_storage_utils.py +0 -71
  38. apify/_memory_storage/memory_storage_client.py +0 -219
  39. apify/_memory_storage/resource_clients/__init__.py +0 -19
  40. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  41. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  42. apify/_memory_storage/resource_clients/dataset.py +0 -452
  43. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  44. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  45. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  46. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  47. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  48. apify/actor.py +0 -1357
  49. apify/config.py +0 -130
  50. apify/consts.py +0 -67
  51. apify/event_manager.py +0 -236
  52. apify/proxy_configuration.py +0 -365
  53. apify/storages/base_storage.py +0 -181
  54. apify/storages/dataset.py +0 -494
  55. apify/storages/key_value_store.py +0 -257
  56. apify/storages/request_queue.py +0 -602
  57. apify/storages/storage_client_manager.py +0 -72
  58. apify-1.7.3b3.dist-info/METADATA +0 -150
  59. apify-1.7.3b3.dist-info/RECORD +0 -41
  60. apify-1.7.3b3.dist-info/top_level.txt +0 -1
  61. {apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/LICENSE +0 -0
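Taken together, the list shows the shape of the 2.0 rewrite: the public 1.x modules (`apify/actor.py`, `apify/config.py`, `apify/consts.py`, `apify/event_manager.py`, `apify/proxy_configuration.py`) are replaced by underscore-prefixed private counterparts, and the bundled `apify._memory_storage` emulation is deleted in favor of the new `crawlee` dependency declared in the METADATA below. A minimal sketch of what that means for import paths, assuming only what the file list shows (the top-level re-export in `apify/__init__.py` remains the supported surface):

```python
# Works in both 1.7.x and 2.0.0: the top-level package re-exports the public API.
from apify import Actor

# 1.x deep imports that 2.0.0 removes or privatizes (see the file list above):
# from apify.actor import Actor                 # implementation moved to apify/_actor.py
# from apify.config import Configuration        # moved to apify/_configuration.py
# from apify._memory_storage import MemoryStorageClient  # deleted; crawlee now provides local storage
```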
apify-2.0.0.dist-info/METADATA
@@ -0,0 +1,209 @@
+ Metadata-Version: 2.1
+ Name: apify
+ Version: 2.0.0
+ Summary: Apify SDK for Python
+ License: Apache-2.0
+ Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
+ Author: Apify Technologies s.r.o.
+ Author-email: support@apify.com
+ Requires-Python: >=3.9,<4.0
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Software Development :: Libraries
+ Provides-Extra: scrapy
+ Requires-Dist: apify-client (>=1.7.1)
+ Requires-Dist: apify-shared (>=1.1.2)
+ Requires-Dist: crawlee (>=0.3.5)
+ Requires-Dist: cryptography (>=42.0.0)
+ Requires-Dist: httpx (>=0.27.0)
+ Requires-Dist: lazy-object-proxy (>=1.10.0)
+ Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
+ Requires-Dist: typing-extensions (>=4.1.0)
+ Requires-Dist: websockets (>=10.0)
+ Project-URL: Apify Homepage, https://apify.com
+ Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
+ Project-URL: Documentation, https://docs.apify.com/sdk/python/
+ Project-URL: Homepage, https://docs.apify.com/sdk/python/
+ Project-URL: Issue Tracker, https://github.com/apify/apify-sdk-python/issues
+ Project-URL: Repository, https://github.com/apify/apify-sdk-python
+ Description-Content-Type: text/markdown
+
+ # Apify SDK for Python
+
+ The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
+ in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
+ event handling.
+
+ If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
+ check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
+
+ ## Installation
+
+ The Apify SDK for Python is available on PyPI as the `apify` package.
+ For a default installation using pip, run the following:
+
+ ```bash
+ pip install apify
+ ```
+
+ For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
+ To install Apify with the `scrapy` extra, use the following command:
+
+ ```bash
+ pip install apify[scrapy]
+ ```
+
+ ## Documentation
+
+ For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
+
+ ## Examples
+
+ Below are a few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
+
+ ### Apify SDK with HTTPX and BeautifulSoup
+
+ This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
+
+ ```python
+ from apify import Actor
+ from bs4 import BeautifulSoup
+ from httpx import AsyncClient
+
+
+ async def main() -> None:
+     async with Actor:
+         # Retrieve the Actor input, and use default values if not provided.
+         actor_input = await Actor.get_input() or {}
+         start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
+
+         # Open the default request queue for handling URLs to be processed.
+         request_queue = await Actor.open_request_queue()
+
+         # Enqueue the start URLs.
+         for start_url in start_urls:
+             url = start_url.get('url')
+             await request_queue.add_request(url)
+
+         # Process the URLs from the request queue.
+         while request := await request_queue.fetch_next_request():
+             Actor.log.info(f'Scraping {request.url} ...')
+
+             # Fetch the HTTP response from the specified URL using HTTPX.
+             async with AsyncClient() as client:
+                 response = await client.get(request.url)
+
+             # Parse the HTML content using Beautiful Soup.
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Extract the desired data.
+             data = {
+                 'url': request.url,
+                 'title': soup.title.string,
+                 'h1s': [h1.text for h1 in soup.find_all('h1')],
+                 'h2s': [h2.text for h2 in soup.find_all('h2')],
+                 'h3s': [h3.text for h3 in soup.find_all('h3')],
+             }
+
+             # Store the extracted data to the default dataset.
+             await Actor.push_data(data)
+
+             # Mark the request as handled so it is not fetched again.
+             await request_queue.mark_request_as_handled(request)
+ ```
+
+ ### Apify SDK with PlaywrightCrawler from Crawlee
+
+ This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
+
+ ```python
+ from apify import Actor, Request
+ from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+
+
+ async def main() -> None:
+     async with Actor:
+         # Retrieve the Actor input, and use default values if not provided.
+         actor_input = await Actor.get_input() or {}
+         start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
+
+         # Exit if no start URLs are provided.
+         if not start_urls:
+             Actor.log.info('No start URLs specified in Actor input, exiting...')
+             await Actor.exit()
+
+         # Create a crawler.
+         crawler = PlaywrightCrawler(
+             # Limit the crawl to max requests. Remove or increase it for crawling all links.
+             max_requests_per_crawl=50,
+             headless=True,
+         )
+
+         # Define a request handler, which will be called for every request.
+         @crawler.router.default_handler
+         async def request_handler(context: PlaywrightCrawlingContext) -> None:
+             url = context.request.url
+             Actor.log.info(f'Scraping {url}...')
+
+             # Extract the desired data.
+             data = {
+                 'url': context.request.url,
+                 'title': await context.page.title(),
+                 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
+                 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
+                 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+             }
+
+             # Store the extracted data to the default dataset.
+             await context.push_data(data)
+
+             # Enqueue additional links found on the current page.
+             await context.enqueue_links()
+
+         # Run the crawler with the starting URLs.
+         await crawler.run(start_urls)
+ ```
+
+ ## What are Actors?
+
+ Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
+ They can do anything from small tasks such as filling in forms or unsubscribing from online services,
+ all the way up to scraping and processing vast numbers of web pages.
+
+ They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
+ where you can run them at scale, monitor them, schedule them, or publish and monetize them.
+
+ If you're new to Apify, learn [what Apify is](https://docs.apify.com/platform/about)
+ in the Apify platform documentation.
+
+ ## Creating Actors
+
+ To create and run Actors through Apify Console,
+ see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).
+
+ To create and run Python Actors locally, check the documentation for
+ [how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).
+
+ ## Guides
+
+ To see how you can use the Apify SDK with other popular libraries used for web scraping,
+ check out our guides for using
+ [Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
+ [Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
+ [Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
+ [Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
+ or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).
+
+ ## Usage concepts
+
+ To learn more about the features of the Apify SDK and how to use them,
+ check out the Usage Concepts section in the sidebar,
+ particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
+ [working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
+ [handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
+ or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
+
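Both README examples above define an `async def main()` coroutine but leave out the entry point, which Apify's project templates generate separately. When running such a script on its own, a minimal runner (our assumption for illustration, not part of the packaged README) could look like this:

```python
import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        Actor.log.info('Hello from the Actor!')


# In Apify's Python templates an equivalent call typically lives in the
# project's __main__.py; it is shown here only so the README examples can
# run as standalone scripts.
if __name__ == '__main__':
    asyncio.run(main())
```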
apify-2.0.0.dist-info/RECORD
@@ -0,0 +1,37 @@
+ apify/__init__.py,sha256=ikoi2EpDYl6y-XSVtlU8UsdQdMEyOiIJCRRAaZFDOP8,550
+ apify/_actor.py,sha256=oPgQ3rxxIEzVcZ9XtI3lf1a_6gwIMgxihNuYGjJpGww,41816
+ apify/_configuration.py,sha256=gf7YOun32Whc9DamhoWDLmcUeNwtWVmmBPrl4oq6s4I,8997
+ apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
+ apify/_crypto.py,sha256=b4Czs1NLPkaNzkPjovObjSIbsKnRrgtBkM9JvOysUMA,5612
+ apify/_models.py,sha256=oYlTEr-DyQAE-V2rrYD5PhUxTXVPdAig7QV-u6CJw3E,5571
+ apify/_platform_event_manager.py,sha256=h5fBmXtKD4t-yCdOSiLM1-DnCrIbGEmYmz2mOU3A8bA,7627
+ apify/_proxy_configuration.py,sha256=VdKh_AyCwaCUlpCyaCe30L2S9OZ-vL1SN1g8oLwSeYA,13074
+ apify/_utils.py,sha256=x4lnR9RNulySiEQTft-GeQqUcJsRr0k8p0Sv9NTeWFg,638
+ apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
+ apify/apify_storage_client/_apify_storage_client.py,sha256=xi4OFchxhe-1-sykanH6Zcya4OcBhn2uf7OQ1pV4Ins,2338
+ apify/apify_storage_client/_dataset_client.py,sha256=j9seF2OKvbSMD9R9XF9fpa1vtr_1w4JcRV--WCmvU4E,5501
+ apify/apify_storage_client/_dataset_collection_client.py,sha256=fkYvYGQCigHD2CDzpWk0swNAkfvAinAhMGpYqllle3E,1445
+ apify/apify_storage_client/_key_value_store_client.py,sha256=uyeQgb75sGFsqIS4sq4hEZ3QP81COLfS3tmTqHc0tso,3340
+ apify/apify_storage_client/_key_value_store_collection_client.py,sha256=vCtMTI-jx89Qp5WHILDNkCthwLuv0MAwm1J_5E4aypU,1519
+ apify/apify_storage_client/_request_queue_client.py,sha256=P8ws8jEzi2PWpp-cvYfV7kwuKbgH813BpNQ_wMSVtTA,6278
+ apify/apify_storage_client/_request_queue_collection_client.py,sha256=NnO73UJ9ZrjV8xoudo30wfaM-SojRkG0guhxDyB-K1g,1527
+ apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify/log.py,sha256=pX6ppIvds8OKqjFpIcshqG4zp_5DiOUU31ksyfSExto,1392
+ apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify/scrapy/__init__.py,sha256=qDPV_zTRFaUqoFOyS5g4uBfz-UCkmWYJ82VXQ_3Cw6k,348
+ apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
+ apify/scrapy/middlewares/apify_proxy.py,sha256=_1WO7NKHxIcPf8mSNjsqANTEsx7ygMTuRQW9fbwKMO8,5837
+ apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
+ apify/scrapy/pipelines/actor_dataset_push.py,sha256=QERmmExQOGIKQ70-p-lCj5qyE-c-fnYplEqd4mgaB1Q,953
+ apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify/scrapy/requests.py,sha256=pmm2M-cwrTXyI3t1nRBo9pS6nHfc4zkzS25-NXxzd9I,7637
+ apify/scrapy/scheduler.py,sha256=AAIKY5i1QxkC1mtmix6n3M2eQaOw-d1T56Noue9xToc,6013
+ apify/scrapy/utils.py,sha256=tz_Y8CTqe6KbyMMhLF3m7qqR46jtNH5U7Ty7e19roPU,2814
+ apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
+ apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify-2.0.0.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+ apify-2.0.0.dist-info/METADATA,sha256=DhojQDiiwKEwS7VcAufA7ERVHYHKk5mqHFtddWXL4Qk,8604
+ apify-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ apify-2.0.0.dist-info/RECORD,,
{apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/WHEEL
@@ -1,5 +1,4 @@
  Wheel-Version: 1.0
- Generator: setuptools (70.3.0)
+ Generator: poetry-core 1.9.0
  Root-Is-Purelib: true
  Tag: py3-none-any
-
apify/_memory_storage/__init__.py
@@ -1,3 +0,0 @@
- from .memory_storage_client import MemoryStorageClient
-
- __all__ = ['MemoryStorageClient']
apify/_memory_storage/file_storage_utils.py
@@ -1,71 +0,0 @@
- from __future__ import annotations
-
- import os
-
- import aiofiles
- from aiofiles.os import makedirs
- from apify_shared.utils import json_dumps
-
- from apify._utils import force_remove
-
-
- async def update_metadata(*, data: dict, entity_directory: str, write_metadata: bool) -> None:
-     # Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present
-     if not write_metadata:
-         return
-
-     # Ensure the directory for the entity exists
-     await makedirs(entity_directory, exist_ok=True)
-
-     # Write the metadata to the file
-     file_path = os.path.join(entity_directory, '__metadata__.json')
-     async with aiofiles.open(file_path, mode='wb') as f:
-         await f.write(json_dumps(data).encode('utf-8'))
-
-
- async def _update_dataset_items(
-     *,
-     data: list[tuple[str, dict]],
-     entity_directory: str,
-     persist_storage: bool,
- ) -> None:
-     # Skip writing files to the disk if the client has the option set to false
-     if not persist_storage:
-         return
-
-     # Ensure the directory for the entity exists
-     await makedirs(entity_directory, exist_ok=True)
-
-     # Save all the new items to the disk
-     for idx, item in data:
-         file_path = os.path.join(entity_directory, f'{idx}.json')
-         async with aiofiles.open(file_path, mode='wb') as f:
-             await f.write(json_dumps(item).encode('utf-8'))
-
-
- async def update_request_queue_item(
-     *,
-     request_id: str,
-     request: dict,
-     entity_directory: str,
-     persist_storage: bool,
- ) -> None:
-     # Skip writing files to the disk if the client has the option set to false
-     if not persist_storage:
-         return
-
-     # Ensure the directory for the entity exists
-     await makedirs(entity_directory, exist_ok=True)
-
-     # Write the request to the file
-     file_path = os.path.join(entity_directory, f'{request_id}.json')
-     async with aiofiles.open(file_path, mode='wb') as f:
-         await f.write(json_dumps(request).encode('utf-8'))
-
-
- async def delete_request(*, request_id: str, entity_directory: str) -> None:
-     # Ensure the directory for the entity exists
-     await makedirs(entity_directory, exist_ok=True)
-
-     file_path = os.path.join(entity_directory, f'{request_id}.json')
-     await force_remove(file_path)
apify/_memory_storage/memory_storage_client.py
@@ -1,219 +0,0 @@
- from __future__ import annotations
-
- import asyncio
- import contextlib
- import os
- from pathlib import Path
-
- import aioshutil
- from aiofiles import ospath
- from aiofiles.os import rename, scandir
- from apify_shared.consts import ApifyEnvVars
- from apify_shared.utils import ignore_docs
-
- from apify._memory_storage.resource_clients.dataset import DatasetClient
- from apify._memory_storage.resource_clients.dataset_collection import DatasetCollectionClient
- from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient
- from apify._memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient
- from apify._memory_storage.resource_clients.request_queue import RequestQueueClient
- from apify._memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient
- from apify._utils import maybe_parse_bool
-
- """
- Memory storage emulates data storages that are available on the Apify platform.
- Specifically, it emulates clients for datasets, key-value stores and request queues.
- The data are held in-memory and persisted locally if `persist_storage` is True.
- The metadata of the storages is also persisted if `write_metadata` is True.
- """
-
-
- @ignore_docs
- class MemoryStorageClient:
-     """Class representing an in-memory storage."""
-
-     _local_data_directory: str
-     _datasets_directory: str
-     _key_value_stores_directory: str
-     _request_queues_directory: str
-     _write_metadata: bool
-     _persist_storage: bool
-     _datasets_handled: list[DatasetClient]
-     _key_value_stores_handled: list[KeyValueStoreClient]
-     _request_queues_handled: list[RequestQueueClient]
-
-     _purged_on_start: bool = False
-     _purge_lock: asyncio.Lock
-
-     """Indicates whether a purge was already performed on this instance"""
-
-     def __init__(
-         self: MemoryStorageClient,
-         *,
-         local_data_directory: str | None = None,
-         write_metadata: bool | None = None,
-         persist_storage: bool | None = None,
-     ) -> None:
-         """Initialize the MemoryStorageClient.
-
-         Args:
-             local_data_directory (str, optional): A local directory where all data will be persisted
-             persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory
-             write_metadata (bool, optional): Whether to persist metadata of the storages as well
-         """
-         self._local_data_directory = local_data_directory or os.getenv(ApifyEnvVars.LOCAL_STORAGE_DIR) or './storage'
-         self._datasets_directory = os.path.join(self._local_data_directory, 'datasets')
-         self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores')
-         self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues')
-         self._write_metadata = write_metadata if write_metadata is not None else '*' in os.getenv('DEBUG', '')
-         self._persist_storage = persist_storage if persist_storage is not None else maybe_parse_bool(os.getenv(ApifyEnvVars.PERSIST_STORAGE, 'true'))
-         self._datasets_handled = []
-         self._key_value_stores_handled = []
-         self._request_queues_handled = []
-         self._purge_lock = asyncio.Lock()
-
-     def datasets(self: MemoryStorageClient) -> DatasetCollectionClient:
-         """Retrieve the sub-client for manipulating datasets."""
-         return DatasetCollectionClient(base_storage_directory=self._datasets_directory, memory_storage_client=self)
-
-     def dataset(self: MemoryStorageClient, dataset_id: str) -> DatasetClient:
-         """Retrieve the sub-client for manipulating a single dataset.
-
-         Args:
-             dataset_id (str): ID of the dataset to be manipulated
-         """
-         return DatasetClient(base_storage_directory=self._datasets_directory, memory_storage_client=self, id=dataset_id)
-
-     def key_value_stores(self: MemoryStorageClient) -> KeyValueStoreCollectionClient:
-         """Retrieve the sub-client for manipulating key-value stores."""
-         return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self)
-
-     def key_value_store(self: MemoryStorageClient, key_value_store_id: str) -> KeyValueStoreClient:
-         """Retrieve the sub-client for manipulating a single key-value store.
-
-         Args:
-             key_value_store_id (str): ID of the key-value store to be manipulated
-         """
-         return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self, id=key_value_store_id)
-
-     def request_queues(self: MemoryStorageClient) -> RequestQueueCollectionClient:
-         """Retrieve the sub-client for manipulating request queues."""
-         return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self)
-
-     def request_queue(
-         self: MemoryStorageClient,
-         request_queue_id: str,
-         *,
-         client_key: str | None = None,  # noqa: ARG002
-     ) -> RequestQueueClient:
-         """Retrieve the sub-client for manipulating a single request queue.
-
-         Args:
-             request_queue_id (str): ID of the request queue to be manipulated
-             client_key (str): A unique identifier of the client accessing the request queue
-         """
-         return RequestQueueClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self, id=request_queue_id)
-
-     async def _purge_on_start(self: MemoryStorageClient) -> None:
-         # Optimistic, non-blocking check
-         if self._purged_on_start is True:
-             return
-
-         async with self._purge_lock:
-             # Another check under the lock just to be sure
-             if self._purged_on_start is True:
-                 return  # type: ignore[unreachable] # Mypy doesn't understand that the _purged_on_start can change while we're getting the async lock
-
-             await self._purge()
-             self._purged_on_start = True
-
-     async def _purge(self: MemoryStorageClient) -> None:
-         """Clean up the default storage directories before the run starts.
-
-         Specifically, `purge` cleans up:
-          - local directory containing the default dataset;
-          - all records from the default key-value store in the local directory, except for the "INPUT" key;
-          - local directory containing the default request queue.
-         """
-         # Key-value stores
-         if await ospath.exists(self._key_value_stores_directory):
-             key_value_store_folders = await scandir(self._key_value_stores_directory)
-             for key_value_store_folder in key_value_store_folders:
-                 if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'):
-                     await self._batch_remove_files(key_value_store_folder.path)
-                 elif key_value_store_folder.name == 'default':
-                     await self._handle_default_key_value_store(key_value_store_folder.path)
-
-         # Datasets
-         if await ospath.exists(self._datasets_directory):
-             dataset_folders = await scandir(self._datasets_directory)
-             for dataset_folder in dataset_folders:
-                 if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'):
-                     await self._batch_remove_files(dataset_folder.path)
-         # Request queues
-         if await ospath.exists(self._request_queues_directory):
-             request_queue_folders = await scandir(self._request_queues_directory)
-             for request_queue_folder in request_queue_folders:
-                 if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'):
-                     await self._batch_remove_files(request_queue_folder.path)
-
-     async def _handle_default_key_value_store(self: MemoryStorageClient, folder: str) -> None:
-         """Remove everything from the default key-value store folder except `possible_input_keys`."""
-         folder_exists = await ospath.exists(folder)
-         temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__'))
-
-         # For optimization, we want to only attempt to copy a few files from the default key-value store
-         possible_input_keys = [
-             'INPUT',
-             'INPUT.json',
-             'INPUT.bin',
-             'INPUT.txt',
-         ]
-
-         if folder_exists:
-             # Create a temporary folder to save important files in
-             Path(temporary_path).mkdir(parents=True, exist_ok=True)
-
-             # Go through each file and save the ones that are important
-             for entity in possible_input_keys:
-                 original_file_path = os.path.join(folder, entity)
-                 temp_file_path = os.path.join(temporary_path, entity)
-                 with contextlib.suppress(Exception):
-                     await rename(original_file_path, temp_file_path)
-
-             # Remove the original folder and all its content
-             counter = 0
-             temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__'))
-             done = False
-             try:
-                 while not done:
-                     await rename(folder, temp_path_for_old_folder)
-                     done = True
-             except Exception:
-                 counter += 1
-                 temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__'))
-
-             # Replace the temporary folder with the original folder
-             await rename(temporary_path, folder)
-
-             # Remove the old folder
-             await self._batch_remove_files(temp_path_for_old_folder)
-
-     async def _batch_remove_files(self: MemoryStorageClient, folder: str, counter: int = 0) -> None:
-         folder_exists = await ospath.exists(folder)
-
-         if folder_exists:
-             temporary_folder = (
-                 folder
-                 if os.path.basename(folder).startswith('__APIFY_TEMPORARY_')
-                 else os.path.normpath(os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__'))
-             )
-
-             try:
-                 # Rename the old folder to the new one to allow background deletions
-                 await rename(folder, temporary_folder)
-             except Exception:
-                 # Folder exists already, try again with an incremented counter
-                 return await self._batch_remove_files(folder, counter + 1)
-
-             await aioshutil.rmtree(temporary_folder, ignore_errors=True)
-         return None
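Note the `_purge_on_start` method being deleted above: it guards the one-time purge with an optimistic flag check followed by a re-check under an `asyncio.Lock`, i.e. async double-checked locking. The pattern in isolation (a generic sketch, not part of either package version):

```python
import asyncio
from collections.abc import Awaitable, Callable


class RunOnce:
    """Run an async initializer at most once, even with concurrent callers."""

    def __init__(self) -> None:
        self._done = False
        self._lock = asyncio.Lock()

    async def ensure(self, init: Callable[[], Awaitable[None]]) -> None:
        if self._done:  # optimistic, lock-free fast path
            return
        async with self._lock:
            if self._done:  # re-check: another task may have finished while we waited
                return
            await init()
            self._done = True
```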
apify/_memory_storage/resource_clients/__init__.py
@@ -1,19 +0,0 @@
- from .base_resource_client import BaseResourceClient
- from .base_resource_collection_client import BaseResourceCollectionClient
- from .dataset import DatasetClient
- from .dataset_collection import DatasetCollectionClient
- from .key_value_store import KeyValueStoreClient
- from .key_value_store_collection import KeyValueStoreCollectionClient
- from .request_queue import RequestQueueClient
- from .request_queue_collection import RequestQueueCollectionClient
-
- __all__ = [
-     'BaseResourceClient',
-     'BaseResourceCollectionClient',
-     'DatasetClient',
-     'DatasetCollectionClient',
-     'KeyValueStoreClient',
-     'KeyValueStoreCollectionClient',
-     'RequestQueueClient',
-     'RequestQueueCollectionClient',
- ]
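The resource-client layer deleted above has no direct replacement inside the `apify` package in 2.0; the equivalent day-to-day operations go through the documented `Actor` helpers, now backed by `crawlee` storages. A hedged sketch of the 2.0-era calls (the helper names are the public `Actor` API, not the removed client methods):

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Storages are opened via Actor helpers instead of MemoryStorageClient sub-clients.
        dataset = await Actor.open_dataset()
        store = await Actor.open_key_value_store()
        queue = await Actor.open_request_queue()

        await dataset.push_data({'example': 'item'})
        await store.set_value('OUTPUT', {'done': True})
        await queue.add_request('https://apify.com')
```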