apify 3.0.1b1__py3-none-any.whl → 3.0.1b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_configuration.py +9 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +66 -20
- {apify-3.0.1b1.dist-info → apify-3.0.1b2.dist-info}/METADATA +2 -2
- {apify-3.0.1b1.dist-info → apify-3.0.1b2.dist-info}/RECORD +6 -6
- {apify-3.0.1b1.dist-info → apify-3.0.1b2.dist-info}/WHEEL +0 -0
- {apify-3.0.1b1.dist-info → apify-3.0.1b2.dist-info}/licenses/LICENSE +0 -0
apify/_configuration.py
CHANGED
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
from datetime import datetime, timedelta
|
|
4
4
|
from decimal import Decimal
|
|
5
5
|
from logging import getLogger
|
|
6
|
+
from pathlib import Path
|
|
6
7
|
from typing import Annotated, Any
|
|
7
8
|
|
|
8
9
|
from pydantic import AliasChoices, BeforeValidator, Field, model_validator
|
|
@@ -421,6 +422,14 @@ class Configuration(CrawleeConfiguration):
|
|
|
421
422
|
logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.')
|
|
422
423
|
return self
|
|
423
424
|
|
|
425
|
+
@property
def canonical_input_key(self) -> str:
    """Return the configured input key normalized to end with a `.json` suffix.

    The key is handled as a file name via `Path.with_suffix`: a key without a suffix (`INPUT`) gains
    `.json`, while an existing final suffix is replaced (`custom.txt` becomes `custom.json`).
    """
    return str(Path(self.input_key).with_suffix('.json'))
|
|
428
|
+
|
|
429
|
+
@property
def input_key_candidates(self) -> set[str]:
    """Return every key under which the input record may be stored.

    The set contains the configured `input_key`, its canonical `.json` form and the canonical form
    with the suffix stripped. Identical entries collapse, so the set may hold fewer than three keys.
    """
    # Read the canonical key once; it is needed both as a member and to derive the suffix-less key.
    canonical_key = self.canonical_input_key
    return {self.input_key, canonical_key, Path(canonical_key).stem}
|
|
432
|
+
|
|
424
433
|
@classmethod
|
|
425
434
|
def get_global_configuration(cls) -> Configuration:
|
|
426
435
|
"""Retrieve the global instance of the configuration.
|
|
apify/storage_clients/_file_system/_key_value_store_client.py
CHANGED
|
@@ -1,14 +1,19 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import json
|
|
3
|
-
|
|
3
|
+
import logging
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from more_itertools import flatten
|
|
6
|
+
from typing_extensions import Self, override
|
|
6
7
|
|
|
7
8
|
from crawlee._consts import METADATA_FILENAME
|
|
9
|
+
from crawlee.configuration import Configuration as CrawleeConfiguration
|
|
8
10
|
from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
|
|
11
|
+
from crawlee.storage_clients.models import KeyValueStoreRecord
|
|
9
12
|
|
|
10
13
|
from apify._configuration import Configuration
|
|
11
14
|
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
12
17
|
|
|
13
18
|
class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
|
|
14
19
|
"""Apify-specific implementation of the `FileSystemKeyValueStoreClient`.
|
|
@@ -17,6 +22,22 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
|
|
|
17
22
|
directory, except for the metadata file and the `INPUT.json` file.
|
|
18
23
|
"""
|
|
19
24
|
|
|
25
|
+
@override
@classmethod
async def open(
    cls,
    *,
    id: str | None,
    name: str | None,
    alias: str | None,
    configuration: CrawleeConfiguration,
) -> Self:
    """Open the key-value store client and sanitize its input files.

    All arguments are forwarded unchanged to the parent `open`. Before the client is returned, its
    `_sanitize_input_json_files` method is awaited.

    Args:
        id: Forwarded to the parent `open`.
        name: Forwarded to the parent `open`.
        alias: Forwarded to the parent `open`.
        configuration: Forwarded to the parent `open`.

    Returns:
        The opened client.
    """
    client = await super().open(id=id, name=name, alias=alias, configuration=configuration)

    await client._sanitize_input_json_files()  # noqa: SLF001 - it's okay, this is a factory method

    return client
|
|
40
|
+
|
|
20
41
|
@override
|
|
21
42
|
async def purge(self) -> None:
|
|
22
43
|
"""Purges the key-value store by deleting all its contents.
|
|
@@ -24,16 +45,16 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
|
|
|
24
45
|
It deletes all files in the key-value store directory, except for the metadata file and
|
|
25
46
|
the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
|
|
26
47
|
"""
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# First try to find the alternative format of the input file and process it if it exists.
|
|
30
|
-
for file_path in self.path_to_kvs.glob('*'):
|
|
31
|
-
if file_path.name == f'{kvs_input_key}.json':
|
|
32
|
-
await self._process_input_json(file_path)
|
|
48
|
+
configuration = Configuration.get_global_configuration()
|
|
33
49
|
|
|
34
50
|
async with self._lock:
|
|
51
|
+
files_to_keep = set(
|
|
52
|
+
flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates)
|
|
53
|
+
)
|
|
54
|
+
files_to_keep.add(METADATA_FILENAME)
|
|
55
|
+
|
|
35
56
|
for file_path in self.path_to_kvs.glob('*'):
|
|
36
|
-
if file_path.name in
|
|
57
|
+
if file_path.name in files_to_keep:
|
|
37
58
|
continue
|
|
38
59
|
if file_path.is_file():
|
|
39
60
|
await asyncio.to_thread(file_path.unlink, missing_ok=True)
|
|
@@ -43,15 +64,40 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
|
|
|
43
64
|
update_modified_at=True,
|
|
44
65
|
)
|
|
45
66
|
|
|
46
|
-
async def
|
|
47
|
-
"""
|
|
67
|
+
async def _sanitize_input_json_files(self) -> None:
    """Re-save input files found on disk so that the store's records stay consistent with them.

    When the canonical input file exists, its content is parsed and written back through `set_value`,
    and every other input key candidate that also exists as a file is reported as redundant.
    Otherwise each alternative candidate file that exists is parsed and written back under its own key.

    Raises:
        json.JSONDecodeError: If an existing input file does not contain valid JSON.
    """
    configuration = Configuration.get_global_configuration()
    canonical_key = configuration.canonical_input_key
    # Sorted so that processing and warning order do not depend on set iteration order.
    alternative_keys = sorted(configuration.input_key_candidates - {canonical_key})

    def read_json(file_path):  # noqa: ANN001, ANN202 - tiny local helper run in a worker thread
        # Read and parse in one place so that no file handle crosses threads; JSON is decoded as
        # UTF-8 explicitly instead of relying on the platform's default encoding.
        return json.loads(file_path.read_text(encoding='utf-8'))

    canonical_input_file = self.path_to_kvs / canonical_key

    if canonical_input_file.exists():
        # Refresh metadata to prevent inconsistencies
        input_data = await asyncio.to_thread(read_json, canonical_input_file)
        await self.set_value(key=canonical_key, value=input_data)

        for alternative_key in alternative_keys:
            alternative_input_file = self.path_to_kvs / alternative_key
            if alternative_input_file.exists():
                logger.warning('Redundant input file found: %s', alternative_input_file)
        return

    for alternative_key in alternative_keys:
        alternative_input_file = self.path_to_kvs / alternative_key

        # Only process files that actually exist
        if not alternative_input_file.exists():
            continue

        # Refresh metadata to prevent inconsistencies
        input_data = await asyncio.to_thread(read_json, alternative_input_file)
        await self.set_value(key=alternative_key, value=input_data)
|
|
92
|
+
|
|
93
|
+
@override
async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
    """Return the record stored under `key`, treating all input key candidates as aliases.

    When `key` is one of the configuration's input key candidates, the candidates are looked up one by
    one and the first record found is returned. The canonical input key is tried first and the
    remaining candidates follow in sorted order, so the result does not depend on set iteration order
    when several input files exist. Any other key is passed straight to the parent implementation.

    Args:
        key: Key of the record to retrieve.

    Returns:
        The matching record, or `None` when nothing is stored under the key (or any of its aliases).
    """
    configuration = Configuration.get_global_configuration()
    candidates = configuration.input_key_candidates

    if key not in candidates:
        return await super().get_value(key=key)

    canonical_key = configuration.canonical_input_key
    for candidate in (canonical_key, *sorted(candidates - {canonical_key})):
        record = await super().get_value(key=candidate)
        if record is not None:
            return record

    # `key` itself was among the candidates, so there is nothing left to look up.
    return None
|
|
{apify-3.0.1b1.dist-info → apify-3.0.1b2.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: apify
|
|
3
|
-
Version: 3.0.1b1
|
|
3
|
+
Version: 3.0.1b2
|
|
4
4
|
Summary: Apify SDK for Python
|
|
5
5
|
Project-URL: Apify Homepage, https://apify.com
|
|
6
6
|
Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
|
|
@@ -228,7 +228,7 @@ Requires-Python: >=3.10
|
|
|
228
228
|
Requires-Dist: apify-client<3.0.0,>=2.0.0
|
|
229
229
|
Requires-Dist: apify-shared<3.0.0,>=2.0.0
|
|
230
230
|
Requires-Dist: cachetools>=5.5.0
|
|
231
|
-
Requires-Dist: crawlee<2.0.0,>=1.0.
|
|
231
|
+
Requires-Dist: crawlee<2.0.0,>=1.0.2
|
|
232
232
|
Requires-Dist: cryptography>=42.0.0
|
|
233
233
|
Requires-Dist: impit>=0.6.1
|
|
234
234
|
Requires-Dist: lazy-object-proxy>=1.11.0
|
|
{apify-3.0.1b1.dist-info → apify-3.0.1b2.dist-info}/RECORD
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
|
|
2
2
|
apify/_actor.py,sha256=DYHoyBAu6hDLs0BcTZL-IQveLK8gPTWvb6AgDnJc3EA,54755
|
|
3
3
|
apify/_charging.py,sha256=KjZ2DnEMS0Tt8ibizmmt0RwBq8FOAsD1z-hKFgdazcY,13143
|
|
4
|
-
apify/_configuration.py,sha256=
|
|
4
|
+
apify/_configuration.py,sha256=7ZHhgRp98kr35zx4k4EB2aImq7Dq1FJjPg7r5bucv_M,14984
|
|
5
5
|
apify/_consts.py,sha256=CjhyEJ4Mi0lcIrzfqz8dN7nPJWGjCeBrrXQy1PZ6zRI,440
|
|
6
6
|
apify/_crypto.py,sha256=tqUs13QkemDtGzvU41pIA2HUEawpDlgzqbwKjm4I8kM,6852
|
|
7
7
|
apify/_models.py,sha256=EzU-inWeJ7T5HNVYEwnYb79W-q4OAPhtrYctfRYzpTE,7848
|
|
@@ -45,13 +45,13 @@ apify/storage_clients/_apify/_storage_client.py,sha256=hFl_PuX1UgOydBD6pieZ0u2NW
|
|
|
45
45
|
apify/storage_clients/_apify/_utils.py,sha256=ywXoSM69amRokUZcshbAvQLIcSZq4L-bpYIGyeFxCGQ,7696
|
|
46
46
|
apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
47
|
apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
|
|
48
|
-
apify/storage_clients/_file_system/_key_value_store_client.py,sha256=
|
|
48
|
+
apify/storage_clients/_file_system/_key_value_store_client.py,sha256=gxM3ap67PnY80Rd7P3onPAf2pksYpU0LoAlJdayEMdU,4179
|
|
49
49
|
apify/storage_clients/_file_system/_storage_client.py,sha256=rcwpKYlrWzvlSA2xoxftg-EZAi_iGZ3vOCbu0C5lKDE,1396
|
|
50
50
|
apify/storage_clients/_smart_apify/__init__.py,sha256=614B2AaWY-dx6RQ6mod7VVR8gFh75-_jnq5BeDD7hSc,53
|
|
51
51
|
apify/storage_clients/_smart_apify/_storage_client.py,sha256=GCPmVe_xWAFcO2Cuej4su4i97_d33Q9Ih_Sc5xW2Wa4,4674
|
|
52
52
|
apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
|
|
53
53
|
apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
apify-3.0.
|
|
55
|
-
apify-3.0.
|
|
56
|
-
apify-3.0.
|
|
57
|
-
apify-3.0.
|
|
54
|
+
apify-3.0.1b2.dist-info/METADATA,sha256=2NLa54gUwW-FF9Rf6Qc4XQo2JsAVOvPaThJljzT4OZo,22582
|
|
55
|
+
apify-3.0.1b2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
56
|
+
apify-3.0.1b2.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
57
|
+
apify-3.0.1b2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|