apify 3.0.1b1__py3-none-any.whl → 3.0.1b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

apify/_configuration.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from datetime import datetime, timedelta
4
4
  from decimal import Decimal
5
5
  from logging import getLogger
6
+ from pathlib import Path
6
7
  from typing import Annotated, Any
7
8
 
8
9
  from pydantic import AliasChoices, BeforeValidator, Field, model_validator
@@ -421,6 +422,14 @@ class Configuration(CrawleeConfiguration):
421
422
  logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.')
422
423
  return self
423
424
 
425
+ @property
426
+ def canonical_input_key(self) -> str:
427
+ return str(Path(self.input_key).with_suffix('.json'))
428
+
429
+ @property
430
+ def input_key_candidates(self) -> set[str]:
431
+ return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).stem}
432
+
424
433
  @classmethod
425
434
  def get_global_configuration(cls) -> Configuration:
426
435
  """Retrieve the global instance of the configuration.
apify/storage_clients/_file_system/_key_value_store_client.py CHANGED
@@ -1,14 +1,19 @@
1
1
  import asyncio
2
2
  import json
3
- from pathlib import Path
3
+ import logging
4
4
 
5
- from typing_extensions import override
5
+ from more_itertools import flatten
6
+ from typing_extensions import Self, override
6
7
 
7
8
  from crawlee._consts import METADATA_FILENAME
9
+ from crawlee.configuration import Configuration as CrawleeConfiguration
8
10
  from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
11
+ from crawlee.storage_clients.models import KeyValueStoreRecord
9
12
 
10
13
  from apify._configuration import Configuration
11
14
 
15
+ logger = logging.getLogger(__name__)
16
+
12
17
 
13
18
  class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
14
19
  """Apify-specific implementation of the `FileSystemKeyValueStoreClient`.
@@ -17,6 +22,22 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
17
22
  directory, except for the metadata file and the `INPUT.json` file.
18
23
  """
19
24
 
25
+ @override
26
+ @classmethod
27
+ async def open(
28
+ cls,
29
+ *,
30
+ id: str | None,
31
+ name: str | None,
32
+ alias: str | None,
33
+ configuration: CrawleeConfiguration,
34
+ ) -> Self:
35
+ client = await super().open(id=id, name=name, alias=alias, configuration=configuration)
36
+
37
+ await client._sanitize_input_json_files() # noqa: SLF001 - it's okay, this is a factory method
38
+
39
+ return client
40
+
20
41
  @override
21
42
  async def purge(self) -> None:
22
43
  """Purges the key-value store by deleting all its contents.
@@ -24,16 +45,16 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
24
45
  It deletes all files in the key-value store directory, except for the metadata file and
25
46
  the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
26
47
  """
27
- kvs_input_key = Configuration.get_global_configuration().input_key
28
-
29
- # First try to find the alternative format of the input file and process it if it exists.
30
- for file_path in self.path_to_kvs.glob('*'):
31
- if file_path.name == f'{kvs_input_key}.json':
32
- await self._process_input_json(file_path)
48
+ configuration = Configuration.get_global_configuration()
33
49
 
34
50
  async with self._lock:
51
+ files_to_keep = set(
52
+ flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates)
53
+ )
54
+ files_to_keep.add(METADATA_FILENAME)
55
+
35
56
  for file_path in self.path_to_kvs.glob('*'):
36
- if file_path.name in {METADATA_FILENAME, kvs_input_key, f'{kvs_input_key}.{METADATA_FILENAME}'}:
57
+ if file_path.name in files_to_keep:
37
58
  continue
38
59
  if file_path.is_file():
39
60
  await asyncio.to_thread(file_path.unlink, missing_ok=True)
@@ -43,15 +64,40 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
43
64
  update_modified_at=True,
44
65
  )
45
66
 
46
- async def _process_input_json(self, path: Path) -> None:
47
- """Process simple input json file to format expected by the FileSystemKeyValueStoreClient.
67
+ async def _sanitize_input_json_files(self) -> None:
68
+ """Handle missing metadata for input files."""
69
+ configuration = Configuration.get_global_configuration()
70
+ alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key}
48
71
 
49
- For example: INPUT.json -> INPUT, INPUT.json.metadata
50
- """
51
- try:
52
- f = await asyncio.to_thread(path.open)
53
- input_data = json.load(f)
54
- finally:
55
- f.close()
56
- await asyncio.to_thread(path.unlink, missing_ok=True)
57
- await self.set_value(key=path.stem, value=input_data)
72
+ if (self.path_to_kvs / configuration.canonical_input_key).exists():
73
+ # Refresh metadata to prevent inconsistencies
74
+ input_data = await asyncio.to_thread(
75
+ lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text())
76
+ )
77
+ await self.set_value(key=configuration.canonical_input_key, value=input_data)
78
+
79
+ for alternative_key in alternative_keys:
80
+ if (alternative_input_file := self.path_to_kvs / alternative_key).exists():
81
+ logger.warning(f'Redundant input file found: {alternative_input_file}')
82
+ else:
83
+ for alternative_key in alternative_keys:
84
+ alternative_input_file = self.path_to_kvs / alternative_key
85
+
86
+ # Only process files that actually exist
87
+ if alternative_input_file.exists():
88
+ # Refresh metadata to prevent inconsistencies
89
+ with alternative_input_file.open() as f:
90
+ input_data = await asyncio.to_thread(lambda: json.load(f))
91
+ await self.set_value(key=alternative_key, value=input_data)
92
+
93
+ @override
94
+ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
95
+ configuration = Configuration.get_global_configuration()
96
+
97
+ if key in configuration.input_key_candidates:
98
+ for candidate in configuration.input_key_candidates:
99
+ value = await super().get_value(key=candidate)
100
+ if value is not None:
101
+ return value
102
+
103
+ return await super().get_value(key=key)
apify-3.0.1b2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 3.0.1b1
3
+ Version: 3.0.1b2
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -228,7 +228,7 @@ Requires-Python: >=3.10
228
228
  Requires-Dist: apify-client<3.0.0,>=2.0.0
229
229
  Requires-Dist: apify-shared<3.0.0,>=2.0.0
230
230
  Requires-Dist: cachetools>=5.5.0
231
- Requires-Dist: crawlee<2.0.0,>=1.0.0
231
+ Requires-Dist: crawlee<2.0.0,>=1.0.2
232
232
  Requires-Dist: cryptography>=42.0.0
233
233
  Requires-Dist: impit>=0.6.1
234
234
  Requires-Dist: lazy-object-proxy>=1.11.0
apify-3.0.1b2.dist-info/RECORD CHANGED
@@ -1,7 +1,7 @@
1
1
  apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
2
2
  apify/_actor.py,sha256=DYHoyBAu6hDLs0BcTZL-IQveLK8gPTWvb6AgDnJc3EA,54755
3
3
  apify/_charging.py,sha256=KjZ2DnEMS0Tt8ibizmmt0RwBq8FOAsD1z-hKFgdazcY,13143
4
- apify/_configuration.py,sha256=gq_UfWTgcP1_0kEMLhXVg33SgSxXjShbuzoXyCFfK0w,14682
4
+ apify/_configuration.py,sha256=7ZHhgRp98kr35zx4k4EB2aImq7Dq1FJjPg7r5bucv_M,14984
5
5
  apify/_consts.py,sha256=CjhyEJ4Mi0lcIrzfqz8dN7nPJWGjCeBrrXQy1PZ6zRI,440
6
6
  apify/_crypto.py,sha256=tqUs13QkemDtGzvU41pIA2HUEawpDlgzqbwKjm4I8kM,6852
7
7
  apify/_models.py,sha256=EzU-inWeJ7T5HNVYEwnYb79W-q4OAPhtrYctfRYzpTE,7848
@@ -45,13 +45,13 @@ apify/storage_clients/_apify/_storage_client.py,sha256=hFl_PuX1UgOydBD6pieZ0u2NW
45
45
  apify/storage_clients/_apify/_utils.py,sha256=ywXoSM69amRokUZcshbAvQLIcSZq4L-bpYIGyeFxCGQ,7696
46
46
  apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
48
- apify/storage_clients/_file_system/_key_value_store_client.py,sha256=fnSJ1EIOPCGfcE6e5S3Tux9VbnMVLCJjugkaQoH_9yo,2267
48
+ apify/storage_clients/_file_system/_key_value_store_client.py,sha256=gxM3ap67PnY80Rd7P3onPAf2pksYpU0LoAlJdayEMdU,4179
49
49
  apify/storage_clients/_file_system/_storage_client.py,sha256=rcwpKYlrWzvlSA2xoxftg-EZAi_iGZ3vOCbu0C5lKDE,1396
50
50
  apify/storage_clients/_smart_apify/__init__.py,sha256=614B2AaWY-dx6RQ6mod7VVR8gFh75-_jnq5BeDD7hSc,53
51
51
  apify/storage_clients/_smart_apify/_storage_client.py,sha256=GCPmVe_xWAFcO2Cuej4su4i97_d33Q9Ih_Sc5xW2Wa4,4674
52
52
  apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
53
53
  apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
- apify-3.0.1b1.dist-info/METADATA,sha256=Qy-fnT_4BnuEpoIhk_Aa0vIl6GVQtkqkk8diacKkzA0,22582
55
- apify-3.0.1b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
- apify-3.0.1b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
57
- apify-3.0.1b1.dist-info/RECORD,,
54
+ apify-3.0.1b2.dist-info/METADATA,sha256=2NLa54gUwW-FF9Rf6Qc4XQo2JsAVOvPaThJljzT4OZo,22582
55
+ apify-3.0.1b2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
+ apify-3.0.1b2.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
57
+ apify-3.0.1b2.dist-info/RECORD,,