apify 1.7.3b3__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/__init__.py +19 -4
- apify/_actor.py +979 -0
- apify/_configuration.py +310 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +29 -27
- apify/_models.py +110 -0
- apify/_platform_event_manager.py +222 -0
- apify/_proxy_configuration.py +316 -0
- apify/_utils.py +0 -497
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +56 -0
- apify/apify_storage_client/_dataset_client.py +188 -0
- apify/apify_storage_client/_dataset_collection_client.py +50 -0
- apify/apify_storage_client/_key_value_store_client.py +98 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +50 -0
- apify/apify_storage_client/_request_queue_client.py +208 -0
- apify/apify_storage_client/_request_queue_collection_client.py +50 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +24 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +21 -21
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +1 -1
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +55 -54
- apify/scrapy/scheduler.py +19 -13
- apify/scrapy/utils.py +2 -31
- apify/storages/__init__.py +2 -10
- apify/storages/py.typed +0 -0
- apify-2.0.0.dist-info/METADATA +209 -0
- apify-2.0.0.dist-info/RECORD +37 -0
- {apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1357
- apify/config.py +0 -130
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.3b3.dist-info/METADATA +0 -150
- apify-1.7.3b3.dist-info/RECORD +0 -41
- apify-1.7.3b3.dist-info/top_level.txt +0 -1
- {apify-1.7.3b3.dist-info → apify-2.0.0.dist-info}/LICENSE +0 -0
|
@@ -1,141 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import os
|
|
5
|
-
from abc import ABC, abstractmethod
|
|
6
|
-
from typing import TYPE_CHECKING
|
|
7
|
-
|
|
8
|
-
from apify_shared.utils import ignore_docs
|
|
9
|
-
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from typing_extensions import Self
|
|
12
|
-
|
|
13
|
-
from apify._memory_storage.memory_storage_client import MemoryStorageClient
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@ignore_docs
|
|
17
|
-
class BaseResourceClient(ABC):
    """Base class for resource clients."""

    _id: str
    _name: str | None
    _resource_directory: str

    @abstractmethod
    def __init__(
        self: BaseResourceClient,
        *,
        base_storage_directory: str,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> None:
        """Initialize the BaseResourceClient."""
        raise NotImplementedError('You must override this method in the subclass!')

    @abstractmethod
    async def get(self: BaseResourceClient) -> dict | None:
        """Retrieve the storage.

        Returns:
            dict, optional: The retrieved storage, or None, if it does not exist
        """
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    @abstractmethod
    def _get_storages_dir(cls: type[BaseResourceClient], memory_storage_client: MemoryStorageClient) -> str:
        """Return the directory that is searched for storages of this resource type."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    @abstractmethod
    def _get_storage_client_cache(
        cls,
        memory_storage_client: MemoryStorageClient,
    ) -> list[Self]:
        """Return the list used as the in-memory cache of clients of this resource type.

        The returned list is searched and appended to in place by `_find_or_create_client_by_id_or_name`.
        """
        raise NotImplementedError('You must override this method in the subclass!')

    @abstractmethod
    def _to_resource_info(self: BaseResourceClient) -> dict:
        """Return a dict describing this resource."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    @abstractmethod
    def _create_from_directory(
        cls,
        storage_directory: str,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> Self:
        """Create a client for the storage located in `storage_directory`."""
        raise NotImplementedError('You must override this method in the subclass!')

    @classmethod
    def _find_or_create_client_by_id_or_name(
        cls,
        memory_storage_client: MemoryStorageClient,
        id: str | None = None,  # noqa: A002
        name: str | None = None,
    ) -> Self | None:
        """Find a client in the in-memory cache, or create one from a storage directory on disk.

        The lookup order is: the in-memory cache, a directory named `name`, the `__metadata__.json`
        files of all directories in the storages dir, and finally a metadata-less `default` directory.

        Args:
            memory_storage_client: The memory storage client passed on to the class hooks.
            id (str, optional): The ID of the storage to look up.
            name (str, optional): The name of the storage to look up.

        Returns:
            The cached or newly created client, or None if no matching storage was found.
            A newly created client is appended to the cache.
        """
        assert id is not None or name is not None  # noqa: S101

        storage_client_cache = cls._get_storage_client_cache(memory_storage_client)
        storages_dir = cls._get_storages_dir(memory_storage_client)

        # First check memory cache (IDs are compared exactly, names case-insensitively)
        found = next(
            (
                storage_client
                for storage_client in storage_client_cache
                if storage_client._id == id or (storage_client._name and name and storage_client._name.lower() == name.lower())
            ),
            None,
        )

        if found is not None:
            return found

        storage_path = None

        # First try to find the storage by looking up the directory by name
        if name:
            possible_storage_path = os.path.join(storages_dir, name)
            if os.access(possible_storage_path, os.F_OK):
                storage_path = possible_storage_path

        # If it's not found, try going through the storages dir and finding it by metadata
        if not storage_path and os.access(storages_dir, os.F_OK):
            for entry in os.scandir(storages_dir):
                if not entry.is_dir():
                    continue
                metadata_path = os.path.join(entry.path, '__metadata__.json')
                if not os.access(metadata_path, os.F_OK):
                    continue
                with open(metadata_path, encoding='utf-8') as metadata_file:
                    metadata = json.load(metadata_file)
                # On a match, take the complementary identifier from the metadata,
                # so the created client carries both the stored ID and the stored name.
                if id and id == metadata.get('id'):
                    storage_path = entry.path
                    name = metadata.get('name')
                    break
                if name and name == metadata.get('name'):
                    storage_path = entry.path
                    id = metadata.get('id')  # noqa: A001
                    break

        # As a last resort, try to check if the accessed storage is the default one,
        # and the folder has no metadata
        # TODO: make this respect the APIFY_DEFAULT_XXX_ID env var
        # https://github.com/apify/apify-sdk-python/issues/149
        if id == 'default':
            possible_storage_path = os.path.join(storages_dir, id)
            if os.access(possible_storage_path, os.F_OK):
                storage_path = possible_storage_path

        if not storage_path:
            return None

        resource_client = cls._create_from_directory(storage_path, memory_storage_client, id, name)

        storage_client_cache.append(resource_client)

        return resource_client
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
from operator import itemgetter
|
|
5
|
-
from typing import TYPE_CHECKING, Generic, TypeVar, cast
|
|
6
|
-
|
|
7
|
-
from apify_shared.models import ListPage
|
|
8
|
-
from apify_shared.utils import ignore_docs
|
|
9
|
-
|
|
10
|
-
from apify._memory_storage.file_storage_utils import update_metadata
|
|
11
|
-
from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient
|
|
12
|
-
|
|
13
|
-
if TYPE_CHECKING:
|
|
14
|
-
from apify._memory_storage.memory_storage_client import MemoryStorageClient
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
# Type variable for the resource client class handled by a collection client;
# it is bound to BaseResourceClient and parametrizes BaseResourceCollectionClient.
ResourceClientType = TypeVar('ResourceClientType', bound=BaseResourceClient, contravariant=True)  # noqa: PLC0105
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
@ignore_docs
|
|
21
|
-
class BaseResourceCollectionClient(ABC, Generic[ResourceClientType]):
    """Base class for resource collection clients."""

    _base_storage_directory: str
    _memory_storage_client: MemoryStorageClient

    def __init__(
        self: BaseResourceCollectionClient,
        *,
        base_storage_directory: str,
        memory_storage_client: MemoryStorageClient,
    ) -> None:
        """Initialize the BaseResourceCollectionClient with the passed arguments."""
        self._base_storage_directory = base_storage_directory
        self._memory_storage_client = memory_storage_client

    @abstractmethod
    def _get_storage_client_cache(self: BaseResourceCollectionClient) -> list[ResourceClientType]:
        """Return the list used as the in-memory cache of resource clients of this collection."""
        raise NotImplementedError('You must override this method in the subclass!')

    @abstractmethod
    def _get_resource_client_class(self: BaseResourceCollectionClient) -> type[ResourceClientType]:
        """Return the resource client class this collection looks up and instantiates."""
        raise NotImplementedError('You must override this method in the subclass!')

    @abstractmethod
    async def list(self: BaseResourceCollectionClient) -> ListPage:
        """List the available storages.

        Returns:
            ListPage: All storages currently in the in-memory cache, sorted by their `createdAt` value.
        """
        storage_client_cache = self._get_storage_client_cache()

        items = [storage._to_resource_info() for storage in storage_client_cache]

        # Everything is returned as a single page, so total, count and limit are all the item count.
        return ListPage(
            {
                'total': len(items),
                'count': len(items),
                'offset': 0,
                'limit': len(items),
                'desc': False,
                'items': sorted(items, key=itemgetter('createdAt')),
            }
        )

    @abstractmethod
    async def get_or_create(
        self: BaseResourceCollectionClient,
        *,
        name: str | None = None,
        schema: dict | None = None,
        _id: str | None = None,
    ) -> dict:
        """Retrieve a named storage, or create a new one when it doesn't exist.

        Args:
            name (str, optional): The name of the storage to retrieve or create.
            schema (dict, optional): The schema of the storage. It is not used by this base implementation.
            _id (str, optional): The ID of the storage to retrieve or create.

        Returns:
            dict: The retrieved or newly-created storage.
        """
        resource_client_class = self._get_resource_client_class()
        storage_client_cache = self._get_storage_client_cache()

        # The lookup requires at least one identifier; without any, a new storage is always created.
        if name or _id:
            found = resource_client_class._find_or_create_client_by_id_or_name(
                memory_storage_client=self._memory_storage_client,
                name=name,
                id=_id,
            )
            if found is not None:
                resource_info = found._to_resource_info()
                return cast(dict, resource_info)

        new_resource = resource_client_class(
            id=_id,
            name=name,
            base_storage_directory=self._base_storage_directory,
            memory_storage_client=self._memory_storage_client,
        )
        storage_client_cache.append(new_resource)

        resource_info = new_resource._to_resource_info()

        # Write to the disk
        # NOTE(review): presumably skipped when `_write_metadata` is falsy - confirm in `update_metadata`.
        await update_metadata(
            data=resource_info,
            entity_directory=new_resource._resource_directory,
            write_metadata=self._memory_storage_client._write_metadata,
        )

        return cast(dict, resource_info)
|