apify 2.7.2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apify might be problematic.

Files changed (51)
  1. apify/_actor.py +194 -126
  2. apify/_charging.py +34 -9
  3. apify/_configuration.py +79 -6
  4. apify/_crypto.py +0 -6
  5. apify/_models.py +7 -7
  6. apify/_proxy_configuration.py +10 -10
  7. apify/_utils.py +25 -2
  8. apify/events/__init__.py +5 -0
  9. apify/events/_apify_event_manager.py +140 -0
  10. apify/events/_types.py +102 -0
  11. apify/log.py +0 -9
  12. apify/request_loaders/__init__.py +18 -0
  13. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
  14. apify/request_loaders/py.typed +0 -0
  15. apify/scrapy/_logging_config.py +1 -4
  16. apify/scrapy/extensions/_httpcache.py +9 -5
  17. apify/scrapy/requests.py +3 -3
  18. apify/scrapy/scheduler.py +8 -5
  19. apify/storage_clients/__init__.py +12 -0
  20. apify/storage_clients/_apify/__init__.py +11 -0
  21. apify/storage_clients/_apify/_dataset_client.py +328 -0
  22. apify/storage_clients/_apify/_key_value_store_client.py +265 -0
  23. apify/storage_clients/_apify/_models.py +131 -0
  24. apify/storage_clients/_apify/_request_queue_client.py +327 -0
  25. apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
  26. apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
  27. apify/storage_clients/_apify/_storage_client.py +106 -0
  28. apify/storage_clients/_apify/_utils.py +194 -0
  29. apify/storage_clients/_apify/py.typed +0 -0
  30. apify/storage_clients/_file_system/__init__.py +2 -0
  31. apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
  32. apify/storage_clients/_file_system/_storage_client.py +41 -0
  33. apify/storage_clients/_smart_apify/__init__.py +1 -0
  34. apify/storage_clients/_smart_apify/_storage_client.py +117 -0
  35. apify/storage_clients/py.typed +0 -0
  36. apify/storages/__init__.py +1 -3
  37. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
  38. apify-3.0.0.dist-info/RECORD +57 -0
  39. apify/_platform_event_manager.py +0 -231
  40. apify/apify_storage_client/__init__.py +0 -3
  41. apify/apify_storage_client/_apify_storage_client.py +0 -72
  42. apify/apify_storage_client/_dataset_client.py +0 -190
  43. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  44. apify/apify_storage_client/_key_value_store_client.py +0 -109
  45. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  46. apify/apify_storage_client/_request_queue_client.py +0 -176
  47. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  48. apify-2.7.2.dist-info/RECORD +0 -44
  49. /apify/{apify_storage_client → events}/py.typed +0 -0
  50. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
  51. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
apify/log.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import logging
 
-from apify_shared.utils import ignore_docs
 from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
 
 # Name of the logger used throughout the library (resolves to 'apify')
@@ -12,7 +11,6 @@ logger_name = __name__.split('.')[0]
 logger = logging.getLogger(logger_name)
 
 
-@ignore_docs
 class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from parent class)
     pass
 
@@ -29,13 +27,6 @@ def _configure_logging() -> None:
     else:
        apify_client_logger.setLevel(level)
 
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
     configure_logger(apify_logger, remove_old_handlers=True)
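The hunk above removes the SDK's automatic silencing of the `httpx` logger. A minimal sketch of restoring that behaviour in your own Actor code, using only the standard `logging` module (the WARNING threshold mirrors the removed default):

```python
import logging

# Opt back in to quiet 'httpx' logs now that the SDK no longer lowers them itself.
logging.getLogger('httpx').setLevel(logging.WARNING)
```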
apify/request_loaders/__init__.py ADDED
@@ -0,0 +1,18 @@
+from crawlee.request_loaders import (
+    RequestList,
+    RequestLoader,
+    RequestManager,
+    RequestManagerTandem,
+    SitemapRequestLoader,
+)
+
+from ._apify_request_list import ApifyRequestList
+
+__all__ = [
+    'ApifyRequestList',
+    'RequestList',
+    'RequestLoader',
+    'RequestManager',
+    'RequestManagerTandem',
+    'SitemapRequestLoader',
+]
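A minimal usage sketch of the new `apify.request_loaders` package, assuming the conventional `requestListSources` Actor input field; the input key and log message are illustrative:

```python
from apify import Actor
from apify.request_loaders import ApifyRequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # In v2 this lived at apify.storages.RequestList; in v3 it is ApifyRequestList.
        request_list = await ApifyRequestList.open(
            request_list_sources_input=actor_input.get('requestListSources', []),
        )
        request = await request_list.fetch_next_request()
        Actor.log.info(f'First request: {request.url if request else None}')
```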
apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} RENAMED
@@ -3,16 +3,15 @@ from __future__ import annotations
 import asyncio
 import re
 from asyncio import Task
-from functools import partial
-from typing import Annotated, Any, Union
+from typing import Annotated, Any
 
 from pydantic import BaseModel, Field, TypeAdapter
 
-from crawlee import Request
 from crawlee._types import HttpMethod
-from crawlee.http_clients import HttpClient, HttpxHttpClient
-from crawlee.request_loaders import RequestList as CrawleeRequestList
+from crawlee.http_clients import HttpClient, ImpitHttpClient
+from crawlee.request_loaders import RequestList
 
+from apify import Request
 from apify._utils import docs_group
 
 URL_NO_COMMAS_REGEX = re.compile(
@@ -35,11 +34,11 @@ class _SimpleUrlInput(_RequestDetails):
     url: str
 
 
-url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput])
 
 
-@docs_group('Classes')
-class RequestList(CrawleeRequestList):
+@docs_group('Request loaders')
+class ApifyRequestList(RequestList):
     """Extends crawlee RequestList.
 
     Method open is used to create RequestList from actor's requestListSources input.
@@ -50,7 +49,7 @@ class RequestList(CrawleeRequestList):
         name: str | None = None,
         request_list_sources_input: list[dict[str, Any]] | None = None,
         http_client: HttpClient | None = None,
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         """Initialize a new instance from request list source input.
 
         Args:
@@ -74,24 +73,26 @@ class RequestList(CrawleeRequestList):
         ```
         """
         request_list_sources_input = request_list_sources_input or []
-        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+        return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
 
     @staticmethod
     async def _create_request_list(
         name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         if not http_client:
-            http_client = HttpxHttpClient()
+            http_client = ImpitHttpClient()
 
         url_inputs = url_input_adapter.validate_python(request_list_sources_input)
 
         simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
         remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
 
-        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
-        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+        simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
+            remote_url_inputs, http_client=http_client
+        )
 
-        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+        return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
 
     @staticmethod
     def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
@@ -119,13 +120,15 @@ class RequestList(CrawleeRequestList):
         """
         created_requests: list[Request] = []
 
-        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+        async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Extract links from response body and use them to create `Request` objects.
 
            Use the regular expression to find all matching links in the response body, then create `Request`
            objects from these links and the provided input attributes.
            """
-            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            response = await (task.result()).read()
+            matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8'))
+
            created_requests.extend(
                [
                    Request.from_url(
@@ -148,7 +151,11 @@ class RequestList(CrawleeRequestList):
                 )
             )
 
-            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            get_response_task.add_done_callback(
+                lambda task, inp=remote_url_requests_input: asyncio.create_task(  # type: ignore[misc]
+                    create_requests_from_response(inp, task)
+                )
+            )
             remote_url_requests.append(get_response_task)
 
         await asyncio.gather(*remote_url_requests)
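Since reading an Impit response body is asynchronous, the done callback above now schedules a coroutine with `asyncio.create_task` instead of calling a plain function via `functools.partial`. A self-contained sketch of that pattern with illustrative names (not SDK code):

```python
import asyncio


async def handle_result(value: int) -> None:
    # Coroutine that should run once the producing task has finished.
    print(f'task produced {value}')


async def main() -> None:
    scheduled: list[asyncio.Task] = []

    producer = asyncio.create_task(asyncio.sleep(0.1, result=42))
    # add_done_callback accepts only plain callables, so the callback wraps the
    # coroutine in a new task rather than awaiting it directly.
    producer.add_done_callback(
        lambda task: scheduled.append(asyncio.create_task(handle_result(task.result())))
    )

    await producer
    await asyncio.gather(*scheduled)  # also wait for the callback-created task


asyncio.run(main())
```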
apify/request_loaders/py.typed ADDED (file without changes)
apify/scrapy/_logging_config.py CHANGED
@@ -10,7 +10,7 @@ from apify.log import ActorLogFormatter
 
 # Define logger names.
 _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
-_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
 _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
 
 
@@ -37,9 +37,6 @@ def initialize_logging() -> None:
     for logger_name in [None, *_ALL_LOGGERS]:
         _configure_logger(logger_name, logging_level, handler)
 
-    # Set the 'httpx' logger to a less verbose level.
-    logging.getLogger('httpx').setLevel('WARNING')
-
     # Monkey-patch Scrapy's logging configuration to re-apply our settings.
     original_configure_logging = scrapy_logging.configure_logging
 
apify/scrapy/extensions/_httpcache.py CHANGED
@@ -13,8 +13,8 @@ from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
 from apify.scrapy._async_thread import AsyncThread
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import KeyValueStore
 
 if TYPE_CHECKING:
@@ -51,10 +51,14 @@ class ApifyCacheStorage:
         kvs_name = get_kvs_name(spider.name)
 
         async def open_kvs() -> KeyValueStore:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await KeyValueStore.open(
+                    name=kvs_name,
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await KeyValueStore.open(name=kvs_name)
 
         logger.debug("Starting background thread for cache storage's event loop")
apify/scrapy/requests.py CHANGED
@@ -10,9 +10,10 @@ from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
-from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
 
+from apify import Request as ApifyRequest
+
 logger = getLogger(__name__)
 
 
@@ -121,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
 
         # Update the meta field with the meta field from the apify_request
         meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        meta.update({'apify_request_unique_key': apify_request.unique_key})
         # scrapy_request.meta is a property, so we have to set it like this
         scrapy_request._meta = meta  # noqa: SLF001
 
@@ -133,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
             url=apify_request.url,
             method=apify_request.method,
             meta={
-                'apify_request_id': apify_request.id,
                 'apify_request_unique_key': apify_request.unique_key,
             },
         )
apify/scrapy/scheduler.py CHANGED
@@ -11,7 +11,7 @@ from scrapy.utils.reactor import is_asyncio_reactor_installed
 from ._async_thread import AsyncThread
 from .requests import to_apify_request, to_scrapy_request
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -49,10 +49,13 @@ class ApifyScheduler(BaseScheduler):
         self.spider = spider
 
         async def open_rq() -> RequestQueue:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await RequestQueue.open(storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await RequestQueue.open(
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await RequestQueue.open()
 
         try:
apify/storage_clients/__init__.py ADDED
@@ -0,0 +1,12 @@
+from crawlee.storage_clients import MemoryStorageClient
+
+from ._apify import ApifyStorageClient
+from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+from ._smart_apify import SmartApifyStorageClient
+
+__all__ = [
+    'ApifyStorageClient',
+    'FileSystemStorageClient',
+    'MemoryStorageClient',
+    'SmartApifyStorageClient',
+]
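A minimal sketch of choosing one of the newly exported clients explicitly, mirroring the `KeyValueStore.open(...)` call pattern used in the Scrapy integration above; the store name is illustrative:

```python
from apify import Actor, Configuration
from apify.storage_clients import ApifyStorageClient, FileSystemStorageClient
from apify.storages import KeyValueStore


async def main() -> None:
    async with Actor:
        configuration = Configuration.get_global_configuration()
        # Use the platform client on Apify, the local file-system client elsewhere.
        storage_client = ApifyStorageClient() if configuration.is_at_home else FileSystemStorageClient()
        kvs = await KeyValueStore.open(
            name='cache',  # illustrative name
            configuration=configuration,
            storage_client=storage_client,
        )
        await kvs.set_value('greeting', {'hello': 'world'})
```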
apify/storage_clients/_apify/__init__.py ADDED
@@ -0,0 +1,11 @@
+from ._dataset_client import ApifyDatasetClient
+from ._key_value_store_client import ApifyKeyValueStoreClient
+from ._request_queue_client import ApifyRequestQueueClient
+from ._storage_client import ApifyStorageClient
+
+__all__ = [
+    'ApifyDatasetClient',
+    'ApifyKeyValueStoreClient',
+    'ApifyRequestQueueClient',
+    'ApifyStorageClient',
+]
apify/storage_clients/_apify/_dataset_client.py ADDED
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+import asyncio
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from apify_client import ApifyClientAsync
+from crawlee._utils.byte_size import ByteSize
+from crawlee._utils.file import json_dumps
+from crawlee.storage_clients._base import DatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+from crawlee.storages import Dataset
+
+from ._utils import AliasResolver
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from apify_client.clients import DatasetClientAsync
+    from crawlee._types import JsonSerializable
+
+    from apify import Configuration
+
+logger = getLogger(__name__)
+
+
+class ApifyDatasetClient(DatasetClient):
+    """An Apify platform implementation of the dataset client."""
+
+    _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
+    """Maximum size for a single payload."""
+
+    _SAFETY_BUFFER_COEFFICIENT = 0.01 / 100  # 0.01%
+    """Percentage buffer to reduce payload limit slightly for safety."""
+
+    _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_COEFFICIENT)
+    """Calculated payload limit considering safety buffer."""
+
+    def __init__(
+        self,
+        *,
+        api_client: DatasetClientAsync,
+        api_public_base_url: str,
+        lock: asyncio.Lock,
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `ApifyDatasetClient.open` class method to create a new instance.
+        """
+        self._api_client = api_client
+        """The Apify dataset client for API operations."""
+
+        self._api_public_base_url = api_public_base_url
+        """The public base URL for accessing the key-value store records."""
+
+        self._lock = lock
+        """A lock to ensure that only one operation is performed at a time."""
+
+    @override
+    async def get_metadata(self) -> DatasetMetadata:
+        metadata = await self._api_client.get()
+        return DatasetMetadata.model_validate(metadata)
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        configuration: Configuration,
+    ) -> ApifyDatasetClient:
+        """Open an Apify dataset client.
+
+        This method creates and initializes a new instance of the Apify dataset client.
+        It handles authentication, storage lookup/creation, and metadata retrieval.
+
+        Args:
+            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
+                Mutually exclusive with name and alias.
+            name: The name of the dataset to open (global scope, persists across runs).
+                Mutually exclusive with id and alias.
+            alias: The alias of the dataset to open (run scope, creates unnamed storage).
+                Mutually exclusive with id and name.
+            configuration: The configuration object containing API credentials and settings. Must include a valid
+                `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither
+                `id`, `name`, nor `alias` is provided.
+
+        Returns:
+            An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
+                `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
+                in the configuration.
+        """
+        if sum(1 for param in [id, name, alias] if param is not None) > 1:
+            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+
+        token = configuration.token
+        if not token:
+            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
+
+        api_url = configuration.api_base_url
+        if not api_url:
+            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
+
+        api_public_base_url = configuration.api_public_base_url
+        if not api_public_base_url:
+            raise ValueError(
+                'Apify storage client requires a valid API public base URL in Configuration '
+                f'(api_public_base_url={api_public_base_url}).'
+            )
+
+        # Create Apify client with the provided token and API URL.
+        apify_client_async = ApifyClientAsync(
+            token=token,
+            api_url=api_url,
+            max_retries=8,
+            min_delay_between_retries_millis=500,
+            timeout_secs=360,
+        )
+        apify_datasets_client = apify_client_async.datasets()
+
+        # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
+        # storage aliased as `__default__`
+        if not any([alias, name, id, configuration.default_dataset_id]):
+            alias = '__default__'
+
+        if alias:
+            # Check if there is pre-existing alias mapping in the default KVS.
+            async with AliasResolver(storage_type=Dataset, alias=alias, configuration=configuration) as _alias:
+                id = await _alias.resolve_id()
+
+                # There was no pre-existing alias in the mapping.
+                # Create a new unnamed storage and store the mapping.
+                if id is None:
+                    new_storage_metadata = DatasetMetadata.model_validate(
+                        await apify_datasets_client.get_or_create(),
+                    )
+                    id = new_storage_metadata.id
+                    await _alias.store_mapping(storage_id=id)
+
+        # If name is provided, get or create the storage by name.
+        elif name:
+            id = DatasetMetadata.model_validate(
+                await apify_datasets_client.get_or_create(name=name),
+            ).id
+
+        # If none are provided, try to get the default storage ID from environment variables.
+        elif id is None:
+            id = configuration.default_dataset_id
+            if not id:
+                raise ValueError(
+                    'Dataset "id", "name", or "alias" must be specified, '
+                    'or a default dataset ID must be set in the configuration.'
+                )
+
+        # Now create the client for the determined ID
+        apify_dataset_client = apify_client_async.dataset(dataset_id=id)
+
+        # Fetch its metadata.
+        metadata = await apify_dataset_client.get()
+
+        # If metadata is None, it means the storage does not exist, so we create it.
+        if metadata is None:
+            id = DatasetMetadata.model_validate(
+                await apify_datasets_client.get_or_create(),
+            ).id
+            apify_dataset_client = apify_client_async.dataset(dataset_id=id)
+
+        # Verify that the storage exists by fetching its metadata again.
+        metadata = await apify_dataset_client.get()
+        if metadata is None:
+            raise ValueError(f'Opening dataset with id={id}, name={name}, and alias={alias} failed.')
+
+        return cls(
+            api_client=apify_dataset_client,
+            api_public_base_url=api_public_base_url,
+            lock=asyncio.Lock(),
+        )
+
+    @override
+    async def purge(self) -> None:
+        raise NotImplementedError(
+            'Purging datasets is not supported in the Apify platform. '
+            'Use the `drop` method to delete the dataset instead.'
+        )
+
+    @override
+    async def drop(self) -> None:
+        async with self._lock:
+            await self._api_client.delete()
+
+    @override
+    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+        async def payloads_generator() -> AsyncIterator[str]:
+            for index, item in enumerate(data):
+                yield await self._check_and_serialize(item, index)
+
+        async with self._lock:
+            # Handle lists
+            if isinstance(data, list):
+                # Invoke client in series to preserve the order of data
+                async for items in self._chunk_by_size(payloads_generator()):
+                    await self._api_client.push_items(items=items)
+
+            # Handle singular items
+            else:
+                items = await self._check_and_serialize(data)
+                await self._api_client.push_items(items=items)
+
+    @override
+    async def get_data(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+        flatten: list[str] | None = None,
+        view: str | None = None,
+    ) -> DatasetItemsListPage:
+        response = await self._api_client.list_items(
+            offset=offset,
+            limit=limit,
+            clean=clean,
+            desc=desc,
+            fields=fields,
+            omit=omit,
+            unwind=unwind,
+            skip_empty=skip_empty,
+            skip_hidden=skip_hidden,
+            flatten=flatten,
+            view=view,
+        )
+        return DatasetItemsListPage.model_validate(vars(response))
+
+    @override
+    async def iterate_items(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = None,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+    ) -> AsyncIterator[dict]:
+        async for item in self._api_client.iterate_items(
+            offset=offset,
+            limit=limit,
+            clean=clean,
+            desc=desc,
+            fields=fields,
+            omit=omit,
+            unwind=unwind,
+            skip_empty=skip_empty,
+            skip_hidden=skip_hidden,
+        ):
+            yield item
+
+    @classmethod
+    async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str:
+        """Serialize a given item to JSON, checks its serializability and size against a limit.
+
+        Args:
+            item: The item to serialize.
+            index: Index of the item, used for error context.
+
+        Returns:
+            Serialized JSON string.
+
+        Raises:
+            ValueError: If item is not JSON serializable or exceeds size limit.
+        """
+        s = ' ' if index is None else f' at index {index} '
+
+        try:
+            payload = await json_dumps(item)
+        except Exception as exc:
+            raise ValueError(f'Data item{s}is not serializable to JSON.') from exc
+
+        payload_size = ByteSize(len(payload.encode('utf-8')))
+        if payload_size > cls._EFFECTIVE_LIMIT_SIZE:
+            raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})')
+
+        return payload
+
+    async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]:
+        """Yield chunks of JSON arrays composed of input strings, respecting a size limit.
+
+        Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size
+        of each array does not exceed `EFFECTIVE_LIMIT_SIZE`. Each output is a JSON array string that
+        contains as many payloads as possible without breaching the size threshold, maintaining the
+        order of the original payloads. Assumes individual items are below the size limit.
+
+        Args:
+            items: Iterable of JSON string payloads.
+
+        Yields:
+            Strings representing JSON arrays of payloads, each staying within the size limit.
+        """
+        last_chunk_size = ByteSize(2)  # Add 2 bytes for [] wrapper.
+        current_chunk = []
+
+        async for payload in items:
+            payload_size = ByteSize(len(payload.encode('utf-8')))
+
+            if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE:
+                current_chunk.append(payload)
+                last_chunk_size += payload_size + ByteSize(1)  # Add 1 byte for ',' separator.
+            else:
+                yield f'[{",".join(current_chunk)}]'
+                current_chunk = [payload]
+                last_chunk_size = payload_size + ByteSize(2)  # Add 2 bytes for [] wrapper.
+
+        yield f'[{",".join(current_chunk)}]'
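A minimal sketch of exercising the new dataset client directly through the `open()` signature shown above, assuming the usual Apify environment variables (token, API base URLs) back the global configuration; in normal use the client is created for you by `ApifyStorageClient`, and the alias and items here are illustrative:

```python
import asyncio

from apify import Configuration
from apify.storage_clients._apify import ApifyDatasetClient


async def main() -> None:
    configuration = Configuration.get_global_configuration()

    # Open an unnamed, run-scoped dataset via an alias
    # (id, name and alias are mutually exclusive).
    client = await ApifyDatasetClient.open(
        id=None,
        name=None,
        alias='my-results',  # illustrative alias
        configuration=configuration,
    )

    # push_data serializes each item to JSON and batches payloads so that every
    # API call stays under the ~9 MB effective limit.
    await client.push_data([{'url': 'https://example.com', 'status': 200}])

    async for item in client.iterate_items():
        print(item)


asyncio.run(main())
```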