apify 2.7.2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apify might be problematic.

Files changed (51)
  1. apify/_actor.py +194 -126
  2. apify/_charging.py +34 -9
  3. apify/_configuration.py +79 -6
  4. apify/_crypto.py +0 -6
  5. apify/_models.py +7 -7
  6. apify/_proxy_configuration.py +10 -10
  7. apify/_utils.py +25 -2
  8. apify/events/__init__.py +5 -0
  9. apify/events/_apify_event_manager.py +140 -0
  10. apify/events/_types.py +102 -0
  11. apify/log.py +0 -9
  12. apify/request_loaders/__init__.py +18 -0
  13. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
  14. apify/request_loaders/py.typed +0 -0
  15. apify/scrapy/_logging_config.py +1 -4
  16. apify/scrapy/extensions/_httpcache.py +9 -5
  17. apify/scrapy/requests.py +3 -3
  18. apify/scrapy/scheduler.py +8 -5
  19. apify/storage_clients/__init__.py +12 -0
  20. apify/storage_clients/_apify/__init__.py +11 -0
  21. apify/storage_clients/_apify/_dataset_client.py +328 -0
  22. apify/storage_clients/_apify/_key_value_store_client.py +265 -0
  23. apify/storage_clients/_apify/_models.py +131 -0
  24. apify/storage_clients/_apify/_request_queue_client.py +327 -0
  25. apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
  26. apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
  27. apify/storage_clients/_apify/_storage_client.py +106 -0
  28. apify/storage_clients/_apify/_utils.py +194 -0
  29. apify/storage_clients/_apify/py.typed +0 -0
  30. apify/storage_clients/_file_system/__init__.py +2 -0
  31. apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
  32. apify/storage_clients/_file_system/_storage_client.py +41 -0
  33. apify/storage_clients/_smart_apify/__init__.py +1 -0
  34. apify/storage_clients/_smart_apify/_storage_client.py +117 -0
  35. apify/storage_clients/py.typed +0 -0
  36. apify/storages/__init__.py +1 -3
  37. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
  38. apify-3.0.0.dist-info/RECORD +57 -0
  39. apify/_platform_event_manager.py +0 -231
  40. apify/apify_storage_client/__init__.py +0 -3
  41. apify/apify_storage_client/_apify_storage_client.py +0 -72
  42. apify/apify_storage_client/_dataset_client.py +0 -190
  43. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  44. apify/apify_storage_client/_key_value_store_client.py +0 -109
  45. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  46. apify/apify_storage_client/_request_queue_client.py +0 -176
  47. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  48. apify-2.7.2.dist-info/RECORD +0 -44
  49. /apify/{apify_storage_client → events}/py.typed +0 -0
  50. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
  51. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
apify/log.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import logging
 
-from apify_shared.utils import ignore_docs
 from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
 
 # Name of the logger used throughout the library (resolves to 'apify')
@@ -12,7 +11,6 @@ logger_name = __name__.split('.')[0]
 logger = logging.getLogger(logger_name)
 
 
-@ignore_docs
 class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from parent class)
     pass
 
@@ -29,13 +27,6 @@ def _configure_logging() -> None:
     else:
        apify_client_logger.setLevel(level)
 
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
     configure_logger(apify_logger, remove_old_handlers=True)
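The hunk above removes the SDK's automatic silencing of the `httpx` logger. A minimal sketch of restoring that behaviour in your own Actor code, using only the standard `logging` module (the WARNING threshold mirrors the removed default):

```python
import logging

# Opt back in to quiet 'httpx' logs now that the SDK no longer lowers them itself.
logging.getLogger('httpx').setLevel(logging.WARNING)
```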
apify/request_loaders/__init__.py ADDED
@@ -0,0 +1,18 @@
+from crawlee.request_loaders import (
+    RequestList,
+    RequestLoader,
+    RequestManager,
+    RequestManagerTandem,
+    SitemapRequestLoader,
+)
+
+from ._apify_request_list import ApifyRequestList
+
+__all__ = [
+    'ApifyRequestList',
+    'RequestList',
+    'RequestLoader',
+    'RequestManager',
+    'RequestManagerTandem',
+    'SitemapRequestLoader',
+]
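A minimal usage sketch of the new `apify.request_loaders` package, assuming the conventional `requestListSources` Actor input field; the input key and log message are illustrative:

```python
from apify import Actor
from apify.request_loaders import ApifyRequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # In v2 this lived at apify.storages.RequestList; in v3 it is ApifyRequestList.
        request_list = await ApifyRequestList.open(
            request_list_sources_input=actor_input.get('requestListSources', []),
        )
        request = await request_list.fetch_next_request()
        Actor.log.info(f'First request: {request.url if request else None}')
```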
apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} RENAMED
@@ -3,16 +3,15 @@ from __future__ import annotations
 import asyncio
 import re
 from asyncio import Task
-from functools import partial
-from typing import Annotated, Any, Union
+from typing import Annotated, Any
 
 from pydantic import BaseModel, Field, TypeAdapter
 
-from crawlee import Request
 from crawlee._types import HttpMethod
-from crawlee.http_clients import HttpClient, HttpxHttpClient
-from crawlee.request_loaders import RequestList as CrawleeRequestList
+from crawlee.http_clients import HttpClient, ImpitHttpClient
+from crawlee.request_loaders import RequestList
 
+from apify import Request
 from apify._utils import docs_group
 
 URL_NO_COMMAS_REGEX = re.compile(
@@ -35,11 +34,11 @@ class _SimpleUrlInput(_RequestDetails):
     url: str
 
 
-url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput])
 
 
-@docs_group('Classes')
-class RequestList(CrawleeRequestList):
+@docs_group('Request loaders')
+class ApifyRequestList(RequestList):
     """Extends crawlee RequestList.
 
     Method open is used to create RequestList from actor's requestListSources input.
@@ -50,7 +49,7 @@ class RequestList(CrawleeRequestList):
         name: str | None = None,
         request_list_sources_input: list[dict[str, Any]] | None = None,
         http_client: HttpClient | None = None,
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         """Initialize a new instance from request list source input.
 
         Args:
@@ -74,24 +73,26 @@ class RequestList(CrawleeRequestList):
         ```
         """
         request_list_sources_input = request_list_sources_input or []
-        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+        return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
 
     @staticmethod
     async def _create_request_list(
         name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         if not http_client:
-            http_client = HttpxHttpClient()
+            http_client = ImpitHttpClient()
 
         url_inputs = url_input_adapter.validate_python(request_list_sources_input)
 
         simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
         remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
 
-        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
-        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+        simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
+            remote_url_inputs, http_client=http_client
+        )
 
-        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+        return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
 
     @staticmethod
     def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
@@ -119,13 +120,15 @@ class RequestList(CrawleeRequestList):
         """
         created_requests: list[Request] = []
 
-        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+        async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Extract links from response body and use them to create `Request` objects.
 
            Use the regular expression to find all matching links in the response body, then create `Request`
            objects from these links and the provided input attributes.
            """
-            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            response = await (task.result()).read()
+            matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8'))
+
            created_requests.extend(
                [
                    Request.from_url(
@@ -148,7 +151,11 @@ class RequestList(CrawleeRequestList):
                 )
             )
 
-            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            get_response_task.add_done_callback(
+                lambda task, inp=remote_url_requests_input: asyncio.create_task(  # type: ignore[misc]
+                    create_requests_from_response(inp, task)
+                )
+            )
             remote_url_requests.append(get_response_task)
 
         await asyncio.gather(*remote_url_requests)
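Since reading an Impit response body is asynchronous, the done callback above now schedules a coroutine with `asyncio.create_task` instead of calling a plain function via `functools.partial`. A self-contained sketch of that pattern with illustrative names (not SDK code):

```python
import asyncio


async def handle_result(value: int) -> None:
    # Coroutine that should run once the producing task has finished.
    print(f'task produced {value}')


async def main() -> None:
    scheduled: list[asyncio.Task] = []

    producer = asyncio.create_task(asyncio.sleep(0.1, result=42))
    # add_done_callback accepts only plain callables, so the callback wraps the
    # coroutine in a new task rather than awaiting it directly.
    producer.add_done_callback(
        lambda task: scheduled.append(asyncio.create_task(handle_result(task.result())))
    )

    await producer
    await asyncio.gather(*scheduled)  # also wait for the callback-created task


asyncio.run(main())
```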
apify/request_loaders/py.typed ADDED (file without changes)
apify/scrapy/_logging_config.py CHANGED
@@ -10,7 +10,7 @@ from apify.log import ActorLogFormatter
 
 # Define logger names.
 _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
-_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
 _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
 
 
@@ -37,9 +37,6 @@ def initialize_logging() -> None:
     for logger_name in [None, *_ALL_LOGGERS]:
         _configure_logger(logger_name, logging_level, handler)
 
-    # Set the 'httpx' logger to a less verbose level.
-    logging.getLogger('httpx').setLevel('WARNING')
-
     # Monkey-patch Scrapy's logging configuration to re-apply our settings.
     original_configure_logging = scrapy_logging.configure_logging
 
apify/scrapy/extensions/_httpcache.py CHANGED
@@ -13,8 +13,8 @@ from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
 from apify.scrapy._async_thread import AsyncThread
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import KeyValueStore
 
 if TYPE_CHECKING:
@@ -51,10 +51,14 @@ class ApifyCacheStorage:
         kvs_name = get_kvs_name(spider.name)
 
         async def open_kvs() -> KeyValueStore:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await KeyValueStore.open(
+                    name=kvs_name,
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await KeyValueStore.open(name=kvs_name)
 
         logger.debug("Starting background thread for cache storage's event loop")
apify/scrapy/requests.py CHANGED
@@ -10,9 +10,10 @@ from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
-from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
 
+from apify import Request as ApifyRequest
+
 logger = getLogger(__name__)
 
 
@@ -121,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
 
         # Update the meta field with the meta field from the apify_request
         meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        meta.update({'apify_request_unique_key': apify_request.unique_key})
         # scrapy_request.meta is a property, so we have to set it like this
         scrapy_request._meta = meta  # noqa: SLF001
 
@@ -133,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
             url=apify_request.url,
             method=apify_request.method,
             meta={
-                'apify_request_id': apify_request.id,
                 'apify_request_unique_key': apify_request.unique_key,
             },
         )
apify/scrapy/scheduler.py CHANGED
@@ -11,7 +11,7 @@ from scrapy.utils.reactor import is_asyncio_reactor_installed
 from ._async_thread import AsyncThread
 from .requests import to_apify_request, to_scrapy_request
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -49,10 +49,13 @@ class ApifyScheduler(BaseScheduler):
         self.spider = spider
 
         async def open_rq() -> RequestQueue:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await RequestQueue.open(storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await RequestQueue.open(
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await RequestQueue.open()
 
         try:
apify/storage_clients/__init__.py ADDED
@@ -0,0 +1,12 @@
+from crawlee.storage_clients import MemoryStorageClient
+
+from ._apify import ApifyStorageClient
+from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+from ._smart_apify import SmartApifyStorageClient
+
+__all__ = [
+    'ApifyStorageClient',
+    'FileSystemStorageClient',
+    'MemoryStorageClient',
+    'SmartApifyStorageClient',
+]
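A minimal sketch of choosing one of the newly exported clients explicitly, mirroring the `KeyValueStore.open(...)` call pattern used in the Scrapy integration above; the store name is illustrative:

```python
from apify import Actor, Configuration
from apify.storage_clients import ApifyStorageClient, FileSystemStorageClient
from apify.storages import KeyValueStore


async def main() -> None:
    async with Actor:
        configuration = Configuration.get_global_configuration()
        # Use the platform client on Apify, the local file-system client elsewhere.
        storage_client = ApifyStorageClient() if configuration.is_at_home else FileSystemStorageClient()
        kvs = await KeyValueStore.open(
            name='cache',  # illustrative name
            configuration=configuration,
            storage_client=storage_client,
        )
        await kvs.set_value('greeting', {'hello': 'world'})
```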
apify/storage_clients/_apify/__init__.py ADDED
@@ -0,0 +1,11 @@
+from ._dataset_client import ApifyDatasetClient
+from ._key_value_store_client import ApifyKeyValueStoreClient
+from ._request_queue_client import ApifyRequestQueueClient
+from ._storage_client import ApifyStorageClient
+
+__all__ = [
+    'ApifyDatasetClient',
+    'ApifyKeyValueStoreClient',
+    'ApifyRequestQueueClient',
+    'ApifyStorageClient',
+]
apify/storage_clients/_apify/_dataset_client.py ADDED
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+import asyncio
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from apify_client import ApifyClientAsync
+from crawlee._utils.byte_size import ByteSize
+from crawlee._utils.file import json_dumps
+from crawlee.storage_clients._base import DatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+from crawlee.storages import Dataset
+
+from ._utils import AliasResolver
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from apify_client.clients import DatasetClientAsync
+    from crawlee._types import JsonSerializable
+
+    from apify import Configuration
+
+logger = getLogger(__name__)
+
+
+class ApifyDatasetClient(DatasetClient):
+    """An Apify platform implementation of the dataset client."""
+
+    _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
+    """Maximum size for a single payload."""
+
+    _SAFETY_BUFFER_COEFFICIENT = 0.01 / 100  # 0.01%
+    """Percentage buffer to reduce payload limit slightly for safety."""
+
+    _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_COEFFICIENT)
+    """Calculated payload limit considering safety buffer."""
+
+    def __init__(
+        self,
+        *,
+        api_client: DatasetClientAsync,
+        api_public_base_url: str,
+        lock: asyncio.Lock,
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `ApifyDatasetClient.open` class method to create a new instance.
+        """
+        self._api_client = api_client
+        """The Apify dataset client for API operations."""
+
+        self._api_public_base_url = api_public_base_url
+        """The public base URL for accessing the key-value store records."""
+
+        self._lock = lock
+        """A lock to ensure that only one operation is performed at a time."""
+
+    @override
+    async def get_metadata(self) -> DatasetMetadata:
+        metadata = await self._api_client.get()
+        return DatasetMetadata.model_validate(metadata)
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        configuration: Configuration,
+    ) -> ApifyDatasetClient:
+        """Open an Apify dataset client.
+
+        This method creates and initializes a new instance of the Apify dataset client.
+        It handles authentication, storage lookup/creation, and metadata retrieval.
+
+        Args:
+            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
+                Mutually exclusive with name and alias.
+            name: The name of the dataset to open (global scope, persists across runs).
+                Mutually exclusive with id and alias.
+            alias: The alias of the dataset to open (run scope, creates unnamed storage).
+                Mutually exclusive with id and name.
+            configuration: The configuration object containing API credentials and settings. Must include a valid
+                `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither
+                `id`, `name`, nor `alias` is provided.
+
+        Returns:
+            An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
+                `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
+                in the configuration.
+        """
+        if sum(1 for param in [id, name, alias] if param is not None) > 1:
+            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+
+        token = configuration.token
+        if not token:
+            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
+
+        api_url = configuration.api_base_url
+        if not api_url:
+            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')
+
+        api_public_base_url = configuration.api_public_base_url
+        if not api_public_base_url:
+            raise ValueError(
+                'Apify storage client requires a valid API public base URL in Configuration '
+                f'(api_public_base_url={api_public_base_url}).'
+            )
+
+        # Create Apify client with the provided token and API URL.
+        apify_client_async = ApifyClientAsync(
+            token=token,
+            api_url=api_url,
+            max_retries=8,
+            min_delay_between_retries_millis=500,
+            timeout_secs=360,
+        )
+        apify_datasets_client = apify_client_async.datasets()
+
+        # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed
+        # storage aliased as `__default__`
+        if not any([alias, name, id, configuration.default_dataset_id]):
+            alias = '__default__'
+
+        if alias:
+            # Check if there is pre-existing alias mapping in the default KVS.
+            async with AliasResolver(storage_type=Dataset, alias=alias, configuration=configuration) as _alias:
+                id = await _alias.resolve_id()
+
+                # There was no pre-existing alias in the mapping.
+                # Create a new unnamed storage and store the mapping.
+                if id is None:
+                    new_storage_metadata = DatasetMetadata.model_validate(
+                        await apify_datasets_client.get_or_create(),
+                    )
+                    id = new_storage_metadata.id
+                    await _alias.store_mapping(storage_id=id)
+
+        # If name is provided, get or create the storage by name.
+        elif name:
+            id = DatasetMetadata.model_validate(
+                await apify_datasets_client.get_or_create(name=name),
+            ).id
+
+        # If none are provided, try to get the default storage ID from environment variables.
+        elif id is None:
+            id = configuration.default_dataset_id
+            if not id:
+                raise ValueError(
+                    'Dataset "id", "name", or "alias" must be specified, '
+                    'or a default dataset ID must be set in the configuration.'
+                )
+
+        # Now create the client for the determined ID
+        apify_dataset_client = apify_client_async.dataset(dataset_id=id)
+
+        # Fetch its metadata.
+        metadata = await apify_dataset_client.get()
+
+        # If metadata is None, it means the storage does not exist, so we create it.
+        if metadata is None:
+            id = DatasetMetadata.model_validate(
+                await apify_datasets_client.get_or_create(),
+            ).id
+            apify_dataset_client = apify_client_async.dataset(dataset_id=id)
+
+        # Verify that the storage exists by fetching its metadata again.
+        metadata = await apify_dataset_client.get()
+        if metadata is None:
+            raise ValueError(f'Opening dataset with id={id}, name={name}, and alias={alias} failed.')
+
+        return cls(
+            api_client=apify_dataset_client,
+            api_public_base_url=api_public_base_url,
+            lock=asyncio.Lock(),
+        )
+
+    @override
+    async def purge(self) -> None:
+        raise NotImplementedError(
+            'Purging datasets is not supported in the Apify platform. '
+            'Use the `drop` method to delete the dataset instead.'
+        )
+
+    @override
+    async def drop(self) -> None:
+        async with self._lock:
+            await self._api_client.delete()
+
+    @override
+    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+        async def payloads_generator() -> AsyncIterator[str]:
+            for index, item in enumerate(data):
+                yield await self._check_and_serialize(item, index)
+
+        async with self._lock:
+            # Handle lists
+            if isinstance(data, list):
+                # Invoke client in series to preserve the order of data
+                async for items in self._chunk_by_size(payloads_generator()):
+                    await self._api_client.push_items(items=items)
+
+            # Handle singular items
+            else:
+                items = await self._check_and_serialize(data)
+                await self._api_client.push_items(items=items)
+
+    @override
+    async def get_data(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+        flatten: list[str] | None = None,
+        view: str | None = None,
+    ) -> DatasetItemsListPage:
+        response = await self._api_client.list_items(
+            offset=offset,
+            limit=limit,
+            clean=clean,
+            desc=desc,
+            fields=fields,
+            omit=omit,
+            unwind=unwind,
+            skip_empty=skip_empty,
+            skip_hidden=skip_hidden,
+            flatten=flatten,
+            view=view,
+        )
+        return DatasetItemsListPage.model_validate(vars(response))
+
+    @override
+    async def iterate_items(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = None,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+    ) -> AsyncIterator[dict]:
+        async for item in self._api_client.iterate_items(
+            offset=offset,
+            limit=limit,
+            clean=clean,
+            desc=desc,
+            fields=fields,
+            omit=omit,
+            unwind=unwind,
+            skip_empty=skip_empty,
+            skip_hidden=skip_hidden,
+        ):
+            yield item
+
+    @classmethod
+    async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str:
+        """Serialize a given item to JSON, checks its serializability and size against a limit.
+
+        Args:
+            item: The item to serialize.
+            index: Index of the item, used for error context.
+
+        Returns:
+            Serialized JSON string.
+
+        Raises:
+            ValueError: If item is not JSON serializable or exceeds size limit.
+        """
+        s = ' ' if index is None else f' at index {index} '
+
+        try:
+            payload = await json_dumps(item)
+        except Exception as exc:
+            raise ValueError(f'Data item{s}is not serializable to JSON.') from exc
+
+        payload_size = ByteSize(len(payload.encode('utf-8')))
+        if payload_size > cls._EFFECTIVE_LIMIT_SIZE:
+            raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})')
+
+        return payload
+
+    async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]:
+        """Yield chunks of JSON arrays composed of input strings, respecting a size limit.
+
+        Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size
+        of each array does not exceed `EFFECTIVE_LIMIT_SIZE`. Each output is a JSON array string that
+        contains as many payloads as possible without breaching the size threshold, maintaining the
+        order of the original payloads. Assumes individual items are below the size limit.
+
+        Args:
+            items: Iterable of JSON string payloads.
+
+        Yields:
+            Strings representing JSON arrays of payloads, each staying within the size limit.
+        """
+        last_chunk_size = ByteSize(2)  # Add 2 bytes for [] wrapper.
+        current_chunk = []
+
+        async for payload in items:
+            payload_size = ByteSize(len(payload.encode('utf-8')))
+
+            if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE:
+                current_chunk.append(payload)
+                last_chunk_size += payload_size + ByteSize(1)  # Add 1 byte for ',' separator.
+            else:
+                yield f'[{",".join(current_chunk)}]'
+                current_chunk = [payload]
+                last_chunk_size = payload_size + ByteSize(2)  # Add 2 bytes for [] wrapper.
+
+        yield f'[{",".join(current_chunk)}]'
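A minimal sketch of exercising the new dataset client directly through the `open()` signature shown above, assuming the usual Apify environment variables (token, API base URLs) back the global configuration; in normal use the client is created for you by `ApifyStorageClient`, and the alias and items here are illustrative:

```python
import asyncio

from apify import Configuration
from apify.storage_clients._apify import ApifyDatasetClient


async def main() -> None:
    configuration = Configuration.get_global_configuration()

    # Open an unnamed, run-scoped dataset via an alias
    # (id, name and alias are mutually exclusive).
    client = await ApifyDatasetClient.open(
        id=None,
        name=None,
        alias='my-results',  # illustrative alias
        configuration=configuration,
    )

    # push_data serializes each item to JSON and batches payloads so that every
    # API call stays under the ~9 MB effective limit.
    await client.push_data([{'url': 'https://example.com', 'status': 200}])

    async for item in client.iterate_items():
        print(item)


asyncio.run(main())
```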