apify-2.7.3-py3-none-any.whl → apify-3.0.0rc1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the apify package as they appear in their public registry. It is provided for informational purposes only.

Files changed (46)
  1. apify/_actor.py +47 -12
  2. apify/_charging.py +15 -9
  3. apify/_configuration.py +34 -1
  4. apify/_crypto.py +0 -6
  5. apify/_models.py +7 -7
  6. apify/_proxy_configuration.py +10 -10
  7. apify/_utils.py +25 -2
  8. apify/events/__init__.py +5 -0
  9. apify/events/_apify_event_manager.py +140 -0
  10. apify/events/_types.py +102 -0
  11. apify/log.py +0 -9
  12. apify/request_loaders/__init__.py +18 -0
  13. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
  14. apify/request_loaders/py.typed +0 -0
  15. apify/scrapy/_logging_config.py +1 -4
  16. apify/scrapy/extensions/_httpcache.py +9 -5
  17. apify/scrapy/requests.py +3 -3
  18. apify/scrapy/scheduler.py +8 -5
  19. apify/storage_clients/__init__.py +10 -0
  20. apify/storage_clients/_apify/__init__.py +11 -0
  21. apify/storage_clients/_apify/_dataset_client.py +304 -0
  22. apify/storage_clients/_apify/_key_value_store_client.py +241 -0
  23. apify/storage_clients/_apify/_models.py +107 -0
  24. apify/storage_clients/_apify/_request_queue_client.py +787 -0
  25. apify/storage_clients/_apify/_storage_client.py +80 -0
  26. apify/storage_clients/_apify/py.typed +0 -0
  27. apify/storage_clients/_file_system/__init__.py +2 -0
  28. apify/storage_clients/_file_system/_key_value_store_client.py +36 -0
  29. apify/storage_clients/_file_system/_storage_client.py +35 -0
  30. apify/storage_clients/py.typed +0 -0
  31. apify/storages/__init__.py +1 -3
  32. {apify-2.7.3.dist-info → apify-3.0.0rc1.dist-info}/METADATA +8 -7
  33. apify-3.0.0rc1.dist-info/RECORD +52 -0
  34. apify/_platform_event_manager.py +0 -231
  35. apify/apify_storage_client/__init__.py +0 -3
  36. apify/apify_storage_client/_apify_storage_client.py +0 -72
  37. apify/apify_storage_client/_dataset_client.py +0 -190
  38. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  39. apify/apify_storage_client/_key_value_store_client.py +0 -109
  40. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  41. apify/apify_storage_client/_request_queue_client.py +0 -176
  42. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  43. apify-2.7.3.dist-info/RECORD +0 -44
  44. /apify/{apify_storage_client → events}/py.typed +0 -0
  45. {apify-2.7.3.dist-info → apify-3.0.0rc1.dist-info}/WHEEL +0 -0
  46. {apify-2.7.3.dist-info → apify-3.0.0rc1.dist-info}/licenses/LICENSE +0 -0
apify/events/_types.py ADDED
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Annotated, Any, Literal
+
+from pydantic import BaseModel, Field
+
+from crawlee.events._types import (
+    Event,
+    EventAbortingData,
+    EventExitData,
+    EventMigratingData,
+    EventPersistStateData,
+    EventSystemInfoData,
+)
+
+from apify._utils import docs_group
+
+
+@docs_group('Event data')
+class SystemInfoEventData(BaseModel):
+    mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
+    mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
+    mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
+    cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
+    cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
+    cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
+    is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
+    created_at: Annotated[datetime, Field(alias='createdAt')]
+
+    def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
+        return EventSystemInfoData.model_validate(
+            {
+                'cpu_info': {
+                    'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
+                    'created_at': self.created_at,
+                },
+                'memory_info': {
+                    'total_size': self.mem_max_bytes,
+                    'current_size': self.mem_current_bytes,
+                    'created_at': self.created_at,
+                },
+            }
+        )
+
+
+@docs_group('Events')
+class PersistStateEvent(BaseModel):
+    name: Literal[Event.PERSIST_STATE]
+    data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
+
+
+@docs_group('Events')
+class SystemInfoEvent(BaseModel):
+    name: Literal[Event.SYSTEM_INFO]
+    data: SystemInfoEventData
+
+
+@docs_group('Events')
+class MigratingEvent(BaseModel):
+    name: Literal[Event.MIGRATING]
+    data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
+
+
+@docs_group('Events')
+class AbortingEvent(BaseModel):
+    name: Literal[Event.ABORTING]
+    data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
+
+
+@docs_group('Events')
+class ExitEvent(BaseModel):
+    name: Literal[Event.EXIT]
+    data: Annotated[EventExitData, Field(default_factory=EventExitData)]
+
+
+@docs_group('Events')
+class EventWithoutData(BaseModel):
+    name: Literal[
+        Event.SESSION_RETIRED,
+        Event.BROWSER_LAUNCHED,
+        Event.BROWSER_RETIRED,
+        Event.BROWSER_CLOSED,
+        Event.PAGE_CREATED,
+        Event.PAGE_CLOSED,
+    ]
+    data: Any = None
+
+
+@docs_group('Events')
+class DeprecatedEvent(BaseModel):
+    name: Literal['cpuInfo']
+    data: Annotated[dict[str, Any], Field(default_factory=dict)]
+
+
+@docs_group('Events')
+class UnknownEvent(BaseModel):
+    name: str
+    data: Annotated[dict[str, Any], Field(default_factory=dict)]
+
+
+EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData
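The `EventMessage` union at the end of the file enables tolerant parsing of raw platform messages. A minimal sketch of deserializing one such payload (importing from the private module added above; `apify/events/__init__.py`, also changed in this release, likely re-exports these names):

```python
from pydantic import TypeAdapter

from apify.events._types import Event, EventMessage, PersistStateEvent

event_adapter = TypeAdapter(EventMessage)

# A 'persistState' message without a payload falls back to the default data
# (EventPersistStateData(is_migrating=False)) via the Field default_factory.
event = event_adapter.validate_python({'name': Event.PERSIST_STATE})
assert isinstance(event, PersistStateEvent)
assert event.data.is_migrating is False
```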
apify/log.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import logging
 
-from apify_shared.utils import ignore_docs
 from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
 
 # Name of the logger used throughout the library (resolves to 'apify')
@@ -12,7 +11,6 @@ logger_name = __name__.split('.')[0]
 logger = logging.getLogger(logger_name)
 
 
-@ignore_docs
 class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from parent class)
     pass
 
@@ -29,13 +27,6 @@ def _configure_logging() -> None:
     else:
        apify_client_logger.setLevel(level)
 
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
     configure_logger(apify_logger, remove_old_handlers=True)
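Note that the SDK no longer quiets the `httpx` logger here (this release also swaps httpx for impit in the HTTP client, see the request-loader diff below). A one-line sketch, if your own code still uses httpx and relied on the old behavior:

```python
import logging

# Previously done by apify.log._configure_logging(); now left to the user.
logging.getLogger('httpx').setLevel(logging.WARNING)
```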
apify/request_loaders/__init__.py ADDED
@@ -0,0 +1,18 @@
+from crawlee.request_loaders import (
+    RequestList,
+    RequestLoader,
+    RequestManager,
+    RequestManagerTandem,
+    SitemapRequestLoader,
+)
+
+from ._apify_request_list import ApifyRequestList
+
+__all__ = [
+    'ApifyRequestList',
+    'RequestList',
+    'RequestLoader',
+    'RequestManager',
+    'RequestManagerTandem',
+    'SitemapRequestLoader',
+]
apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} RENAMED
@@ -3,16 +3,15 @@ from __future__ import annotations
 import asyncio
 import re
 from asyncio import Task
-from functools import partial
-from typing import Annotated, Any, Union
+from typing import Annotated, Any
 
 from pydantic import BaseModel, Field, TypeAdapter
 
-from crawlee import Request
 from crawlee._types import HttpMethod
-from crawlee.http_clients import HttpClient, HttpxHttpClient
-from crawlee.request_loaders import RequestList as CrawleeRequestList
+from crawlee.http_clients import HttpClient, ImpitHttpClient
+from crawlee.request_loaders import RequestList
 
+from apify import Request
 from apify._utils import docs_group
 
 URL_NO_COMMAS_REGEX = re.compile(
@@ -35,11 +34,11 @@ class _SimpleUrlInput(_RequestDetails):
     url: str
 
 
-url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput])
 
 
-@docs_group('Classes')
-class RequestList(CrawleeRequestList):
+@docs_group('Request loaders')
+class ApifyRequestList(RequestList):
     """Extends crawlee RequestList.
 
    Method open is used to create RequestList from actor's requestListSources input.
@@ -50,7 +49,7 @@ class RequestList(CrawleeRequestList):
         name: str | None = None,
         request_list_sources_input: list[dict[str, Any]] | None = None,
         http_client: HttpClient | None = None,
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         """Initialize a new instance from request list source input.
 
         Args:
@@ -74,24 +73,26 @@ class RequestList(CrawleeRequestList):
         ```
         """
         request_list_sources_input = request_list_sources_input or []
-        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+        return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
 
     @staticmethod
     async def _create_request_list(
         name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
-    ) -> RequestList:
+    ) -> ApifyRequestList:
         if not http_client:
-            http_client = HttpxHttpClient()
+            http_client = ImpitHttpClient()
 
         url_inputs = url_input_adapter.validate_python(request_list_sources_input)
 
         simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
         remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
 
-        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
-        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+        simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
+            remote_url_inputs, http_client=http_client
+        )
 
-        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+        return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
 
     @staticmethod
     def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
@@ -119,13 +120,15 @@ class RequestList(CrawleeRequestList):
         """
         created_requests: list[Request] = []
 
-        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+        async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Extract links from response body and use them to create `Request` objects.
 
            Use the regular expression to find all matching links in the response body, then create `Request`
            objects from these links and the provided input attributes.
            """
-            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            response = await (task.result()).read()
+            matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8'))
+
             created_requests.extend(
                 [
                     Request.from_url(
@@ -148,7 +151,11 @@ class RequestList(CrawleeRequestList):
                 )
             )
 
-            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            get_response_task.add_done_callback(
+                lambda task, inp=remote_url_requests_input: asyncio.create_task(  # type: ignore[misc]
+                    create_requests_from_response(inp, task)
+                )
+            )
             remote_url_requests.append(get_response_task)
 
         await asyncio.gather(*remote_url_requests)
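A hedged usage sketch of the renamed loader (the `'requestsFromUrl'` input key is assumed from the `_RequestsFromUrlInput` model, whose definition is not shown in this diff; `fetch_next_request` comes from the crawlee `RequestList` base class):

```python
import asyncio

from apify.request_loaders import ApifyRequestList


async def main() -> None:
    # Plain URLs become requests directly; remote sources are downloaded
    # and scanned for URLs with URL_NO_COMMAS_REGEX.
    request_list = await ApifyRequestList.open(
        request_list_sources_input=[
            {'url': 'https://example.com'},
            {'requestsFromUrl': 'https://example.com/urls.txt'},  # key assumed
        ],
    )
    request = await request_list.fetch_next_request()
    print(request.url if request else 'no requests')


asyncio.run(main())
```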
apify/request_loaders/py.typed ADDED (file without changes)
apify/scrapy/_logging_config.py CHANGED
@@ -10,7 +10,7 @@ from apify.log import ActorLogFormatter
 
 # Define logger names.
 _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
-_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
 _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
 
 
@@ -37,9 +37,6 @@ def initialize_logging() -> None:
     for logger_name in [None, *_ALL_LOGGERS]:
         _configure_logger(logger_name, logging_level, handler)
 
-    # Set the 'httpx' logger to a less verbose level.
-    logging.getLogger('httpx').setLevel('WARNING')
-
     # Monkey-patch Scrapy's logging configuration to re-apply our settings.
     original_configure_logging = scrapy_logging.configure_logging
 
apify/scrapy/extensions/_httpcache.py CHANGED
@@ -13,8 +13,8 @@ from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
 from apify.scrapy._async_thread import AsyncThread
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import KeyValueStore
 
 if TYPE_CHECKING:
@@ -51,10 +51,14 @@ class ApifyCacheStorage:
        kvs_name = get_kvs_name(spider.name)
 
        async def open_kvs() -> KeyValueStore:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await KeyValueStore.open(
+                    name=kvs_name,
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
            return await KeyValueStore.open(name=kvs_name)
 
        logger.debug("Starting background thread for cache storage's event loop")
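For projects wiring this up, the cache storage is enabled through standard Scrapy settings; a sketch (the public import path of `ApifyCacheStorage` is assumed from this module's location):

```python
# settings.py (sketch)
HTTPCACHE_ENABLED = True
HTTPCACHE_STORAGE = 'apify.scrapy.extensions.ApifyCacheStorage'  # re-export path assumed
```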
apify/scrapy/requests.py CHANGED
@@ -10,9 +10,10 @@ from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
-from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
 
+from apify import Request as ApifyRequest
+
 logger = getLogger(__name__)
 
 
@@ -121,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
 
     # Update the meta field with the meta field from the apify_request
     meta = scrapy_request.meta or {}
-    meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+    meta.update({'apify_request_unique_key': apify_request.unique_key})
     # scrapy_request.meta is a property, so we have to set it like this
     scrapy_request._meta = meta  # noqa: SLF001
 
@@ -133,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
         url=apify_request.url,
         method=apify_request.method,
         meta={
-            'apify_request_id': apify_request.id,
             'apify_request_unique_key': apify_request.unique_key,
         },
     )
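Spiders that previously read `apify_request_id` from `meta` should now key off the unique key alone; a minimal sketch:

```python
from scrapy import Spider
from scrapy.http import Response


class MySpider(Spider):
    name = 'my-spider'

    def parse(self, response: Response):
        # 'apify_request_id' is gone in 3.0; only the unique key is propagated.
        unique_key = response.meta.get('apify_request_unique_key')
        self.logger.info('handled request %s', unique_key)
```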
apify/scrapy/scheduler.py CHANGED
@@ -11,7 +11,7 @@ from scrapy.utils.reactor import is_asyncio_reactor_installed
 from ._async_thread import AsyncThread
 from .requests import to_apify_request, to_scrapy_request
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -49,10 +49,13 @@ class ApifyScheduler(BaseScheduler):
         self.spider = spider
 
         async def open_rq() -> RequestQueue:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await RequestQueue.open(storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await RequestQueue.open(
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
            return await RequestQueue.open()
 
 
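The scheduler itself is opted into via the standard Scrapy setting; a sketch (class path taken from this module's location):

```python
# settings.py (sketch)
SCHEDULER = 'apify.scrapy.scheduler.ApifyScheduler'
```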
apify/storage_clients/__init__.py ADDED
@@ -0,0 +1,10 @@
+from crawlee.storage_clients import MemoryStorageClient
+
+from ._apify import ApifyStorageClient
+from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+
+__all__ = [
+    'ApifyStorageClient',
+    'FileSystemStorageClient',
+    'MemoryStorageClient',
+]
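This new public entry point replaces the removed `ApifyStorageClient.from_config(...)` pattern: storages now receive an explicit client instance plus the configuration, as in the Scrapy diffs above. A sketch:

```python
from apify import Configuration
from apify.storage_clients import ApifyStorageClient
from apify.storages import KeyValueStore


async def open_store() -> KeyValueStore:
    configuration = Configuration.get_global_configuration()
    # On the Apify platform, back the store with the API-based client;
    # elsewhere fall back to the default (local) storage client.
    if configuration.is_at_home:
        return await KeyValueStore.open(
            name='my-store',
            configuration=configuration,
            storage_client=ApifyStorageClient(),
        )
    return await KeyValueStore.open(name='my-store')
```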
apify/storage_clients/_apify/__init__.py ADDED
@@ -0,0 +1,11 @@
+from ._dataset_client import ApifyDatasetClient
+from ._key_value_store_client import ApifyKeyValueStoreClient
+from ._request_queue_client import ApifyRequestQueueClient
+from ._storage_client import ApifyStorageClient
+
+__all__ = [
+    'ApifyDatasetClient',
+    'ApifyKeyValueStoreClient',
+    'ApifyRequestQueueClient',
+    'ApifyStorageClient',
+]
+ ]