apify 2.7.1b7__py3-none-any.whl → 2.7.1b9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of apify might be problematic.
Files changed (43)
  1. apify/_actor.py +6 -7
  2. apify/_configuration.py +33 -0
  3. apify/_proxy_configuration.py +8 -5
  4. apify/_utils.py +9 -1
  5. apify/events/__init__.py +5 -0
  6. apify/events/_apify_event_manager.py +140 -0
  7. apify/events/_types.py +102 -0
  8. apify/log.py +0 -7
  9. apify/request_loaders/__init__.py +18 -0
  10. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +22 -15
  11. apify/request_loaders/py.typed +0 -0
  12. apify/scrapy/_logging_config.py +1 -4
  13. apify/scrapy/extensions/_httpcache.py +9 -5
  14. apify/scrapy/requests.py +3 -3
  15. apify/scrapy/scheduler.py +8 -5
  16. apify/storage_clients/__init__.py +10 -0
  17. apify/storage_clients/_apify/__init__.py +11 -0
  18. apify/storage_clients/_apify/_dataset_client.py +304 -0
  19. apify/storage_clients/_apify/_key_value_store_client.py +241 -0
  20. apify/storage_clients/_apify/_models.py +107 -0
  21. apify/storage_clients/_apify/_request_queue_client.py +785 -0
  22. apify/storage_clients/_apify/_storage_client.py +80 -0
  23. apify/storage_clients/_apify/py.typed +0 -0
  24. apify/storage_clients/_file_system/__init__.py +2 -0
  25. apify/storage_clients/_file_system/_key_value_store_client.py +36 -0
  26. apify/storage_clients/_file_system/_storage_client.py +35 -0
  27. apify/storage_clients/py.typed +0 -0
  28. apify/storages/__init__.py +1 -3
  29. {apify-2.7.1b7.dist-info → apify-2.7.1b9.dist-info}/METADATA +7 -5
  30. apify-2.7.1b9.dist-info/RECORD +52 -0
  31. apify/_platform_event_manager.py +0 -215
  32. apify/apify_storage_client/__init__.py +0 -3
  33. apify/apify_storage_client/_apify_storage_client.py +0 -72
  34. apify/apify_storage_client/_dataset_client.py +0 -190
  35. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  36. apify/apify_storage_client/_key_value_store_client.py +0 -109
  37. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  38. apify/apify_storage_client/_request_queue_client.py +0 -176
  39. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  40. apify-2.7.1b7.dist-info/RECORD +0 -44
  41. /apify/{apify_storage_client → events}/py.typed +0 -0
  42. {apify-2.7.1b7.dist-info → apify-2.7.1b9.dist-info}/WHEEL +0 -0
  43. {apify-2.7.1b7.dist-info → apify-2.7.1b9.dist-info}/licenses/LICENSE +0 -0
apify/_actor.py CHANGED
@@ -13,7 +13,6 @@ from pydantic import AliasChoices
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
-from apify_shared.utils import maybe_extract_enum_member_value
 from crawlee import service_locator
 from crawlee.events import (
     Event,
@@ -30,11 +29,11 @@ from apify._configuration import Configuration
 from apify._consts import EVENT_LISTENERS_TIMEOUT
 from apify._crypto import decrypt_input_secrets, load_private_key
 from apify._models import ActorRun
-from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
 from apify._proxy_configuration import ProxyConfiguration
-from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
-from apify.apify_storage_client import ApifyStorageClient
+from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython, maybe_extract_enum_member_value
+from apify.events import ApifyEventManager, EventManager, LocalEventManager
 from apify.log import _configure_logging, logger
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import Dataset, KeyValueStore, RequestQueue
 
 if TYPE_CHECKING:
@@ -126,12 +125,12 @@ class _ActorType:
 
         # Create an instance of the cloud storage client, the local storage client is obtained
         # from the service locator.
-        self._cloud_storage_client = ApifyStorageClient.from_config(config=self._configuration)
+        self._cloud_storage_client = ApifyStorageClient()
 
         # Set the event manager based on whether the Actor is running on the platform or locally.
         self._event_manager = (
-            PlatformEventManager(
-                config=self._configuration,
+            ApifyEventManager(
+                configuration=self._configuration,
                 persist_state_interval=self._configuration.persist_state_interval,
             )
             if self.is_at_home()
apify/_configuration.py CHANGED
@@ -140,6 +140,39 @@ class Configuration(CrawleeConfiguration):
         ),
     ] = None
 
+    default_dataset_id: Annotated[
+        str,
+        Field(
+            validation_alias=AliasChoices(
+                'actor_default_dataset_id',
+                'apify_default_dataset_id',
+            ),
+            description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
+        ),
+    ] = 'default'
+
+    default_key_value_store_id: Annotated[
+        str,
+        Field(
+            validation_alias=AliasChoices(
+                'actor_default_key_value_store_id',
+                'apify_default_key_value_store_id',
+            ),
+            description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
+        ),
+    ] = 'default'
+
+    default_request_queue_id: Annotated[
+        str,
+        Field(
+            validation_alias=AliasChoices(
+                'actor_default_request_queue_id',
+                'apify_default_request_queue_id',
+            ),
+            description='Default request queue ID for the Apify storage client when no ID or name is provided.',
+        ),
+    ] = 'default'
+
     disable_outdated_warning: Annotated[
         bool,
         Field(
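Note: the three new default_*_id fields are resolved by pydantic-settings through the lowercase aliases shown above. A minimal sketch of the intended effect, assuming the matching uppercase environment variables (ACTOR_DEFAULT_DATASET_ID and friends, which the platform normally injects; the values here are made up):

import os

from apify import Configuration

# Hypothetical values; on the Apify platform these variables are injected automatically.
os.environ['ACTOR_DEFAULT_DATASET_ID'] = 'my-dataset-id'
os.environ['ACTOR_DEFAULT_KEY_VALUE_STORE_ID'] = 'my-kvs-id'

config = Configuration()
print(config.default_dataset_id)          # 'my-dataset-id'
print(config.default_key_value_store_id)  # 'my-kvs-id'
print(config.default_request_queue_id)    # not set, falls back to 'default'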
apify/_proxy_configuration.py CHANGED
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 import ipaddress
+import json
 import re
 from dataclasses import dataclass, field
 from re import Pattern
 from typing import TYPE_CHECKING, Any
 from urllib.parse import urljoin, urlparse
 
-import httpx
+import impit
+from yarl import URL
 
 from apify_shared.consts import ApifyEnvVars
 from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration
@@ -20,7 +22,8 @@ from apify.log import logger
 
 if TYPE_CHECKING:
     from apify_client import ApifyClientAsync
-    from crawlee import Request
+
+    from apify import Request
 
 APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$')
 COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$')
@@ -230,7 +233,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
             return None
 
         if self._uses_apify_proxy:
-            parsed_url = httpx.URL(proxy_info.url)
+            parsed_url = URL(proxy_info.url)
             username = self._get_username(session_id)
 
             return ProxyInfo(
@@ -274,11 +277,11 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
             return
 
         status = None
-        async with httpx.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
+        async with impit.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
             for _ in range(2):
                 try:
                     response = await client.get(proxy_status_url)
-                    status = response.json()
+                    status = json.loads(response.text)
                     break
                 except Exception:  # noqa: S110
                     # retry on connection errors
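Note: the proxy module drops httpx entirely; URL parsing moves to yarl.URL and the status check goes through impit with json.loads. A standalone sketch of the same pattern, with placeholder URLs:

import asyncio
import json

import impit
from yarl import URL


async def check_apify_proxy(proxy_url: str, status_url: str) -> None:
    parsed = URL(proxy_url)  # yarl replaces httpx.URL for splitting host/port/credentials
    print(parsed.host, parsed.port, parsed.user)

    async with impit.AsyncClient(proxy=proxy_url, timeout=10) as client:
        response = await client.get(status_url)
        status = json.loads(response.text)  # JSON is parsed from the text body, as in the SDK code above
        print(status)


# Placeholder URLs for illustration only.
# asyncio.run(check_apify_proxy('http://user:password@proxy.example.com:8000', 'http://proxy.example.com/?format=json'))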
apify/_utils.py CHANGED
@@ -2,8 +2,9 @@ from __future__ import annotations
 
 import builtins
 import sys
+from enum import Enum
 from importlib import metadata
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -81,3 +82,10 @@ def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
         return func
 
     return wrapper
+
+
+def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
+    """Extract the value of an enumeration member if it is an Enum, otherwise return the original value."""
+    if isinstance(maybe_enum_member, Enum):
+        return maybe_enum_member.value
+    return maybe_enum_member
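Note: this helper replaces the one previously imported from apify_shared.utils. Usage is straightforward:

from enum import Enum

from apify._utils import maybe_extract_enum_member_value


class Color(Enum):
    RED = 'red'


assert maybe_extract_enum_member_value(Color.RED) == 'red'  # Enum member -> its value
assert maybe_extract_enum_member_value('red') == 'red'      # anything else passes through unchanged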
apify/events/__init__.py ADDED
@@ -0,0 +1,5 @@
+from crawlee.events import EventManager, LocalEventManager
+
+from ._apify_event_manager import ApifyEventManager
+
+__all__ = ['ApifyEventManager', 'EventManager', 'LocalEventManager']
apify/events/_apify_event_manager.py ADDED
@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+import asyncio
+import contextlib
+from typing import TYPE_CHECKING, Annotated
+
+import websockets.asyncio.client
+from pydantic import Discriminator, TypeAdapter
+from typing_extensions import Self, Unpack, override
+
+from crawlee.events import EventManager
+from crawlee.events._types import Event, EventPersistStateData
+
+from apify._utils import docs_group
+from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
+from apify.log import logger
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from crawlee.events._event_manager import EventManagerOptions
+
+    from apify._configuration import Configuration
+
+
+event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
+    Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
+)
+
+
+@docs_group('Event managers')
+class ApifyEventManager(EventManager):
+    """Event manager for the Apify platform.
+
+    This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
+    connectivity to the Apify platform for receiving platform events.
+
+    The event manager handles:
+    - Registration and emission of events and their listeners.
+    - Websocket connection to Apify platform events.
+    - Processing and validation of platform messages.
+    - Automatic event forwarding from the platform to local event listeners.
+
+    This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
+    with the event system.
+    """
+
+    def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
+        """Initialize a new instance.
+
+        Args:
+            configuration: The Actor configuration for the event manager.
+            **kwargs: Additional event manager options passed to the parent class.
+        """
+        super().__init__(**kwargs)
+
+        self._configuration = configuration
+        """The Actor configuration for the event manager."""
+
+        self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
+        """WebSocket connection to the platform events."""
+
+        self._process_platform_messages_task: asyncio.Task | None = None
+        """Task for processing messages from the platform websocket."""
+
+        self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
+        """Future that resolves when the connection to the platform websocket is established."""
+
+    @override
+    async def __aenter__(self) -> Self:
+        await super().__aenter__()
+        self._connected_to_platform_websocket = asyncio.Future()
+
+        # Run tasks but don't await them
+        if self._configuration.actor_events_ws_url:
+            self._process_platform_messages_task = asyncio.create_task(
+                self._process_platform_messages(self._configuration.actor_events_ws_url)
+            )
+            is_connected = await self._connected_to_platform_websocket
+            if not is_connected:
+                raise RuntimeError('Error connecting to platform events websocket!')
+        else:
+            logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
+
+        return self
+
+    @override
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._platform_events_websocket:
+            await self._platform_events_websocket.close()
+
+        if self._process_platform_messages_task and not self._process_platform_messages_task.done():
+            self._process_platform_messages_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await self._process_platform_messages_task
+
+        await super().__aexit__(exc_type, exc_value, exc_traceback)
+
+    async def _process_platform_messages(self, ws_url: str) -> None:
+        try:
+            async with websockets.asyncio.client.connect(ws_url) as websocket:
+                self._platform_events_websocket = websocket
+                if self._connected_to_platform_websocket is not None:
+                    self._connected_to_platform_websocket.set_result(True)
+
+                async for message in websocket:
+                    try:
+                        parsed_message = event_data_adapter.validate_json(message)
+
+                        if isinstance(parsed_message, DeprecatedEvent):
+                            continue
+
+                        if isinstance(parsed_message, UnknownEvent):
+                            logger.info(
+                                f'Unknown message received: event_name={parsed_message.name}, '
+                                f'event_data={parsed_message.data}'
+                            )
+                            continue
+
+                        self.emit(
+                            event=parsed_message.name,
+                            event_data=parsed_message.data
+                            if not isinstance(parsed_message.data, SystemInfoEventData)
+                            else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
+                        )
+
+                        if parsed_message.name == Event.MIGRATING:
+                            await self._emit_persist_state_event_rec_task.stop()
+                            self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
+                    except Exception:
+                        logger.exception('Cannot parse Actor event', extra={'message': message})
+        except Exception:
+            logger.exception('Error in websocket connection')
+            if self._connected_to_platform_websocket is not None:
+                self._connected_to_platform_websocket.set_result(False)
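Note: per the docstring, ApifyEventManager is wired up by the Actor itself and is not meant to be used directly; the forwarded platform events surface through Actor.on. A hedged sketch of consuming them (the handler and stored key are illustrative):

from apify import Actor
from crawlee.events._types import Event, EventPersistStateData


async def main() -> None:
    async with Actor:
        async def save_state(event_data: EventPersistStateData) -> None:
            # Persist whatever the Actor needs to survive a migration.
            await Actor.set_value('STATE', {'migrating': event_data.is_migrating})

        Actor.on(Event.PERSIST_STATE, save_state)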
apify/events/_types.py ADDED
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Annotated, Any, Literal
+
+from pydantic import BaseModel, Field
+
+from crawlee.events._types import (
+    Event,
+    EventAbortingData,
+    EventExitData,
+    EventMigratingData,
+    EventPersistStateData,
+    EventSystemInfoData,
+)
+
+from apify._utils import docs_group
+
+
+@docs_group('Event data')
+class SystemInfoEventData(BaseModel):
+    mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
+    mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
+    mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
+    cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
+    cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
+    cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
+    is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
+    created_at: Annotated[datetime, Field(alias='createdAt')]
+
+    def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
+        return EventSystemInfoData.model_validate(
+            {
+                'cpu_info': {
+                    'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
+                    'created_at': self.created_at,
+                },
+                'memory_info': {
+                    'total_size': self.mem_max_bytes,
+                    'current_size': self.mem_current_bytes,
+                    'created_at': self.created_at,
+                },
+            }
+        )
+
+
+@docs_group('Events')
+class PersistStateEvent(BaseModel):
+    name: Literal[Event.PERSIST_STATE]
+    data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
+
+
+@docs_group('Events')
+class SystemInfoEvent(BaseModel):
+    name: Literal[Event.SYSTEM_INFO]
+    data: SystemInfoEventData
+
+
+@docs_group('Events')
+class MigratingEvent(BaseModel):
+    name: Literal[Event.MIGRATING]
+    data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
+
+
+@docs_group('Events')
+class AbortingEvent(BaseModel):
+    name: Literal[Event.ABORTING]
+    data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
+
+
+@docs_group('Events')
+class ExitEvent(BaseModel):
+    name: Literal[Event.EXIT]
+    data: Annotated[EventExitData, Field(default_factory=EventExitData)]
+
+
+@docs_group('Events')
+class EventWithoutData(BaseModel):
+    name: Literal[
+        Event.SESSION_RETIRED,
+        Event.BROWSER_LAUNCHED,
+        Event.BROWSER_RETIRED,
+        Event.BROWSER_CLOSED,
+        Event.PAGE_CREATED,
+        Event.PAGE_CLOSED,
+    ]
+    data: Any = None
+
+
+@docs_group('Events')
+class DeprecatedEvent(BaseModel):
+    name: Literal['cpuInfo']
+    data: Annotated[dict[str, Any], Field(default_factory=dict)]
+
+
+@docs_group('Events')
+class UnknownEvent(BaseModel):
+    name: str
+    data: Annotated[dict[str, Any], Field(default_factory=dict)]
+
+
+EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData
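Note: a sketch of how these models fit together: a raw platform message (camelCase keys per the aliases above, values made up) is validated through the discriminated union defined in _apify_event_manager and converted to Crawlee's format:

from apify.events._apify_event_manager import event_data_adapter
from apify.events._types import SystemInfoEvent

raw = b'''{
    "name": "systemInfo",
    "data": {
        "memAvgBytes": 1048576, "memCurrentBytes": 2097152, "memMaxBytes": 4194304,
        "cpuAvgUsage": 15.0, "cpuMaxUsage": 60.0, "cpuCurrentUsage": 42.0,
        "isCpuOverloaded": false, "createdAt": "2024-01-01T00:00:00Z"
    }
}'''

event = event_data_adapter.validate_json(raw)
assert isinstance(event, SystemInfoEvent)
# Convert to the crawlee EventSystemInfoData shape, as ApifyEventManager does before emitting.
print(event.data.to_crawlee_format(dedicated_cpus=1).cpu_info.used_ratio)  # 0.42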
apify/log.py CHANGED
@@ -27,13 +27,6 @@ def _configure_logging() -> None:
     else:
        apify_client_logger.setLevel(level)
 
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
     configure_logger(apify_logger, remove_old_handlers=True)
apify/request_loaders/__init__.py ADDED
@@ -0,0 +1,18 @@
+from crawlee.request_loaders import (
+    RequestList,
+    RequestLoader,
+    RequestManager,
+    RequestManagerTandem,
+    SitemapRequestLoader,
+)
+
+from ._apify_request_list import ApifyRequestList
+
+__all__ = [
+    'ApifyRequestList',
+    'RequestList',
+    'RequestLoader',
+    'RequestManager',
+    'RequestManagerTandem',
+    'SitemapRequestLoader',
+]
apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} RENAMED
@@ -3,16 +3,15 @@ from __future__ import annotations
 import asyncio
 import re
 from asyncio import Task
-from functools import partial
 from typing import Annotated, Any
 
 from pydantic import BaseModel, Field, TypeAdapter
 
-from crawlee import Request
 from crawlee._types import HttpMethod
-from crawlee.http_clients import HttpClient, HttpxHttpClient
-from crawlee.request_loaders import RequestList as CrawleeRequestList
+from crawlee.http_clients import HttpClient, ImpitHttpClient
+from crawlee.request_loaders import RequestList
 
+from apify import Request
 from apify._utils import docs_group
 
 URL_NO_COMMAS_REGEX = re.compile(
@@ -39,7 +38,7 @@ url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput])
 
 
 @docs_group('Request loaders')
-class RequestList(CrawleeRequestList):
+class ApifyRequestList(RequestList):
     """Extends crawlee RequestList.
 
     Method open is used to create RequestList from actor's requestListSources input.
@@ -50,7 +49,7 @@ class RequestList(CrawleeRequestList):
         name: str | None = None,
         request_list_sources_input: list[dict[str, Any]] | None = None,
        http_client: HttpClient | None = None,
-    ) -> RequestList:
+    ) -> ApifyRequestList:
        """Initialize a new instance from request list source input.
 
        Args:
@@ -74,24 +73,26 @@ class RequestList(CrawleeRequestList):
            ```
        """
        request_list_sources_input = request_list_sources_input or []
-        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+        return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
 
     @staticmethod
     async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
-    ) -> RequestList:
+    ) -> ApifyRequestList:
        if not http_client:
-            http_client = HttpxHttpClient()
+            http_client = ImpitHttpClient()
 
        url_inputs = url_input_adapter.validate_python(request_list_sources_input)
 
        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
 
-        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
-        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+        simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
+            remote_url_inputs, http_client=http_client
+        )
 
-        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+        return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
 
     @staticmethod
     def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
@@ -119,13 +120,15 @@ class RequestList(CrawleeRequestList):
        """
        created_requests: list[Request] = []
 
-        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+        async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Extract links from response body and use them to create `Request` objects.
 
            Use the regular expression to find all matching links in the response body, then create `Request`
            objects from these links and the provided input attributes.
            """
-            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            response = await (task.result()).read()
+            matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8'))
+
            created_requests.extend(
                [
                    Request.from_url(
@@ -148,7 +151,11 @@ class RequestList(CrawleeRequestList):
                )
            )
 
-            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            get_response_task.add_done_callback(
+                lambda task, inp=remote_url_requests_input: asyncio.create_task(  # type: ignore[misc]
+                    create_requests_from_response(inp, task)
+                )
+            )
            remote_url_requests.append(get_response_task)
 
        await asyncio.gather(*remote_url_requests)
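Note: a usage sketch for the renamed loader, mirroring an Actor's requestListSources input (URLs are placeholders; the two input shapes correspond to the _SimpleUrlInput and _RequestsFromUrlInput models referenced above):

from apify.request_loaders import ApifyRequestList


async def main() -> None:
    request_list = await ApifyRequestList.open(
        request_list_sources_input=[
            {'url': 'https://example.com/'},  # simple URL input
            {'requestsFromUrl': 'https://example.com/urls.txt'},  # remote list, fetched via ImpitHttpClient
        ],
    )
    while request := await request_list.fetch_next_request():
        print(request.url)
        await request_list.mark_request_as_handled(request)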
apify/request_loaders/py.typed (file without changes)
apify/scrapy/_logging_config.py CHANGED
@@ -10,7 +10,7 @@ from apify.log import ActorLogFormatter
 
 # Define logger names.
 _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
-_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
 _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
 
 
@@ -37,9 +37,6 @@ def initialize_logging() -> None:
     for logger_name in [None, *_ALL_LOGGERS]:
        _configure_logger(logger_name, logging_level, handler)
 
-    # Set the 'httpx' logger to a less verbose level.
-    logging.getLogger('httpx').setLevel('WARNING')
-
     # Monkey-patch Scrapy's logging configuration to re-apply our settings.
     original_configure_logging = scrapy_logging.configure_logging
 
apify/scrapy/extensions/_httpcache.py CHANGED
@@ -13,8 +13,8 @@ from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
 from apify.scrapy._async_thread import AsyncThread
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import KeyValueStore
 
 if TYPE_CHECKING:
@@ -51,10 +51,14 @@ class ApifyCacheStorage:
        kvs_name = get_kvs_name(spider.name)
 
        async def open_kvs() -> KeyValueStore:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await KeyValueStore.open(
+                    name=kvs_name,
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
            return await KeyValueStore.open(name=kvs_name)
 
        logger.debug("Starting background thread for cache storage's event loop")
apify/scrapy/requests.py CHANGED
@@ -10,9 +10,10 @@ from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
-from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
 
+from apify import Request as ApifyRequest
+
 logger = getLogger(__name__)
 
 
@@ -121,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
 
        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
-        meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key})
+        meta.update({'apify_request_unique_key': apify_request.unique_key})
        # scrapy_request.meta is a property, so we have to set it like this
        scrapy_request._meta = meta  # noqa: SLF001
 
@@ -133,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
            url=apify_request.url,
            method=apify_request.method,
            meta={
-                'apify_request_id': apify_request.id,
                'apify_request_unique_key': apify_request.unique_key,
            },
        )
apify/scrapy/scheduler.py CHANGED
@@ -11,7 +11,7 @@ from scrapy.utils.reactor import is_asyncio_reactor_installed
 from ._async_thread import AsyncThread
 from .requests import to_apify_request, to_scrapy_request
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -49,10 +49,13 @@ class ApifyScheduler(BaseScheduler):
        self.spider = spider
 
        async def open_rq() -> RequestQueue:
-            config = Configuration.get_global_configuration()
-            if config.is_at_home:
-                storage_client = ApifyStorageClient.from_config(config)
-                return await RequestQueue.open(storage_client=storage_client)
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await RequestQueue.open(
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
            return await RequestQueue.open()
 
        try:
apify/storage_clients/__init__.py ADDED
@@ -0,0 +1,10 @@
+from crawlee.storage_clients import MemoryStorageClient
+
+from ._apify import ApifyStorageClient
+from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+
+__all__ = [
+    'ApifyStorageClient',
+    'FileSystemStorageClient',
+    'MemoryStorageClient',
+]
apify/storage_clients/_apify/__init__.py ADDED
@@ -0,0 +1,11 @@
+from ._dataset_client import ApifyDatasetClient
+from ._key_value_store_client import ApifyKeyValueStoreClient
+from ._request_queue_client import ApifyRequestQueueClient
+from ._storage_client import ApifyStorageClient
+
+__all__ = [
+    'ApifyDatasetClient',
+    'ApifyKeyValueStoreClient',
+    'ApifyRequestQueueClient',
+    'ApifyStorageClient',
+]
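Note: the new storage_clients package replaces apify.apify_storage_client; the client is now constructed without arguments and passed to the storages together with the configuration, as in the scheduler and HTTP cache changes above. A hedged sketch of opening storages against the Apify API explicitly (credentials are assumed to come from the usual APIFY_* environment variables):

from apify import Configuration
from apify.storage_clients import ApifyStorageClient
from apify.storages import Dataset, KeyValueStore


async def main() -> None:
    configuration = Configuration.get_global_configuration()
    storage_client = ApifyStorageClient()  # replaces ApifyStorageClient.from_config(config)

    dataset = await Dataset.open(configuration=configuration, storage_client=storage_client)
    await dataset.push_data({'hello': 'world'})

    kvs = await KeyValueStore.open(configuration=configuration, storage_client=storage_client)
    await kvs.set_value('OUTPUT', {'ok': True})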