apify 2.7.1b6__py3-none-any.whl → 2.7.1b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apify/_actor.py +6 -7
- apify/_configuration.py +42 -0
- apify/_proxy_configuration.py +8 -5
- apify/_utils.py +9 -1
- apify/events/__init__.py +5 -0
- apify/events/_apify_event_manager.py +140 -0
- apify/events/_types.py +102 -0
- apify/log.py +0 -7
- apify/request_loaders/__init__.py +18 -0
- apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +22 -15
- apify/request_loaders/py.typed +0 -0
- apify/scrapy/_logging_config.py +1 -4
- apify/scrapy/extensions/_httpcache.py +9 -5
- apify/scrapy/requests.py +3 -3
- apify/scrapy/scheduler.py +8 -5
- apify/storage_clients/__init__.py +10 -0
- apify/storage_clients/_apify/__init__.py +11 -0
- apify/storage_clients/_apify/_dataset_client.py +304 -0
- apify/storage_clients/_apify/_key_value_store_client.py +241 -0
- apify/storage_clients/_apify/_models.py +107 -0
- apify/storage_clients/_apify/_request_queue_client.py +787 -0
- apify/storage_clients/_apify/_storage_client.py +80 -0
- apify/storage_clients/_apify/py.typed +0 -0
- apify/storage_clients/_file_system/__init__.py +2 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +36 -0
- apify/storage_clients/_file_system/_storage_client.py +35 -0
- apify/storage_clients/py.typed +0 -0
- apify/storages/__init__.py +1 -3
- {apify-2.7.1b6.dist-info → apify-2.7.1b8.dist-info}/METADATA +7 -5
- apify-2.7.1b8.dist-info/RECORD +52 -0
- apify/_platform_event_manager.py +0 -215
- apify/apify_storage_client/__init__.py +0 -3
- apify/apify_storage_client/_apify_storage_client.py +0 -72
- apify/apify_storage_client/_dataset_client.py +0 -190
- apify/apify_storage_client/_dataset_collection_client.py +0 -51
- apify/apify_storage_client/_key_value_store_client.py +0 -109
- apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
- apify/apify_storage_client/_request_queue_client.py +0 -176
- apify/apify_storage_client/_request_queue_collection_client.py +0 -51
- apify-2.7.1b6.dist-info/RECORD +0 -44
- /apify/{apify_storage_client → events}/py.typed +0 -0
- {apify-2.7.1b6.dist-info → apify-2.7.1b8.dist-info}/WHEEL +0 -0
- {apify-2.7.1b6.dist-info → apify-2.7.1b8.dist-info}/licenses/LICENSE +0 -0
apify/_actor.py
CHANGED

@@ -13,7 +13,6 @@ from pydantic import AliasChoices
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
-from apify_shared.utils import maybe_extract_enum_member_value
 from crawlee import service_locator
 from crawlee.events import (
     Event,

@@ -30,11 +29,11 @@ from apify._configuration import Configuration
 from apify._consts import EVENT_LISTENERS_TIMEOUT
 from apify._crypto import decrypt_input_secrets, load_private_key
 from apify._models import ActorRun
-from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
 from apify._proxy_configuration import ProxyConfiguration
-from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
-from apify.
+from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython, maybe_extract_enum_member_value
+from apify.events import ApifyEventManager, EventManager, LocalEventManager
 from apify.log import _configure_logging, logger
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import Dataset, KeyValueStore, RequestQueue
 
 if TYPE_CHECKING:

@@ -126,12 +125,12 @@ class _ActorType:
 
         # Create an instance of the cloud storage client, the local storage client is obtained
         # from the service locator.
-        self._cloud_storage_client = ApifyStorageClient
+        self._cloud_storage_client = ApifyStorageClient()
 
         # Set the event manager based on whether the Actor is running on the platform or locally.
         self._event_manager = (
-
-
+            ApifyEventManager(
+                configuration=self._configuration,
                 persist_state_interval=self._configuration.persist_state_interval,
             )
             if self.is_at_home()
apify/_configuration.py
CHANGED

@@ -140,6 +140,39 @@ class Configuration(CrawleeConfiguration):
         ),
     ] = None
 
+    default_dataset_id: Annotated[
+        str,
+        Field(
+            validation_alias=AliasChoices(
+                'actor_default_dataset_id',
+                'apify_default_dataset_id',
+            ),
+            description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
+        ),
+    ] = 'default'
+
+    default_key_value_store_id: Annotated[
+        str,
+        Field(
+            validation_alias=AliasChoices(
+                'actor_default_key_value_store_id',
+                'apify_default_key_value_store_id',
+            ),
+            description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
+        ),
+    ] = 'default'
+
+    default_request_queue_id: Annotated[
+        str,
+        Field(
+            validation_alias=AliasChoices(
+                'actor_default_request_queue_id',
+                'apify_default_request_queue_id',
+            ),
+            description='Default request queue ID for the Apify storage client when no ID or name is provided.',
+        ),
+    ] = 'default'
+
     disable_outdated_warning: Annotated[
         bool,
         Field(

@@ -334,6 +367,15 @@ class Configuration(CrawleeConfiguration):
         ),
     ] = None
 
+    user_is_paying: Annotated[
+        bool,
+        Field(
+            alias='apify_user_is_paying',
+            description='True if the user calling the Actor is paying user',
+        ),
+        BeforeValidator(lambda val: False if val == '' else val),
+    ] = False
+
     web_server_port: Annotated[
         int,
         Field(
apify/_proxy_configuration.py
CHANGED

@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 import ipaddress
+import json
 import re
 from dataclasses import dataclass, field
 from re import Pattern
 from typing import TYPE_CHECKING, Any
 from urllib.parse import urljoin, urlparse
 
-import
+import impit
+from yarl import URL
 
 from apify_shared.consts import ApifyEnvVars
 from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration

@@ -20,7 +22,8 @@ from apify.log import logger
 
 if TYPE_CHECKING:
     from apify_client import ApifyClientAsync
-
+
+    from apify import Request
 
 APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$')
 COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$')

@@ -230,7 +233,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
             return None
 
         if self._uses_apify_proxy:
-            parsed_url =
+            parsed_url = URL(proxy_info.url)
             username = self._get_username(session_id)
 
             return ProxyInfo(

@@ -274,11 +277,11 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
            return
 
        status = None
-       async with
+       async with impit.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
            for _ in range(2):
                try:
                    response = await client.get(proxy_status_url)
-                   status = response.
+                   status = json.loads(response.text)
                    break
                except Exception:  # noqa: S110
                    # retry on connection errors
apify/_utils.py
CHANGED

@@ -2,8 +2,9 @@ from __future__ import annotations
 
 import builtins
 import sys
+from enum import Enum
 from importlib import metadata
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 if TYPE_CHECKING:
     from collections.abc import Callable

@@ -81,3 +82,10 @@ def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
         return func
 
     return wrapper
+
+
+def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
+    """Extract the value of an enumeration member if it is an Enum, otherwise return the original value."""
+    if isinstance(maybe_enum_member, Enum):
+        return maybe_enum_member.value
+    return maybe_enum_member
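
A minimal usage sketch of the relocated helper (the Color enum below is made up for illustration; only maybe_extract_enum_member_value comes from the package):

from enum import Enum

from apify._utils import maybe_extract_enum_member_value


class Color(Enum):
    RED = 'red'


assert maybe_extract_enum_member_value(Color.RED) == 'red'  # Enum member -> its value
assert maybe_extract_enum_member_value('red') == 'red'      # non-Enum values pass through unchanged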
apify/events/_apify_event_manager.py
ADDED

@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+import asyncio
+import contextlib
+from typing import TYPE_CHECKING, Annotated
+
+import websockets.asyncio.client
+from pydantic import Discriminator, TypeAdapter
+from typing_extensions import Self, Unpack, override
+
+from crawlee.events import EventManager
+from crawlee.events._types import Event, EventPersistStateData
+
+from apify._utils import docs_group
+from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
+from apify.log import logger
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from crawlee.events._event_manager import EventManagerOptions
+
+    from apify._configuration import Configuration
+
+
+event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
+    Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
+)
+
+
+@docs_group('Event managers')
+class ApifyEventManager(EventManager):
+    """Event manager for the Apify platform.
+
+    This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
+    connectivity to the Apify platform for receiving platform events.
+
+    The event manager handles:
+    - Registration and emission of events and their listeners.
+    - Websocket connection to Apify platform events.
+    - Processing and validation of platform messages.
+    - Automatic event forwarding from the platform to local event listeners.
+
+    This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
+    with the event system.
+    """
+
+    def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
+        """Initialize a new instance.
+
+        Args:
+            configuration: The Actor configuration for the event manager.
+            **kwargs: Additional event manager options passed to the parent class.
+        """
+        super().__init__(**kwargs)
+
+        self._configuration = configuration
+        """The Actor configuration for the event manager."""
+
+        self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
+        """WebSocket connection to the platform events."""
+
+        self._process_platform_messages_task: asyncio.Task | None = None
+        """Task for processing messages from the platform websocket."""
+
+        self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
+        """Future that resolves when the connection to the platform websocket is established."""
+
+    @override
+    async def __aenter__(self) -> Self:
+        await super().__aenter__()
+        self._connected_to_platform_websocket = asyncio.Future()
+
+        # Run tasks but don't await them
+        if self._configuration.actor_events_ws_url:
+            self._process_platform_messages_task = asyncio.create_task(
+                self._process_platform_messages(self._configuration.actor_events_ws_url)
+            )
+            is_connected = await self._connected_to_platform_websocket
+            if not is_connected:
+                raise RuntimeError('Error connecting to platform events websocket!')
+        else:
+            logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
+
+        return self
+
+    @override
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._platform_events_websocket:
+            await self._platform_events_websocket.close()
+
+        if self._process_platform_messages_task and not self._process_platform_messages_task.done():
+            self._process_platform_messages_task.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await self._process_platform_messages_task
+
+        await super().__aexit__(exc_type, exc_value, exc_traceback)
+
+    async def _process_platform_messages(self, ws_url: str) -> None:
+        try:
+            async with websockets.asyncio.client.connect(ws_url) as websocket:
+                self._platform_events_websocket = websocket
+                if self._connected_to_platform_websocket is not None:
+                    self._connected_to_platform_websocket.set_result(True)
+
+                async for message in websocket:
+                    try:
+                        parsed_message = event_data_adapter.validate_json(message)
+
+                        if isinstance(parsed_message, DeprecatedEvent):
+                            continue
+
+                        if isinstance(parsed_message, UnknownEvent):
+                            logger.info(
+                                f'Unknown message received: event_name={parsed_message.name}, '
+                                f'event_data={parsed_message.data}'
+                            )
+                            continue
+
+                        self.emit(
+                            event=parsed_message.name,
+                            event_data=parsed_message.data
+                            if not isinstance(parsed_message.data, SystemInfoEventData)
+                            else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
+                        )
+
+                        if parsed_message.name == Event.MIGRATING:
+                            await self._emit_persist_state_event_rec_task.stop()
+                            self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
+                    except Exception:
+                        logger.exception('Cannot parse Actor event', extra={'message': message})
+        except Exception:
+            logger.exception('Error in websocket connection')
+            if self._connected_to_platform_websocket is not None:
+                self._connected_to_platform_websocket.set_result(False)
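
As the docstring notes, application code normally listens to these events through the Actor API rather than instantiating ApifyEventManager directly. A hedged sketch (the handler body is illustrative):

from apify import Actor
from crawlee.events import Event


async def main() -> None:
    async with Actor:
        def on_migrating(event_data: object) -> None:
            Actor.log.info('Migration imminent, persisting state early.')

        # Events received over the platform websocket are re-emitted locally,
        # so this listener fires when the platform sends a MIGRATING event.
        Actor.on(Event.MIGRATING, on_migrating)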
apify/events/_types.py
ADDED

@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Annotated, Any, Literal
+
+from pydantic import BaseModel, Field
+
+from crawlee.events._types import (
+    Event,
+    EventAbortingData,
+    EventExitData,
+    EventMigratingData,
+    EventPersistStateData,
+    EventSystemInfoData,
+)
+
+from apify._utils import docs_group
+
+
+@docs_group('Event data')
+class SystemInfoEventData(BaseModel):
+    mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
+    mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
+    mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
+    cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
+    cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
+    cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
+    is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
+    created_at: Annotated[datetime, Field(alias='createdAt')]
+
+    def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
+        return EventSystemInfoData.model_validate(
+            {
+                'cpu_info': {
+                    'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
+                    'created_at': self.created_at,
+                },
+                'memory_info': {
+                    'total_size': self.mem_max_bytes,
+                    'current_size': self.mem_current_bytes,
+                    'created_at': self.created_at,
+                },
+            }
+        )
+
+
+@docs_group('Events')
+class PersistStateEvent(BaseModel):
+    name: Literal[Event.PERSIST_STATE]
+    data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
+
+
+@docs_group('Events')
+class SystemInfoEvent(BaseModel):
+    name: Literal[Event.SYSTEM_INFO]
+    data: SystemInfoEventData
+
+
+@docs_group('Events')
+class MigratingEvent(BaseModel):
+    name: Literal[Event.MIGRATING]
+    data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
+
+
+@docs_group('Events')
+class AbortingEvent(BaseModel):
+    name: Literal[Event.ABORTING]
+    data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
+
+
+@docs_group('Events')
+class ExitEvent(BaseModel):
+    name: Literal[Event.EXIT]
+    data: Annotated[EventExitData, Field(default_factory=EventExitData)]
+
+
+@docs_group('Events')
+class EventWithoutData(BaseModel):
+    name: Literal[
+        Event.SESSION_RETIRED,
+        Event.BROWSER_LAUNCHED,
+        Event.BROWSER_RETIRED,
+        Event.BROWSER_CLOSED,
+        Event.PAGE_CREATED,
+        Event.PAGE_CLOSED,
+    ]
+    data: Any = None
+
+
+@docs_group('Events')
+class DeprecatedEvent(BaseModel):
+    name: Literal['cpuInfo']
+    data: Annotated[dict[str, Any], Field(default_factory=dict)]
+
+
+@docs_group('Events')
+class UnknownEvent(BaseModel):
+    name: str
+    data: Annotated[dict[str, Any], Field(default_factory=dict)]
+
+
+EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData
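
A small sketch of how a platform-style system-info payload (camelCase keys matching the aliases above) converts to Crawlee's format; the numbers are made up:

from datetime import datetime, timezone

from apify.events._types import SystemInfoEventData

payload = {
    'memAvgBytes': 200_000_000,
    'memCurrentBytes': 250_000_000,
    'memMaxBytes': 512_000_000,
    'cpuAvgUsage': 35.0,
    'cpuMaxUsage': 80.0,
    'cpuCurrentUsage': 50.0,
    'isCpuOverloaded': False,
    'createdAt': datetime.now(timezone.utc).isoformat(),
}

event_data = SystemInfoEventData.model_validate(payload)
crawlee_data = event_data.to_crawlee_format(dedicated_cpus=1)
# cpu_info.used_ratio == (50.0 / 100) / 1 == 0.5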
apify/log.py
CHANGED

@@ -27,13 +27,6 @@ def _configure_logging() -> None:
     else:
         apify_client_logger.setLevel(level)
 
-    # Silence HTTPX logger unless debug logging is requested
-    httpx_logger = logging.getLogger('httpx')
-    if level > logging.DEBUG:
-        httpx_logger.setLevel(logging.WARNING)
-    else:
-        httpx_logger.setLevel(level)
-
     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
     configure_logger(apify_logger, remove_old_handlers=True)
apify/request_loaders/__init__.py
ADDED

@@ -0,0 +1,18 @@
+from crawlee.request_loaders import (
+    RequestList,
+    RequestLoader,
+    RequestManager,
+    RequestManagerTandem,
+    SitemapRequestLoader,
+)
+
+from ._apify_request_list import ApifyRequestList
+
+__all__ = [
+    'ApifyRequestList',
+    'RequestList',
+    'RequestLoader',
+    'RequestManager',
+    'RequestManagerTandem',
+    'SitemapRequestLoader',
+]
apify/storages/_request_list.py → apify/request_loaders/_apify_request_list.py
RENAMED

@@ -3,16 +3,15 @@ from __future__ import annotations
 import asyncio
 import re
 from asyncio import Task
-from functools import partial
 from typing import Annotated, Any
 
 from pydantic import BaseModel, Field, TypeAdapter
 
-from crawlee import Request
 from crawlee._types import HttpMethod
-from crawlee.http_clients import HttpClient,
-from crawlee.request_loaders import RequestList
+from crawlee.http_clients import HttpClient, ImpitHttpClient
+from crawlee.request_loaders import RequestList
 
+from apify import Request
 from apify._utils import docs_group
 
 URL_NO_COMMAS_REGEX = re.compile(

@@ -39,7 +38,7 @@ url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput])
 
 
 @docs_group('Request loaders')
-class RequestList
+class ApifyRequestList(RequestList):
     """Extends crawlee RequestList.
 
     Method open is used to create RequestList from actor's requestListSources input.

@@ -50,7 +49,7 @@ class RequestList(CrawleeRequestList):
         name: str | None = None,
         request_list_sources_input: list[dict[str, Any]] | None = None,
         http_client: HttpClient | None = None,
-    ) ->
+    ) -> ApifyRequestList:
        """Initialize a new instance from request list source input.
 
        Args:

@@ -74,24 +73,26 @@ class RequestList(CrawleeRequestList):
            ```
        """
        request_list_sources_input = request_list_sources_input or []
-       return await
+       return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
 
    @staticmethod
    async def _create_request_list(
        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
-    ) ->
+    ) -> ApifyRequestList:
        if not http_client:
-           http_client =
+           http_client = ImpitHttpClient()
 
        url_inputs = url_input_adapter.validate_python(request_list_sources_input)
 
        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
 
-       simple_url_requests =
-       remote_url_requests = await
+       simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
+       remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
+           remote_url_inputs, http_client=http_client
+       )
 
-       return
+       return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
 
    @staticmethod
    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:

@@ -119,13 +120,15 @@ class RequestList(CrawleeRequestList):
        """
        created_requests: list[Request] = []
 
-       def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+       async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
            """Extract links from response body and use them to create `Request` objects.
 
            Use the regular expression to find all matching links in the response body, then create `Request`
            objects from these links and the provided input attributes.
            """
-
+           response = await (task.result()).read()
+           matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8'))
+
            created_requests.extend(
                [
                    Request.from_url(

@@ -148,7 +151,11 @@ class RequestList(CrawleeRequestList):
                )
            )
 
-           get_response_task.add_done_callback(
+           get_response_task.add_done_callback(
+               lambda task, inp=remote_url_requests_input: asyncio.create_task(  # type: ignore[misc]
+                   create_requests_from_response(inp, task)
+               )
+           )
            remote_url_requests.append(get_response_task)
 
        await asyncio.gather(*remote_url_requests)
apify/request_loaders/py.typed
File without changes
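
A hedged sketch of opening the renamed class from requestListSources-style input; the URLs are placeholders, the input field names follow the Actor input format described in the docstring, and fetch_next_request / mark_request_as_handled come from the Crawlee RequestList base class:

from apify.request_loaders import ApifyRequestList


async def main() -> None:
    request_list = await ApifyRequestList.open(
        name='start-urls',
        request_list_sources_input=[
            {'url': 'https://example.com/start', 'method': 'GET'},
            {'requestsFromUrl': 'https://example.com/urls.txt'},
        ],
    )
    # Drain the list the same way a crawler would.
    while request := await request_list.fetch_next_request():
        print(request.url)
        await request_list.mark_request_as_handled(request)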
apify/scrapy/_logging_config.py
CHANGED

@@ -10,7 +10,7 @@ from apify.log import ActorLogFormatter
 
 # Define logger names.
 _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
-_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', '
+_SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
 _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
 
 

@@ -37,9 +37,6 @@ def initialize_logging() -> None:
     for logger_name in [None, *_ALL_LOGGERS]:
         _configure_logger(logger_name, logging_level, handler)
 
-    # Set the 'httpx' logger to a less verbose level.
-    logging.getLogger('httpx').setLevel('WARNING')
-
     # Monkey-patch Scrapy's logging configuration to re-apply our settings.
     original_configure_logging = scrapy_logging.configure_logging
 
apify/scrapy/extensions/_httpcache.py
CHANGED

@@ -13,8 +13,8 @@ from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 
 from apify import Configuration
-from apify.apify_storage_client import ApifyStorageClient
 from apify.scrapy._async_thread import AsyncThread
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import KeyValueStore
 
 if TYPE_CHECKING:

@@ -51,10 +51,14 @@ class ApifyCacheStorage:
         kvs_name = get_kvs_name(spider.name)
 
         async def open_kvs() -> KeyValueStore:
-
-            if
-                storage_client = ApifyStorageClient
-                return await KeyValueStore.open(
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await KeyValueStore.open(
+                    name=kvs_name,
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await KeyValueStore.open(name=kvs_name)
 
         logger.debug("Starting background thread for cache storage's event loop")
apify/scrapy/requests.py
CHANGED

@@ -10,9 +10,10 @@ from scrapy import Spider
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
-from crawlee import Request as ApifyRequest
 from crawlee._types import HttpHeaders
 
+from apify import Request as ApifyRequest
+
 logger = getLogger(__name__)
 
 

@@ -121,7 +122,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
 
     # Update the meta field with the meta field from the apify_request
     meta = scrapy_request.meta or {}
-    meta.update({'
+    meta.update({'apify_request_unique_key': apify_request.unique_key})
     # scrapy_request.meta is a property, so we have to set it like this
     scrapy_request._meta = meta  # noqa: SLF001
 

@@ -133,7 +134,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
             url=apify_request.url,
             method=apify_request.method,
             meta={
-                'apify_request_id': apify_request.id,
                 'apify_request_unique_key': apify_request.unique_key,
             },
         )
apify/scrapy/scheduler.py
CHANGED

@@ -11,7 +11,7 @@ from scrapy.utils.reactor import is_asyncio_reactor_installed
 from ._async_thread import AsyncThread
 from .requests import to_apify_request, to_scrapy_request
 from apify import Configuration
-from apify.
+from apify.storage_clients import ApifyStorageClient
 from apify.storages import RequestQueue
 
 if TYPE_CHECKING:

@@ -49,10 +49,13 @@ class ApifyScheduler(BaseScheduler):
         self.spider = spider
 
         async def open_rq() -> RequestQueue:
-
-            if
-                storage_client = ApifyStorageClient
-                return await RequestQueue.open(
+            configuration = Configuration.get_global_configuration()
+            if configuration.is_at_home:
+                storage_client = ApifyStorageClient()
+                return await RequestQueue.open(
+                    configuration=configuration,
+                    storage_client=storage_client,
+                )
             return await RequestQueue.open()
 
         try:
apify/storage_clients/__init__.py
ADDED

@@ -0,0 +1,10 @@
+from crawlee.storage_clients import MemoryStorageClient
+
+from ._apify import ApifyStorageClient
+from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
+
+__all__ = [
+    'ApifyStorageClient',
+    'FileSystemStorageClient',
+    'MemoryStorageClient',
+]