apify 3.4.2b4__tar.gz → 3.4.2b6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {apify-3.4.2b4 → apify-3.4.2b6}/CHANGELOG.md +6 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/PKG-INFO +1 -1
- {apify-3.4.2b4 → apify-3.4.2b6}/pyproject.toml +1 -1
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_charging.py +70 -24
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/events/_types.py +57 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/_async_thread.py +14 -7
- apify-3.4.2b6/src/apify/scrapy/_serialization.py +138 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/extensions/_httpcache.py +49 -33
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/middlewares/apify_proxy.py +2 -3
- apify-3.4.2b6/src/apify/scrapy/requests.py +216 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/scheduler.py +14 -3
- apify-3.4.2b4/src/apify/scrapy/requests.py +0 -164
- {apify-3.4.2b4 → apify-3.4.2b6}/.gitignore +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/CONTRIBUTING.md +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/LICENSE +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/README.md +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_actor.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_configuration.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_consts.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_crypto.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_proxy_configuration.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_utils.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/_webhook.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/events/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/events/_apify_event_manager.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/events/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/log.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/request_loaders/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/request_loaders/_apify_request_list.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/request_loaders/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/_actor_runner.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/_logging_config.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/extensions/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/middlewares/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/pipelines/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/scrapy/utils.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_alias_resolving.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_api_client_creation.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_models.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_request_queue_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_request_queue_single_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_utils.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_apify/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storage_clients/py.typed +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storages/__init__.py +0 -0
- {apify-3.4.2b4 → apify-3.4.2b6}/src/apify/storages/py.typed +0 -0
|
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
<!-- git-cliff-unreleased-start -->
|
|
6
6
|
## 3.4.2 - **not yet released**
|
|
7
7
|
|
|
8
|
+
### 🐛 Bug Fixes
|
|
9
|
+
|
|
10
|
+
- **scrapy:** Correct proxy middleware exception log and import ([#953](https://github.com/apify/apify-sdk-python/pull/953)) ([5bd6eb9](https://github.com/apify/apify-sdk-python/commit/5bd6eb9843d90844cec083372e932413bceedec9)) by [@vdusek](https://github.com/vdusek)
|
|
11
|
+
- **scrapy:** Skip a request that fails to convert instead of crashing the run ([#952](https://github.com/apify/apify-sdk-python/pull/952)) ([db9444f](https://github.com/apify/apify-sdk-python/commit/db9444faeb0158c29aa394121cf733ff2e843f28)) by [@vdusek](https://github.com/vdusek)
|
|
12
|
+
- **scrapy:** [**breaking**] Serialize requests and HTTP cache as JSON instead of pickle ([#951](https://github.com/apify/apify-sdk-python/pull/951)) ([a87e8d1](https://github.com/apify/apify-sdk-python/commit/a87e8d1597478b4f12fd5bb9b379f65f637d8e96)) by [@vdusek](https://github.com/vdusek)
|
|
13
|
+
|
|
8
14
|
### 🚜 Refactor
|
|
9
15
|
|
|
10
16
|
- [**breaking**] Remove deprecated APIs ([#918](https://github.com/apify/apify-sdk-python/pull/918)) ([3e5728d](https://github.com/apify/apify-sdk-python/commit/3e5728d94cb8fd879d5a76e33a03d55792d835d5)) by [@vdusek](https://github.com/vdusek), closes [#635](https://github.com/apify/apify-sdk-python/issues/635)
|
|
@@ -7,7 +7,7 @@ from datetime import UTC, datetime
|
|
|
7
7
|
from decimal import Decimal
|
|
8
8
|
from typing import TYPE_CHECKING, Annotated, Literal, Protocol, TypedDict
|
|
9
9
|
|
|
10
|
-
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
from pydantic import Field
|
|
11
11
|
|
|
12
12
|
import apify_client._models as _client_models
|
|
13
13
|
from apify_client._models import ActorChargeEvent as ClientActorChargeEvent
|
|
@@ -28,14 +28,17 @@ if TYPE_CHECKING:
|
|
|
28
28
|
|
|
29
29
|
from apify._configuration import Configuration
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
"""
|
|
31
|
+
charging_manager_ctx: ContextVar[ChargingManager | None] = ContextVar('charging_manager_ctx', default=None)
|
|
32
|
+
"""Holds the current `ChargingManager` instance, if any.
|
|
33
|
+
|
|
34
|
+
Allows PPE-aware dataset clients to access the charging manager without needing to pass it explicitly.
|
|
35
|
+
"""
|
|
33
36
|
|
|
34
37
|
DEFAULT_DATASET_ITEM_EVENT = 'apify-default-dataset-item'
|
|
38
|
+
"""Name of the synthetic event charged for each item pushed to the default dataset."""
|
|
35
39
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
charging_manager_ctx: ContextVar[ChargingManager | None] = ContextVar('charging_manager_ctx', default=None)
|
|
40
|
+
PricingModel = Literal['PAY_PER_EVENT', 'PRICE_PER_DATASET_ITEM', 'FLAT_PRICE_PER_MONTH', 'FREE']
|
|
41
|
+
"""Pricing model for an Actor."""
|
|
39
42
|
|
|
40
43
|
_ensure_context = ensure_context('active')
|
|
41
44
|
|
|
@@ -49,48 +52,91 @@ _ensure_context = ensure_context('active')
|
|
|
49
52
|
# `apify-client` instance) flows through the same code paths without conversion.
|
|
50
53
|
|
|
51
54
|
|
|
52
|
-
class _RelaxedPricingMetadata(BaseModel):
|
|
53
|
-
"""Mixin relaxing the `CommonActorPricingInfo` metadata fields the platform env var omits."""
|
|
54
|
-
|
|
55
|
-
model_config = ConfigDict(populate_by_name=True, extra='allow')
|
|
56
|
-
|
|
57
|
-
apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
|
|
58
|
-
created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
|
|
59
|
-
started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
|
|
60
|
-
|
|
61
|
-
|
|
62
55
|
@docs_group('Charging')
|
|
63
56
|
class ActorChargeEvent(ClientActorChargeEvent):
|
|
64
|
-
|
|
57
|
+
"""Definition of a single chargeable event in the pay-per-event pricing model."""
|
|
58
|
+
|
|
65
59
|
event_description: Annotated[str | None, Field(alias='eventDescription')] = None
|
|
60
|
+
"""Human-readable description of the event.
|
|
61
|
+
|
|
62
|
+
Required in apify-client but omitted from the env var, so it is relaxed to optional.
|
|
63
|
+
"""
|
|
66
64
|
|
|
67
65
|
|
|
68
66
|
@docs_group('Charging')
|
|
69
67
|
class PricingPerEvent(ClientPricingPerEvent):
|
|
68
|
+
"""Pay-per-event pricing details - the chargeable events and their prices."""
|
|
69
|
+
|
|
70
70
|
actor_charge_events: Annotated[dict[str, ActorChargeEvent] | None, Field(alias='actorChargeEvents')] = None
|
|
71
|
+
"""Mapping of event name to its charge definition."""
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
@docs_group('Charging')
|
|
74
|
-
class FreeActorPricingInfo(
|
|
75
|
-
|
|
75
|
+
class FreeActorPricingInfo(ClientFree):
|
|
76
|
+
"""Pricing info for an Actor offered free of charge."""
|
|
77
|
+
|
|
78
|
+
apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
|
|
79
|
+
"""Apify's margin on the price, as a percentage."""
|
|
80
|
+
|
|
81
|
+
created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
|
|
82
|
+
"""Timestamp when this pricing info was created."""
|
|
83
|
+
|
|
84
|
+
started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
|
|
85
|
+
"""Timestamp when this pricing became effective."""
|
|
76
86
|
|
|
77
87
|
|
|
78
88
|
@docs_group('Charging')
|
|
79
|
-
class FlatPricePerMonthActorPricingInfo(
|
|
89
|
+
class FlatPricePerMonthActorPricingInfo(ClientFlatPricePerMonth):
|
|
90
|
+
"""Pricing info for an Actor billed at a flat monthly price."""
|
|
91
|
+
|
|
92
|
+
apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
|
|
93
|
+
"""Apify's margin on the price, as a percentage."""
|
|
94
|
+
|
|
95
|
+
created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
|
|
96
|
+
"""Timestamp when this pricing info was created."""
|
|
97
|
+
|
|
98
|
+
started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
|
|
99
|
+
"""Timestamp when this pricing became effective."""
|
|
100
|
+
|
|
80
101
|
trial_minutes: Annotated[int | None, Field(alias='trialMinutes')] = None
|
|
102
|
+
"""Length of the free trial period, in minutes."""
|
|
103
|
+
|
|
81
104
|
price_per_unit_usd: Annotated[float | None, Field(alias='pricePerUnitUsd')] = None
|
|
105
|
+
"""Price per unit, in USD."""
|
|
82
106
|
|
|
83
107
|
|
|
84
108
|
@docs_group('Charging')
|
|
85
|
-
class PricePerDatasetItemActorPricingInfo(
|
|
109
|
+
class PricePerDatasetItemActorPricingInfo(ClientPricePerDatasetItem):
|
|
110
|
+
"""Pricing info for an Actor billed per dataset item produced."""
|
|
111
|
+
|
|
112
|
+
apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
|
|
113
|
+
"""Apify's margin on the price, as a percentage."""
|
|
114
|
+
|
|
115
|
+
created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
|
|
116
|
+
"""Timestamp when this pricing info was created."""
|
|
117
|
+
|
|
118
|
+
started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
|
|
119
|
+
"""Timestamp when this pricing became effective."""
|
|
120
|
+
|
|
86
121
|
unit_name: Annotated[str | None, Field(alias='unitName')] = None
|
|
87
|
-
|
|
122
|
+
"""Name of the billed unit."""
|
|
88
123
|
|
|
89
124
|
|
|
90
125
|
@docs_group('Charging')
|
|
91
|
-
class PayPerEventActorPricingInfo(
|
|
92
|
-
|
|
126
|
+
class PayPerEventActorPricingInfo(ClientPayPerEvent):
|
|
127
|
+
"""Pricing info for an Actor billed per charged event."""
|
|
128
|
+
|
|
129
|
+
apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
|
|
130
|
+
"""Apify's margin on the price, as a percentage."""
|
|
131
|
+
|
|
132
|
+
created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
|
|
133
|
+
"""Timestamp when this pricing info was created."""
|
|
134
|
+
|
|
135
|
+
started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
|
|
136
|
+
"""Timestamp when this pricing became effective."""
|
|
137
|
+
|
|
93
138
|
pricing_per_event: Annotated[PricingPerEvent, Field(alias='pricingPerEvent')]
|
|
139
|
+
"""The pay-per-event pricing details."""
|
|
94
140
|
|
|
95
141
|
|
|
96
142
|
ActorPricingInfoModel = ClientFree | ClientFlatPricePerMonth | ClientPricePerDatasetItem | ClientPayPerEvent
|
|
@@ -27,14 +27,31 @@ This is the Apify-specific subset of [`Event`][crawlee.events.Event] — for the
|
|
|
27
27
|
|
|
28
28
|
@docs_group('Event data')
|
|
29
29
|
class SystemInfoEventData(BaseModel):
|
|
30
|
+
"""Resource usage metrics carried by a `systemInfo` event."""
|
|
31
|
+
|
|
30
32
|
mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
|
|
33
|
+
"""Average memory usage over the measured interval, in bytes."""
|
|
34
|
+
|
|
31
35
|
mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
|
|
36
|
+
"""Current memory usage, in bytes."""
|
|
37
|
+
|
|
32
38
|
mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
|
|
39
|
+
"""Peak memory usage observed so far, in bytes."""
|
|
40
|
+
|
|
33
41
|
cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
|
|
42
|
+
"""Average CPU usage over the measured interval, in percent."""
|
|
43
|
+
|
|
34
44
|
cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
|
|
45
|
+
"""Peak CPU usage observed so far, in percent."""
|
|
46
|
+
|
|
35
47
|
cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
|
|
48
|
+
"""Current CPU usage, in percent."""
|
|
49
|
+
|
|
36
50
|
is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
|
|
51
|
+
"""Whether the CPU is currently overloaded."""
|
|
52
|
+
|
|
37
53
|
created_at: Annotated[datetime, Field(alias='createdAt')]
|
|
54
|
+
"""Timestamp when the metrics were collected."""
|
|
38
55
|
|
|
39
56
|
def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
|
|
40
57
|
return EventSystemInfoData.model_validate(
|
|
@@ -54,36 +71,63 @@ class SystemInfoEventData(BaseModel):
|
|
|
54
71
|
|
|
55
72
|
@docs_group('Events')
|
|
56
73
|
class PersistStateEvent(BaseModel):
|
|
74
|
+
"""A `persistState` event instructing the Actor to persist its state."""
|
|
75
|
+
|
|
57
76
|
name: Literal[Event.PERSIST_STATE]
|
|
77
|
+
"""The event name."""
|
|
78
|
+
|
|
58
79
|
data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
|
|
80
|
+
"""The event payload."""
|
|
59
81
|
|
|
60
82
|
|
|
61
83
|
@docs_group('Events')
|
|
62
84
|
class SystemInfoEvent(BaseModel):
|
|
85
|
+
"""A `systemInfo` event carrying the Actor's resource usage metrics."""
|
|
86
|
+
|
|
63
87
|
name: Literal[Event.SYSTEM_INFO]
|
|
88
|
+
"""The event name."""
|
|
89
|
+
|
|
64
90
|
data: SystemInfoEventData
|
|
91
|
+
"""The event payload."""
|
|
65
92
|
|
|
66
93
|
|
|
67
94
|
@docs_group('Events')
|
|
68
95
|
class MigratingEvent(BaseModel):
|
|
96
|
+
"""A `migrating` event signalling the Actor is about to be migrated to another host."""
|
|
97
|
+
|
|
69
98
|
name: Literal[Event.MIGRATING]
|
|
99
|
+
"""The event name."""
|
|
100
|
+
|
|
70
101
|
data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
|
|
102
|
+
"""The event payload."""
|
|
71
103
|
|
|
72
104
|
|
|
73
105
|
@docs_group('Events')
|
|
74
106
|
class AbortingEvent(BaseModel):
|
|
107
|
+
"""An `aborting` event signalling the Actor run is being aborted."""
|
|
108
|
+
|
|
75
109
|
name: Literal[Event.ABORTING]
|
|
110
|
+
"""The event name."""
|
|
111
|
+
|
|
76
112
|
data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
|
|
113
|
+
"""The event payload."""
|
|
77
114
|
|
|
78
115
|
|
|
79
116
|
@docs_group('Events')
|
|
80
117
|
class ExitEvent(BaseModel):
|
|
118
|
+
"""An `exit` event signalling the Actor process is about to exit."""
|
|
119
|
+
|
|
81
120
|
name: Literal[Event.EXIT]
|
|
121
|
+
"""The event name."""
|
|
122
|
+
|
|
82
123
|
data: Annotated[EventExitData, Field(default_factory=EventExitData)]
|
|
124
|
+
"""The event payload."""
|
|
83
125
|
|
|
84
126
|
|
|
85
127
|
@docs_group('Events')
|
|
86
128
|
class EventWithoutData(BaseModel):
|
|
129
|
+
"""A framework-level event that carries no payload (e.g. browser and page lifecycle events)."""
|
|
130
|
+
|
|
87
131
|
name: Literal[
|
|
88
132
|
Event.SESSION_RETIRED,
|
|
89
133
|
Event.BROWSER_LAUNCHED,
|
|
@@ -92,19 +136,32 @@ class EventWithoutData(BaseModel):
|
|
|
92
136
|
Event.PAGE_CREATED,
|
|
93
137
|
Event.PAGE_CLOSED,
|
|
94
138
|
]
|
|
139
|
+
"""The event name."""
|
|
140
|
+
|
|
95
141
|
data: Any = None
|
|
142
|
+
"""The event payload, always empty for this event."""
|
|
96
143
|
|
|
97
144
|
|
|
98
145
|
@docs_group('Events')
|
|
99
146
|
class DeprecatedEvent(BaseModel):
|
|
147
|
+
"""A deprecated event kept for backward compatibility (e.g. `cpuInfo`)."""
|
|
148
|
+
|
|
100
149
|
name: Literal['cpuInfo']
|
|
150
|
+
"""The event name."""
|
|
151
|
+
|
|
101
152
|
data: Annotated[dict[str, Any], Field(default_factory=dict)]
|
|
153
|
+
"""The event payload."""
|
|
102
154
|
|
|
103
155
|
|
|
104
156
|
@docs_group('Events')
|
|
105
157
|
class UnknownEvent(BaseModel):
|
|
158
|
+
"""A fallback for any event whose name is not recognized by the SDK."""
|
|
159
|
+
|
|
106
160
|
name: str
|
|
161
|
+
"""The event name."""
|
|
162
|
+
|
|
107
163
|
data: Annotated[dict[str, Any], Field(default_factory=dict)]
|
|
164
|
+
"""The event payload."""
|
|
108
165
|
|
|
109
166
|
|
|
110
167
|
EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData
|
|
@@ -5,7 +5,7 @@ import threading
|
|
|
5
5
|
from concurrent import futures
|
|
6
6
|
from datetime import timedelta
|
|
7
7
|
from logging import getLogger
|
|
8
|
-
from typing import TYPE_CHECKING, Any
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
9
9
|
|
|
10
10
|
if TYPE_CHECKING:
|
|
11
11
|
from collections.abc import Coroutine
|
|
@@ -14,13 +14,16 @@ logger = getLogger(__name__)
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class AsyncThread:
|
|
17
|
-
"""
|
|
17
|
+
"""Run an asyncio event loop in a dedicated background thread.
|
|
18
18
|
|
|
19
|
-
This
|
|
20
|
-
|
|
19
|
+
This lets synchronous Scrapy callbacks drive asynchronous Apify and Crawlee coroutines. The
|
|
20
|
+
scheduler and the HTTP cache storage each own their own `AsyncThread`, so the request queue and
|
|
21
|
+
the key-value store never share an event loop; they only share the read-only global
|
|
22
|
+
`Configuration`. A single shared loop would also work but would couple their lifecycles.
|
|
21
23
|
"""
|
|
22
24
|
|
|
23
|
-
def __init__(self) -> None:
|
|
25
|
+
def __init__(self, default_timeout: timedelta = timedelta(seconds=60)) -> None:
|
|
26
|
+
self._default_timeout = default_timeout
|
|
24
27
|
self._eventloop = asyncio.new_event_loop()
|
|
25
28
|
|
|
26
29
|
# Start the event loop in a dedicated daemon thread.
|
|
@@ -33,7 +36,7 @@ class AsyncThread:
|
|
|
33
36
|
def run_coro(
|
|
34
37
|
self,
|
|
35
38
|
coro: Coroutine,
|
|
36
|
-
timeout: timedelta =
|
|
39
|
+
timeout: timedelta | Literal['default'] = 'default',
|
|
37
40
|
) -> Any:
|
|
38
41
|
"""Run a coroutine on an event loop running in a separate thread.
|
|
39
42
|
|
|
@@ -42,7 +45,8 @@ class AsyncThread:
|
|
|
42
45
|
|
|
43
46
|
Args:
|
|
44
47
|
coro: The coroutine to run.
|
|
45
|
-
timeout: The maximum
|
|
48
|
+
timeout: The maximum time to wait for the coroutine to finish. Pass `'default'` to use the
|
|
49
|
+
`default_timeout` passed to the constructor.
|
|
46
50
|
|
|
47
51
|
Returns:
|
|
48
52
|
The result returned by the coroutine.
|
|
@@ -52,6 +56,9 @@ class AsyncThread:
|
|
|
52
56
|
TimeoutError: If the coroutine does not complete within the timeout.
|
|
53
57
|
Exception: Any exception raised during coroutine execution.
|
|
54
58
|
"""
|
|
59
|
+
if timeout == 'default':
|
|
60
|
+
timeout = self._default_timeout
|
|
61
|
+
|
|
55
62
|
if not self._eventloop.is_running():
|
|
56
63
|
raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is not running.')
|
|
57
64
|
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""JSON serialization of Scrapy requests and cached responses for storage on the Apify platform.
|
|
2
|
+
|
|
3
|
+
Scrapy requests and cached responses are stored in the Apify request queue and key-value store which hold JSON,
|
|
4
|
+
so they are serialized as JSON here rather than pickled.
|
|
5
|
+
|
|
6
|
+
Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable; both sit at fixed keys
|
|
7
|
+
and are base64-encoded in place. A `str` `body` is encoded as its UTF-8 bytes and comes back as `bytes`, matching
|
|
8
|
+
Scrapy, which always stores `body` as `bytes`. Pydantic models such as Crawlee's `UserData` are dumped via
|
|
9
|
+
`model_dump()`. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise
|
|
10
|
+
serialization fails with a clear error naming the offending value. No in-band sentinel is used, so no user value
|
|
11
|
+
can collide with the encoding.
|
|
12
|
+
|
|
13
|
+
Known limitations of the pickle -> JSON switch (a documented breaking change): JSON has fewer types than pickle,
|
|
14
|
+
so values in `meta`/`cb_kwargs` are subject to JSON's coercions. A `tuple` round-trips as a `list` and non-string
|
|
15
|
+
`dict` keys round-trip as strings (e.g. `{1: 'a'}` becomes `{'1': 'a'}`). Values JSON cannot represent at all
|
|
16
|
+
(`datetime`, `set`, `Decimal`, arbitrary objects, ...) are not coerced silently: serialization raises and the request
|
|
17
|
+
is skipped loudly rather than stored in a corrupted form.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import base64
|
|
23
|
+
import json
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from pydantic import BaseModel
|
|
27
|
+
|
|
28
|
+
# Cap the offending value's repr in a serialization error message so a huge value cannot bloat the log.
|
|
29
|
+
_MAX_ERROR_VALUE_REPR_LEN = 200
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def encode_to_json(data: dict[str, Any]) -> str:
|
|
33
|
+
"""Serialize a Scrapy request/response dict to a JSON string.
|
|
34
|
+
|
|
35
|
+
The `body` and `headers` fields are base64-encoded in place (a `str` `body` via its UTF-8 bytes); pydantic
|
|
36
|
+
models are dumped to plain dicts. A `TypeError` is raised if any other value cannot be JSON-encoded.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
data: The dict to serialize, e.g. the output of `scrapy.Request.to_dict()`.
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
The JSON-encoded string.
|
|
43
|
+
"""
|
|
44
|
+
if not isinstance(data, dict):
|
|
45
|
+
raise TypeError(f'Expected a dict to serialize, got {type(data)}')
|
|
46
|
+
|
|
47
|
+
safe = dict(data)
|
|
48
|
+
|
|
49
|
+
# `body` is base64-encoded so binary payloads survive; a `str` body is taken as its UTF-8 bytes, which keeps
|
|
50
|
+
# encode/decode symmetric (decode always base64-decodes `body` back to `bytes`).
|
|
51
|
+
body = safe.get('body')
|
|
52
|
+
if isinstance(body, (bytes, str)):
|
|
53
|
+
raw_body = body.encode('utf-8') if isinstance(body, str) else body
|
|
54
|
+
safe['body'] = base64.b64encode(raw_body).decode('ascii')
|
|
55
|
+
|
|
56
|
+
if isinstance(safe.get('headers'), dict):
|
|
57
|
+
safe['headers'] = _encode_headers(safe['headers'])
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
# `ensure_ascii=False` keeps non-ASCII URLs/meta as their UTF-8 form instead of `\uXXXX` escapes, which
|
|
61
|
+
# would otherwise roughly double the size of non-Latin text in storage.
|
|
62
|
+
return json.dumps(safe, default=_json_default, ensure_ascii=False)
|
|
63
|
+
except TypeError as exc:
|
|
64
|
+
raise TypeError(
|
|
65
|
+
'Failed to JSON-serialize a Scrapy request/response for storage on the Apify platform. '
|
|
66
|
+
'All values in `meta` and `cb_kwargs` must be JSON-serializable (str, int, float, bool, None, '
|
|
67
|
+
'list, dict, or a pydantic model).'
|
|
68
|
+
) from exc
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def decode_from_json(text: str) -> Any:
|
|
72
|
+
"""Reconstruct a Scrapy request/response dict from a string produced by `encode_to_json`.
|
|
73
|
+
|
|
74
|
+
The base64-encoded `body` and `headers` fields are decoded back to their `bytes` representation.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
text: The JSON-encoded string.
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
The decoded object (a dict for valid request/response payloads).
|
|
81
|
+
"""
|
|
82
|
+
data = json.loads(text)
|
|
83
|
+
if not isinstance(data, dict):
|
|
84
|
+
return data
|
|
85
|
+
|
|
86
|
+
# `validate=True` makes a non-base64 body raise loudly instead of silently decoding to garbage.
|
|
87
|
+
if isinstance(data.get('body'), str):
|
|
88
|
+
data['body'] = base64.b64decode(data['body'], validate=True)
|
|
89
|
+
|
|
90
|
+
if isinstance(data.get('headers'), dict):
|
|
91
|
+
data['headers'] = _decode_headers(data['headers'])
|
|
92
|
+
|
|
93
|
+
return data
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _json_default(obj: Any) -> Any:
|
|
97
|
+
"""Fallback for values `json.dumps` cannot serialize: pydantic models are dumped, anything else raises.
|
|
98
|
+
|
|
99
|
+
The error names the offending value (type and a truncated repr) so a failed serialization points straight
|
|
100
|
+
at the bad `meta`/`cb_kwargs` entry instead of just reporting that something failed.
|
|
101
|
+
"""
|
|
102
|
+
if isinstance(obj, BaseModel):
|
|
103
|
+
return obj.model_dump(by_alias=True)
|
|
104
|
+
value_repr = repr(obj)
|
|
105
|
+
if len(value_repr) > _MAX_ERROR_VALUE_REPR_LEN:
|
|
106
|
+
value_repr = value_repr[:_MAX_ERROR_VALUE_REPR_LEN] + '...'
|
|
107
|
+
raise TypeError(f'Object of type {type(obj).__name__} is not JSON-serializable: {value_repr}')
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _encode_headers(headers: dict[Any, Any]) -> dict[str, list[str]]:
|
|
111
|
+
"""Encode a Scrapy `{bytes: [bytes]}` headers mapping to a JSON-safe `{str: [base64-str]}`."""
|
|
112
|
+
encoded: dict[str, list[str]] = {}
|
|
113
|
+
for key, value in headers.items():
|
|
114
|
+
str_key = key.decode('latin-1') if isinstance(key, bytes) else key
|
|
115
|
+
values = value if isinstance(value, (list, tuple)) else [value]
|
|
116
|
+
encoded[str_key] = [_b64encode_value(item) for item in values]
|
|
117
|
+
return encoded
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _decode_headers(headers: dict[str, Any]) -> dict[bytes, list[bytes]]:
|
|
121
|
+
"""Reverse `_encode_headers`, restoring the `{bytes: [bytes]}` mapping Scrapy expects."""
|
|
122
|
+
decoded: dict[bytes, list[bytes]] = {}
|
|
123
|
+
for key, value in headers.items():
|
|
124
|
+
bytes_key = key.encode('latin-1') if isinstance(key, str) else key
|
|
125
|
+
values = value if isinstance(value, list) else [value]
|
|
126
|
+
decoded[bytes_key] = [base64.b64decode(item, validate=True) for item in values]
|
|
127
|
+
return decoded
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _b64encode_value(value: Any) -> str:
|
|
131
|
+
"""Base64-encode a single header value.
|
|
132
|
+
|
|
133
|
+
Scrapy stores header values as `bytes`; a `str` is encoded as its UTF-8 bytes. Any other type is coerced with
|
|
134
|
+
`str()` as a lenient last resort. That coercion is lossy (e.g. `5` becomes `b'5'`), but Scrapy does not produce
|
|
135
|
+
non-`bytes`/`str` header values, so it is not hit on the real path.
|
|
136
|
+
"""
|
|
137
|
+
raw = value if isinstance(value, bytes) else str(value).encode('utf-8')
|
|
138
|
+
return base64.b64encode(raw).decode('ascii')
|
|
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import gzip
|
|
4
4
|
import io
|
|
5
|
-
import pickle
|
|
6
5
|
import re
|
|
7
6
|
import struct
|
|
8
7
|
from logging import getLogger
|
|
@@ -14,6 +13,7 @@ from scrapy.responsetypes import responsetypes
|
|
|
14
13
|
|
|
15
14
|
from apify import Configuration
|
|
16
15
|
from apify.scrapy._async_thread import AsyncThread
|
|
16
|
+
from apify.scrapy._serialization import decode_from_json, encode_to_json
|
|
17
17
|
from apify.storage_clients import ApifyStorageClient
|
|
18
18
|
from apify.storages import KeyValueStore
|
|
19
19
|
|
|
@@ -29,14 +29,14 @@ logger = getLogger(__name__)
|
|
|
29
29
|
class ApifyCacheStorage:
|
|
30
30
|
"""A Scrapy cache storage that uses the Apify `KeyValueStore` to store responses.
|
|
31
31
|
|
|
32
|
-
It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
to be installed.
|
|
32
|
+
It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches responses to requests.
|
|
33
|
+
See HTTPCache middleware settings (prefixed with `HTTPCACHE_`) in the Scrapy documentation for more information.
|
|
34
|
+
Requires the asyncio Twisted reactor to be installed.
|
|
36
35
|
"""
|
|
37
36
|
|
|
38
37
|
def __init__(self, settings: BaseSettings) -> None:
|
|
39
|
-
|
|
38
|
+
# Upper bound on how many keys the per-spider-close cleanup sweeps (best-effort; `close_spider`).
|
|
39
|
+
self._expiration_max_items: int = settings.getint('APIFY_HTTPCACHE_EXPIRATION_MAX_ITEMS', 100)
|
|
40
40
|
self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
|
41
41
|
self._spider: Spider | None = None
|
|
42
42
|
self._kvs: KeyValueStore | None = None
|
|
@@ -79,23 +79,26 @@ class ApifyCacheStorage:
|
|
|
79
79
|
async def expire_kvs() -> None:
|
|
80
80
|
if self._kvs is None:
|
|
81
81
|
raise ValueError('Key value store not initialized')
|
|
82
|
-
|
|
82
|
+
# Best-effort cleanup: at most `_expiration_max_items` keys per close, in no guaranteed order,
|
|
83
|
+
# so stale entries may linger. This only reclaims storage; `retrieve_response` already treats
|
|
84
|
+
# an expired entry as a cache miss.
|
|
85
|
+
processed = 0
|
|
83
86
|
async for item in self._kvs.iterate_keys():
|
|
87
|
+
if processed >= self._expiration_max_items:
|
|
88
|
+
break
|
|
89
|
+
processed += 1
|
|
84
90
|
value = await self._kvs.get_value(item.key)
|
|
85
91
|
try:
|
|
86
92
|
gzip_time = read_gzip_time(value)
|
|
87
93
|
except Exception as e:
|
|
88
94
|
logger.warning(f'Malformed cache item {item.key}: {e}')
|
|
89
|
-
await self._kvs.
|
|
95
|
+
await self._kvs.delete_value(item.key)
|
|
90
96
|
else:
|
|
91
97
|
if self._expiration_secs < current_time - gzip_time:
|
|
92
98
|
logger.debug(f'Expired cache item {item.key}')
|
|
93
|
-
await self._kvs.
|
|
99
|
+
await self._kvs.delete_value(item.key)
|
|
94
100
|
else:
|
|
95
101
|
logger.debug(f'Valid cache item {item.key}')
|
|
96
|
-
if i == self._expiration_max_items:
|
|
97
|
-
break
|
|
98
|
-
i += 1
|
|
99
102
|
|
|
100
103
|
self._async_thread.run_coro(expire_kvs())
|
|
101
104
|
|
|
@@ -127,17 +130,25 @@ class ApifyCacheStorage:
|
|
|
127
130
|
|
|
128
131
|
if current_time is None:
|
|
129
132
|
current_time = int(time())
|
|
130
|
-
|
|
131
|
-
|
|
133
|
+
|
|
134
|
+
# A malformed or legacy cache entry must not crash retrieval; treat it as a cache miss so Scrapy re-fetches
|
|
135
|
+
# and re-stores it in the current format. The field reads stay inside the `try` as well: a value that decodes
|
|
136
|
+
# to a dict missing any expected key (a forward/older format, or a truncated-but-valid JSON payload) must
|
|
137
|
+
# also degrade to a miss rather than raising an uncaught `KeyError`.
|
|
138
|
+
try:
|
|
139
|
+
if 0 < self._expiration_secs < current_time - read_gzip_time(value):
|
|
140
|
+
logger.debug('Cache expired', extra={'request': request})
|
|
141
|
+
return None
|
|
142
|
+
data = from_gzip(value)
|
|
143
|
+
url = data['url']
|
|
144
|
+
status = data['status']
|
|
145
|
+
headers = Headers(data['headers'])
|
|
146
|
+
body = data['body']
|
|
147
|
+
except Exception as exc:
|
|
148
|
+
logger.warning(f'Ignoring malformed cache entry {key!r}: {exc}', extra={'request': request})
|
|
132
149
|
return None
|
|
133
150
|
|
|
134
|
-
data = from_gzip(value)
|
|
135
|
-
url = data['url']
|
|
136
|
-
status = data['status']
|
|
137
|
-
headers = Headers(data['headers'])
|
|
138
|
-
body = data['body']
|
|
139
151
|
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
|
|
140
|
-
|
|
141
152
|
logger.debug('Cache hit', extra={'request': request})
|
|
142
153
|
return respcls(url=url, headers=headers, status=status, body=body)
|
|
143
154
|
|
|
@@ -162,18 +173,25 @@ class ApifyCacheStorage:
|
|
|
162
173
|
|
|
163
174
|
|
|
164
175
|
def to_gzip(data: dict, mtime: int | None = None) -> bytes:
|
|
165
|
-
"""Dump a dictionary to a gzip-compressed byte stream.
|
|
176
|
+
"""Dump a dictionary to a gzip-compressed JSON byte stream.
|
|
177
|
+
|
|
178
|
+
Cache entries live in the Apify key-value store, which holds JSON, so they are serialized as JSON rather
|
|
179
|
+
than pickled. See `apify.scrapy._serialization` for the encoding.
|
|
180
|
+
"""
|
|
181
|
+
payload = encode_to_json(data).encode('utf-8')
|
|
166
182
|
with io.BytesIO() as byte_stream:
|
|
167
183
|
with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file:
|
|
168
|
-
|
|
184
|
+
gzip_file.write(payload)
|
|
169
185
|
return byte_stream.getvalue()
|
|
170
186
|
|
|
171
187
|
|
|
172
188
|
def from_gzip(gzip_bytes: bytes) -> dict:
|
|
173
|
-
"""Load a dictionary from a gzip-compressed byte stream."""
|
|
189
|
+
"""Load a dictionary from a gzip-compressed JSON byte stream."""
|
|
174
190
|
with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='rb') as gzip_file:
|
|
175
|
-
data
|
|
176
|
-
|
|
191
|
+
data = decode_from_json(gzip_file.read().decode('utf-8'))
|
|
192
|
+
if not isinstance(data, dict):
|
|
193
|
+
raise TypeError(f'Expected a dict from the cached payload, got {type(data)}')
|
|
194
|
+
return data
|
|
177
195
|
|
|
178
196
|
|
|
179
197
|
def read_gzip_time(gzip_bytes: bytes) -> int:
|
|
@@ -187,17 +205,15 @@ def read_gzip_time(gzip_bytes: bytes) -> int:
|
|
|
187
205
|
def get_kvs_name(spider_name: str, max_length: int = 60) -> str:
|
|
188
206
|
"""Get the key value store name for a spider.
|
|
189
207
|
|
|
190
|
-
The key value store name is derived from the spider name by replacing all special characters
|
|
191
|
-
|
|
192
|
-
|
|
208
|
+
The key value store name is derived from the spider name by replacing all special characters with hyphens
|
|
209
|
+
and trimming leading and trailing hyphens. The resulting name is prefixed with 'httpcache-' and truncated
|
|
210
|
+
to the maximum length.
|
|
193
211
|
|
|
194
|
-
The documentation
|
|
195
|
-
[about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
|
|
212
|
+
The documentation [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
|
|
196
213
|
mentions that names can be up to 63 characters long, so the default max length is set to 60.
|
|
197
214
|
|
|
198
|
-
Such naming isn't unique per spider, but should be sufficiently unique for most use cases.
|
|
199
|
-
|
|
200
|
-
the listing in the Apify's console.
|
|
215
|
+
Such naming isn't unique per spider, but should be sufficiently unique for most use cases. The name
|
|
216
|
+
of the key-value store should indicate to which spider it belongs, e.g. in the listing in the Apify's console.
|
|
201
217
|
|
|
202
218
|
Args:
|
|
203
219
|
spider_name: Value of the Spider instance's name attribute.
|