apify 2.2.2b1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_actor.py +69 -10
- apify/_charging.py +318 -0
- apify/_configuration.py +10 -1
- apify/_models.py +58 -1
- apify/_utils.py +1 -1
- apify/apify_storage_client/_apify_storage_client.py +4 -0
- apify/scrapy/__init__.py +25 -4
- apify/scrapy/_actor_runner.py +26 -0
- apify/scrapy/_async_thread.py +122 -0
- apify/scrapy/_logging_config.py +55 -0
- apify/scrapy/middlewares/apify_proxy.py +9 -13
- apify/scrapy/pipelines/actor_dataset_push.py +7 -9
- apify/scrapy/requests.py +52 -79
- apify/scrapy/scheduler.py +64 -61
- apify/scrapy/utils.py +4 -33
- {apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/METADATA +2 -2
- {apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/RECORD +19 -15
- {apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/WHEEL +1 -1
- {apify-2.2.2b1.dist-info → apify-2.3.0.dist-info}/LICENSE +0 -0
apify/_actor.py
CHANGED
|
@@ -24,6 +24,7 @@ from crawlee.events import (
|
|
|
24
24
|
EventSystemInfoData,
|
|
25
25
|
)
|
|
26
26
|
|
|
27
|
+
from apify._charging import ChargeResult, ChargingManager, ChargingManagerImplementation
|
|
27
28
|
from apify._configuration import Configuration
|
|
28
29
|
from apify._consts import EVENT_LISTENERS_TIMEOUT
|
|
29
30
|
from apify._crypto import decrypt_input_secrets, load_private_key
|
|
@@ -55,10 +56,8 @@ MainReturnType = TypeVar('MainReturnType')
|
|
|
55
56
|
class _ActorType:
|
|
56
57
|
"""The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
|
|
57
58
|
|
|
58
|
-
_apify_client: ApifyClientAsync
|
|
59
|
-
_configuration: Configuration
|
|
60
|
-
_is_exiting = False
|
|
61
59
|
_is_rebooting = False
|
|
60
|
+
_is_any_instance_initialized = False
|
|
62
61
|
|
|
63
62
|
def __init__(
|
|
64
63
|
self,
|
|
@@ -76,6 +75,8 @@ class _ActorType:
|
|
|
76
75
|
be created.
|
|
77
76
|
configure_logging: Should the default logging configuration be configured?
|
|
78
77
|
"""
|
|
78
|
+
self._is_exiting = False
|
|
79
|
+
|
|
79
80
|
self._configuration = configuration or Configuration.get_global_configuration()
|
|
80
81
|
self._configure_logging = configure_logging
|
|
81
82
|
self._apify_client = self.new_client()
|
|
@@ -97,6 +98,8 @@ class _ActorType:
|
|
|
97
98
|
)
|
|
98
99
|
)
|
|
99
100
|
|
|
101
|
+
self._charging_manager = ChargingManagerImplementation(self._configuration, self._apify_client)
|
|
102
|
+
|
|
100
103
|
self._is_initialized = False
|
|
101
104
|
|
|
102
105
|
@ignore_docs
|
|
@@ -200,6 +203,12 @@ class _ActorType:
|
|
|
200
203
|
if self._is_initialized:
|
|
201
204
|
raise RuntimeError('The Actor was already initialized!')
|
|
202
205
|
|
|
206
|
+
if _ActorType._is_any_instance_initialized:
|
|
207
|
+
self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care')
|
|
208
|
+
|
|
209
|
+
# Make sure that the currently initialized instance is also available through the global `Actor` proxy
|
|
210
|
+
cast(Proxy, Actor).__wrapped__ = self
|
|
211
|
+
|
|
203
212
|
self._is_exiting = False
|
|
204
213
|
self._was_final_persist_state_emitted = False
|
|
205
214
|
|
|
@@ -221,8 +230,13 @@ class _ActorType:
|
|
|
221
230
|
# https://github.com/apify/apify-sdk-python/issues/146
|
|
222
231
|
|
|
223
232
|
await self._event_manager.__aenter__()
|
|
233
|
+
self.log.debug('Event manager initialized')
|
|
234
|
+
|
|
235
|
+
await self._charging_manager.__aenter__()
|
|
236
|
+
self.log.debug('Charging manager initialized')
|
|
224
237
|
|
|
225
238
|
self._is_initialized = True
|
|
239
|
+
_ActorType._is_any_instance_initialized = True
|
|
226
240
|
|
|
227
241
|
async def exit(
|
|
228
242
|
self,
|
|
@@ -262,6 +276,7 @@ class _ActorType:
|
|
|
262
276
|
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
|
|
263
277
|
|
|
264
278
|
await self._event_manager.__aexit__(None, None, None)
|
|
279
|
+
await self._charging_manager.__aexit__(None, None, None)
|
|
265
280
|
|
|
266
281
|
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
|
|
267
282
|
self._is_initialized = False
|
|
@@ -270,8 +285,8 @@ class _ActorType:
|
|
|
270
285
|
self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython')
|
|
271
286
|
elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508
|
|
272
287
|
self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test')
|
|
273
|
-
elif
|
|
274
|
-
self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running
|
|
288
|
+
elif os.getenv('SCRAPY_SETTINGS_MODULE'):
|
|
289
|
+
self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running with Scrapy')
|
|
275
290
|
else:
|
|
276
291
|
sys.exit(exit_code)
|
|
277
292
|
|
|
@@ -445,19 +460,46 @@ class _ActorType:
|
|
|
445
460
|
storage_client=storage_client,
|
|
446
461
|
)
|
|
447
462
|
|
|
448
|
-
|
|
463
|
+
@overload
|
|
464
|
+
async def push_data(self, data: dict | list[dict]) -> None: ...
|
|
465
|
+
@overload
|
|
466
|
+
async def push_data(self, data: dict | list[dict], charged_event_name: str) -> ChargeResult: ...
|
|
467
|
+
async def push_data(self, data: dict | list[dict], charged_event_name: str | None = None) -> ChargeResult | None:
|
|
449
468
|
"""Store an object or a list of objects to the default dataset of the current Actor run.
|
|
450
469
|
|
|
451
470
|
Args:
|
|
452
471
|
data: The data to push to the default dataset.
|
|
472
|
+
charged_event_name: If provided and if the Actor uses the pay-per-event pricing model,
|
|
473
|
+
the method will attempt to charge for the event for each pushed item.
|
|
453
474
|
"""
|
|
454
475
|
self._raise_if_not_initialized()
|
|
455
476
|
|
|
456
477
|
if not data:
|
|
457
|
-
return
|
|
478
|
+
return None
|
|
479
|
+
|
|
480
|
+
data = data if isinstance(data, list) else [data]
|
|
481
|
+
|
|
482
|
+
max_charged_count = (
|
|
483
|
+
self._charging_manager.calculate_max_event_charge_count_within_limit(charged_event_name)
|
|
484
|
+
if charged_event_name is not None
|
|
485
|
+
else None
|
|
486
|
+
)
|
|
458
487
|
|
|
459
488
|
dataset = await self.open_dataset()
|
|
460
|
-
|
|
489
|
+
|
|
490
|
+
if max_charged_count is not None and len(data) > max_charged_count:
|
|
491
|
+
# Push as many items as we can charge for
|
|
492
|
+
await dataset.push_data(data[:max_charged_count])
|
|
493
|
+
else:
|
|
494
|
+
await dataset.push_data(data)
|
|
495
|
+
|
|
496
|
+
if charged_event_name:
|
|
497
|
+
return await self._charging_manager.charge(
|
|
498
|
+
event_name=charged_event_name,
|
|
499
|
+
count=min(max_charged_count, len(data)) if max_charged_count is not None else len(data),
|
|
500
|
+
)
|
|
501
|
+
|
|
502
|
+
return None
|
|
461
503
|
|
|
462
504
|
async def get_input(self) -> Any:
|
|
463
505
|
"""Get the Actor input value from the default key-value store associated with the current Actor run."""
|
|
@@ -506,6 +548,23 @@ class _ActorType:
|
|
|
506
548
|
key_value_store = await self.open_key_value_store()
|
|
507
549
|
return await key_value_store.set_value(key, value, content_type=content_type)
|
|
508
550
|
|
|
551
|
+
def get_charging_manager(self) -> ChargingManager:
|
|
552
|
+
"""Retrieve the charging manager to access granular pricing information."""
|
|
553
|
+
self._raise_if_not_initialized()
|
|
554
|
+
return self._charging_manager
|
|
555
|
+
|
|
556
|
+
async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
|
|
557
|
+
"""Charge for a specified number of events - sub-operations of the Actor.
|
|
558
|
+
|
|
559
|
+
This is relevant only for the pay-per-event pricing model.
|
|
560
|
+
|
|
561
|
+
Args:
|
|
562
|
+
event_name: Name of the event to be charged for.
|
|
563
|
+
count: Number of events to charge for.
|
|
564
|
+
"""
|
|
565
|
+
self._raise_if_not_initialized()
|
|
566
|
+
return await self._charging_manager.charge(event_name, count)
|
|
567
|
+
|
|
509
568
|
@overload
|
|
510
569
|
def on(
|
|
511
570
|
self, event_name: Literal[Event.PERSIST_STATE], listener: EventListener[EventPersistStateData]
|
|
@@ -898,11 +957,11 @@ class _ActorType:
|
|
|
898
957
|
self.log.error('Actor.reboot() is only supported when running on the Apify platform.')
|
|
899
958
|
return
|
|
900
959
|
|
|
901
|
-
if
|
|
960
|
+
if _ActorType._is_rebooting:
|
|
902
961
|
self.log.debug('Actor is already rebooting, skipping the additional reboot call.')
|
|
903
962
|
return
|
|
904
963
|
|
|
905
|
-
|
|
964
|
+
_ActorType._is_rebooting = True
|
|
906
965
|
|
|
907
966
|
if not custom_after_sleep:
|
|
908
967
|
custom_after_sleep = self._configuration.metamorph_after_sleep
|
apify/_charging.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from decimal import Decimal
|
|
7
|
+
from typing import TYPE_CHECKING, Protocol, Union
|
|
8
|
+
|
|
9
|
+
from pydantic import TypeAdapter
|
|
10
|
+
|
|
11
|
+
from apify_shared.utils import ignore_docs
|
|
12
|
+
from crawlee._utils.context import ensure_context
|
|
13
|
+
|
|
14
|
+
from apify._models import ActorRun, PricingModel
|
|
15
|
+
from apify._utils import docs_group
|
|
16
|
+
from apify.log import logger
|
|
17
|
+
from apify.storages import Dataset
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from types import TracebackType
|
|
21
|
+
|
|
22
|
+
from apify_client import ApifyClientAsync
|
|
23
|
+
|
|
24
|
+
from apify._configuration import Configuration
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
run_validator: TypeAdapter[ActorRun | None] = TypeAdapter(Union[ActorRun, None])
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@docs_group('Interfaces')
|
|
31
|
+
class ChargingManager(Protocol):
|
|
32
|
+
"""Provides fine-grained access to pay-per-event functionality."""
|
|
33
|
+
|
|
34
|
+
async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
|
|
35
|
+
"""Charge for a specified number of events - sub-operations of the Actor.
|
|
36
|
+
|
|
37
|
+
This is relevant only for the pay-per-event pricing model.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
event_name: Name of the event to be charged for.
|
|
41
|
+
count: Number of events to charge for.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def calculate_total_charged_amount(self) -> Decimal:
|
|
45
|
+
"""Calculate the total amount of money charged for pay-per-event events so far."""
|
|
46
|
+
|
|
47
|
+
def calculate_max_event_charge_count_within_limit(self, event_name: str) -> int | None:
|
|
48
|
+
"""Calculate how many instances of an event can be charged before we reach the configured limit.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
event_name: Name of the inspected event.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def get_pricing_info(self) -> ActorPricingInfo:
|
|
55
|
+
"""Retrieve detailed information about the effective pricing of the current Actor run.
|
|
56
|
+
|
|
57
|
+
This can be used for instance when your code needs to support multiple pricing models in transition periods.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@docs_group('Data structures')
|
|
62
|
+
@dataclass(frozen=True)
|
|
63
|
+
class ChargeResult:
|
|
64
|
+
"""Result of the `ChargingManager.charge` method."""
|
|
65
|
+
|
|
66
|
+
event_charge_limit_reached: bool
|
|
67
|
+
"""If true, no more events of this type can be charged within the limit."""
|
|
68
|
+
|
|
69
|
+
charged_count: int
|
|
70
|
+
"""Total amount of charged events - may be lower than the requested amount."""
|
|
71
|
+
|
|
72
|
+
chargeable_within_limit: dict[str, int | None]
|
|
73
|
+
"""How many events of each known type can still be charged within the limit."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@docs_group('Data structures')
|
|
77
|
+
@dataclass
|
|
78
|
+
class ActorPricingInfo:
|
|
79
|
+
"""Result of the `ChargingManager.get_pricing_info` method."""
|
|
80
|
+
|
|
81
|
+
pricing_model: PricingModel | None
|
|
82
|
+
"""The currently effective pricing model."""
|
|
83
|
+
|
|
84
|
+
max_total_charge_usd: Decimal
|
|
85
|
+
"""A configured limit for the total charged amount - if you exceed it, you won't receive more money than this."""
|
|
86
|
+
|
|
87
|
+
is_pay_per_event: bool
|
|
88
|
+
"""A shortcut - true if the Actor runs with the pay-per-event pricing model."""
|
|
89
|
+
|
|
90
|
+
per_event_prices: dict[str, Decimal]
|
|
91
|
+
"""Price of every known event type."""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@ignore_docs
|
|
95
|
+
class ChargingManagerImplementation(ChargingManager):
|
|
96
|
+
"""Implementation of the `ChargingManager` Protocol - this is only meant to be instantiated internally."""
|
|
97
|
+
|
|
98
|
+
LOCAL_CHARGING_LOG_DATASET_NAME = 'charging_log'
|
|
99
|
+
|
|
100
|
+
def __init__(self, configuration: Configuration, client: ApifyClientAsync) -> None:
|
|
101
|
+
self._max_total_charge_usd = configuration.max_total_charge_usd or Decimal('inf')
|
|
102
|
+
self._is_at_home = configuration.is_at_home
|
|
103
|
+
self._actor_run_id = configuration.actor_run_id
|
|
104
|
+
self._purge_charging_log_dataset = configuration.purge_on_start
|
|
105
|
+
self._pricing_model: PricingModel | None = None
|
|
106
|
+
|
|
107
|
+
if configuration.test_pay_per_event:
|
|
108
|
+
if self._is_at_home:
|
|
109
|
+
raise ValueError(
|
|
110
|
+
'Using the ACTOR_TEST_PAY_PER_EVENT environment variable is only supported '
|
|
111
|
+
'in a local development environment'
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
self._pricing_model = 'PAY_PER_EVENT'
|
|
115
|
+
|
|
116
|
+
self._client = client
|
|
117
|
+
self._charging_log_dataset: Dataset | None = None
|
|
118
|
+
|
|
119
|
+
self._charging_state: dict[str, ChargingStateItem] = {}
|
|
120
|
+
self._pricing_info: dict[str, PricingInfoItem] = {}
|
|
121
|
+
|
|
122
|
+
self._not_ppe_warning_printed = False
|
|
123
|
+
self.active = False
|
|
124
|
+
|
|
125
|
+
async def __aenter__(self) -> None:
|
|
126
|
+
"""Initialize the charging manager - this is called by the `Actor` class and shouldn't be invoked manually."""
|
|
127
|
+
self.active = True
|
|
128
|
+
|
|
129
|
+
if self._is_at_home:
|
|
130
|
+
# Running on the Apify platform - fetch pricing info for the current run.
|
|
131
|
+
|
|
132
|
+
if self._actor_run_id is None:
|
|
133
|
+
raise RuntimeError('Actor run ID not found even though the Actor is running on Apify')
|
|
134
|
+
|
|
135
|
+
run = run_validator.validate_python(await self._client.run(self._actor_run_id).get())
|
|
136
|
+
if run is None:
|
|
137
|
+
raise RuntimeError('Actor run not found')
|
|
138
|
+
|
|
139
|
+
if run.pricing_info is not None:
|
|
140
|
+
self._pricing_model = run.pricing_info.pricing_model
|
|
141
|
+
|
|
142
|
+
if run.pricing_info.pricing_model == 'PAY_PER_EVENT':
|
|
143
|
+
for event_name, event_pricing in run.pricing_info.pricing_per_event.actor_charge_events.items():
|
|
144
|
+
self._pricing_info[event_name] = PricingInfoItem(
|
|
145
|
+
price=event_pricing.event_price_usd,
|
|
146
|
+
title=event_pricing.event_title,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
self._max_total_charge_usd = run.options.max_total_charge_usd or self._max_total_charge_usd
|
|
150
|
+
|
|
151
|
+
for event_name, count in (run.charged_event_counts or {}).items():
|
|
152
|
+
price = self._pricing_info.get(event_name, PricingInfoItem(Decimal(), title='')).price
|
|
153
|
+
self._charging_state[event_name] = ChargingStateItem(
|
|
154
|
+
charge_count=count,
|
|
155
|
+
total_charged_amount=count * price,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
if not self._is_at_home and self._pricing_model == 'PAY_PER_EVENT':
|
|
159
|
+
# We are not running on the Apify platform, but PPE is enabled for testing - open a dataset that
|
|
160
|
+
# will contain a log of all charge calls for debugging purposes.
|
|
161
|
+
|
|
162
|
+
if self._purge_charging_log_dataset:
|
|
163
|
+
dataset = await Dataset.open(name=self.LOCAL_CHARGING_LOG_DATASET_NAME)
|
|
164
|
+
await dataset.drop()
|
|
165
|
+
|
|
166
|
+
self._charging_log_dataset = await Dataset.open(name=self.LOCAL_CHARGING_LOG_DATASET_NAME)
|
|
167
|
+
|
|
168
|
+
async def __aexit__(
|
|
169
|
+
self,
|
|
170
|
+
exc_type: type[BaseException] | None,
|
|
171
|
+
exc_value: BaseException | None,
|
|
172
|
+
exc_traceback: TracebackType | None,
|
|
173
|
+
) -> None:
|
|
174
|
+
if not self.active:
|
|
175
|
+
raise RuntimeError('Exiting an uninitialized ChargingManager')
|
|
176
|
+
|
|
177
|
+
self.active = False
|
|
178
|
+
|
|
179
|
+
@ensure_context
|
|
180
|
+
async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
|
|
181
|
+
def calculate_chargeable() -> dict[str, int | None]:
|
|
182
|
+
"""Calculate the maximum number of events of each type that can be charged within the current budget."""
|
|
183
|
+
return {
|
|
184
|
+
event_name: self.calculate_max_event_charge_count_within_limit(event_name)
|
|
185
|
+
for event_name in self._pricing_info
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
# For runs that do not use the pay-per-event pricing model, just print a warning and return
|
|
189
|
+
if self._pricing_model != 'PAY_PER_EVENT':
|
|
190
|
+
if not self._not_ppe_warning_printed:
|
|
191
|
+
logger.warning(
|
|
192
|
+
'Ignored attempt to charge for an event - the Actor does not use the pay-per-event pricing'
|
|
193
|
+
)
|
|
194
|
+
self._not_ppe_warning_printed = True
|
|
195
|
+
|
|
196
|
+
return ChargeResult(
|
|
197
|
+
event_charge_limit_reached=False,
|
|
198
|
+
charged_count=0,
|
|
199
|
+
chargeable_within_limit=calculate_chargeable(),
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# START OF CRITICAL SECTION - no awaits here
|
|
203
|
+
|
|
204
|
+
# Determine the maximum amount of events that can be charged within the budget
|
|
205
|
+
charged_count = min(count, self.calculate_max_event_charge_count_within_limit(event_name) or count)
|
|
206
|
+
|
|
207
|
+
if charged_count == 0:
|
|
208
|
+
return ChargeResult(
|
|
209
|
+
event_charge_limit_reached=True,
|
|
210
|
+
charged_count=0,
|
|
211
|
+
chargeable_within_limit=calculate_chargeable(),
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
pricing_info = self._pricing_info.get(
|
|
215
|
+
event_name,
|
|
216
|
+
PricingInfoItem(
|
|
217
|
+
price=Decimal()
|
|
218
|
+
if self._is_at_home
|
|
219
|
+
else Decimal(
|
|
220
|
+
'1'
|
|
221
|
+
), # Use a nonzero price for local development so that the maximum budget can be reached,
|
|
222
|
+
title=f"Unknown event '{event_name}'",
|
|
223
|
+
),
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
# Update the charging state
|
|
227
|
+
self._charging_state.setdefault(event_name, ChargingStateItem(0, Decimal()))
|
|
228
|
+
self._charging_state[event_name].charge_count += charged_count
|
|
229
|
+
self._charging_state[event_name].total_charged_amount += charged_count * pricing_info.price
|
|
230
|
+
|
|
231
|
+
# END OF CRITICAL SECTION
|
|
232
|
+
|
|
233
|
+
# If running on the platform, call the charge endpoint
|
|
234
|
+
if self._is_at_home:
|
|
235
|
+
if self._actor_run_id is None:
|
|
236
|
+
raise RuntimeError('Actor run ID not configured')
|
|
237
|
+
|
|
238
|
+
if event_name in self._pricing_info:
|
|
239
|
+
await self._client.run(self._actor_run_id).charge(event_name, charged_count)
|
|
240
|
+
else:
|
|
241
|
+
logger.warning(f"Attempting to charge for an unknown event '{event_name}'")
|
|
242
|
+
|
|
243
|
+
# Log the charged operation (if enabled)
|
|
244
|
+
if self._charging_log_dataset:
|
|
245
|
+
await self._charging_log_dataset.push_data(
|
|
246
|
+
{
|
|
247
|
+
'event_name': event_name,
|
|
248
|
+
'event_title': pricing_info.title,
|
|
249
|
+
'event_price_usd': round(pricing_info.price, 3),
|
|
250
|
+
'charged_count': charged_count,
|
|
251
|
+
'timestamp': datetime.now(timezone.utc).isoformat(),
|
|
252
|
+
}
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# If it is not possible to charge the full amount, log that fact
|
|
256
|
+
if charged_count < count:
|
|
257
|
+
subject = 'instance' if count == 1 else 'instances'
|
|
258
|
+
logger.info(
|
|
259
|
+
f"Charging {count} ${subject} of '{event_name}' event would exceed max_total_charge_usd "
|
|
260
|
+
'- only {charged_count} events were charged'
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
max_charge_count = self.calculate_max_event_charge_count_within_limit(event_name)
|
|
264
|
+
|
|
265
|
+
return ChargeResult(
|
|
266
|
+
event_charge_limit_reached=max_charge_count is not None and max_charge_count <= 0,
|
|
267
|
+
charged_count=charged_count,
|
|
268
|
+
chargeable_within_limit=calculate_chargeable(),
|
|
269
|
+
)
|
|
270
|
+
|
|
271
|
+
@ensure_context
|
|
272
|
+
def calculate_total_charged_amount(self) -> Decimal:
|
|
273
|
+
return sum(
|
|
274
|
+
(item.total_charged_amount for item in self._charging_state.values()),
|
|
275
|
+
start=Decimal(),
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
@ensure_context
|
|
279
|
+
def calculate_max_event_charge_count_within_limit(self, event_name: str) -> int | None:
|
|
280
|
+
pricing_info = self._pricing_info.get(event_name)
|
|
281
|
+
|
|
282
|
+
if pricing_info is not None:
|
|
283
|
+
price = pricing_info.price
|
|
284
|
+
elif not self._is_at_home:
|
|
285
|
+
price = Decimal('1') # Use a nonzero price for local development so that the maximum budget can be reached
|
|
286
|
+
else:
|
|
287
|
+
price = Decimal()
|
|
288
|
+
|
|
289
|
+
if not price:
|
|
290
|
+
return None
|
|
291
|
+
|
|
292
|
+
result = (self._max_total_charge_usd - self.calculate_total_charged_amount()) / price
|
|
293
|
+
return math.floor(result) if result.is_finite() else None
|
|
294
|
+
|
|
295
|
+
@ensure_context
|
|
296
|
+
def get_pricing_info(self) -> ActorPricingInfo:
|
|
297
|
+
return ActorPricingInfo(
|
|
298
|
+
pricing_model=self._pricing_model,
|
|
299
|
+
is_pay_per_event=self._pricing_model == 'PAY_PER_EVENT',
|
|
300
|
+
max_total_charge_usd=self._max_total_charge_usd
|
|
301
|
+
if self._max_total_charge_usd is not None
|
|
302
|
+
else Decimal('inf'),
|
|
303
|
+
per_event_prices={
|
|
304
|
+
event_name: pricing_info.price for event_name, pricing_info in self._pricing_info.items()
|
|
305
|
+
},
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
@dataclass
|
|
310
|
+
class ChargingStateItem:
|
|
311
|
+
charge_count: int
|
|
312
|
+
total_charged_amount: Decimal
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
@dataclass
|
|
316
|
+
class PricingInfoItem:
|
|
317
|
+
price: Decimal
|
|
318
|
+
title: str
|
apify/_configuration.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from datetime import datetime, timedelta
|
|
4
|
+
from decimal import Decimal
|
|
4
5
|
from logging import getLogger
|
|
5
6
|
from typing import Annotated, Any
|
|
6
7
|
|
|
@@ -212,7 +213,7 @@ class Configuration(CrawleeConfiguration):
|
|
|
212
213
|
] = None
|
|
213
214
|
|
|
214
215
|
max_total_charge_usd: Annotated[
|
|
215
|
-
|
|
216
|
+
Decimal | None,
|
|
216
217
|
Field(
|
|
217
218
|
alias='actor_max_total_charge_usd',
|
|
218
219
|
description='For pay-per-event Actors, the user-set limit on total charges. Do not exceed this limit',
|
|
@@ -220,6 +221,14 @@ class Configuration(CrawleeConfiguration):
|
|
|
220
221
|
BeforeValidator(lambda val: val or None),
|
|
221
222
|
] = None
|
|
222
223
|
|
|
224
|
+
test_pay_per_event: Annotated[
|
|
225
|
+
bool,
|
|
226
|
+
Field(
|
|
227
|
+
alias='actor_test_pay_per_event',
|
|
228
|
+
description='Enable pay-per-event functionality for local development',
|
|
229
|
+
),
|
|
230
|
+
] = False
|
|
231
|
+
|
|
223
232
|
meta_origin: Annotated[
|
|
224
233
|
str | None,
|
|
225
234
|
Field(
|
apify/_models.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from datetime import datetime, timedelta
|
|
4
|
-
from
|
|
4
|
+
from decimal import Decimal
|
|
5
|
+
from typing import TYPE_CHECKING, Annotated, Literal
|
|
5
6
|
|
|
6
7
|
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
|
|
7
8
|
|
|
@@ -11,6 +12,9 @@ from crawlee._utils.urls import validate_http_url
|
|
|
11
12
|
|
|
12
13
|
from apify._utils import docs_group
|
|
13
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from typing_extensions import TypeAlias
|
|
17
|
+
|
|
14
18
|
|
|
15
19
|
@docs_group('Data structures')
|
|
16
20
|
class Webhook(BaseModel):
|
|
@@ -67,6 +71,7 @@ class ActorRunOptions(BaseModel):
|
|
|
67
71
|
timeout: Annotated[timedelta, Field(alias='timeoutSecs')]
|
|
68
72
|
memory_mbytes: Annotated[int, Field(alias='memoryMbytes')]
|
|
69
73
|
disk_mbytes: Annotated[int, Field(alias='diskMbytes')]
|
|
74
|
+
max_total_charge_usd: Annotated[Decimal | None, Field(alias='maxTotalChargeUsd')] = None
|
|
70
75
|
|
|
71
76
|
|
|
72
77
|
@docs_group('Data structures')
|
|
@@ -115,3 +120,55 @@ class ActorRun(BaseModel):
|
|
|
115
120
|
usage: Annotated[ActorRunUsage | None, Field(alias='usage')] = None
|
|
116
121
|
usage_total_usd: Annotated[float | None, Field(alias='usageTotalUsd')] = None
|
|
117
122
|
usage_usd: Annotated[ActorRunUsage | None, Field(alias='usageUsd')] = None
|
|
123
|
+
pricing_info: Annotated[
|
|
124
|
+
FreeActorPricingInfo
|
|
125
|
+
| FlatPricePerMonthActorPricingInfo
|
|
126
|
+
| PricePerDatasetItemActorPricingInfo
|
|
127
|
+
| PayPerEventActorPricingInfo
|
|
128
|
+
| None,
|
|
129
|
+
Field(alias='pricingInfo', discriminator='pricing_model'),
|
|
130
|
+
] = None
|
|
131
|
+
charged_event_counts: Annotated[
|
|
132
|
+
dict[str, int] | None,
|
|
133
|
+
Field(alias='chargedEventCounts'),
|
|
134
|
+
] = None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class FreeActorPricingInfo(BaseModel):
|
|
138
|
+
pricing_model: Annotated[Literal['FREE'], Field(alias='pricingModel')]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class FlatPricePerMonthActorPricingInfo(BaseModel):
|
|
142
|
+
pricing_model: Annotated[Literal['FLAT_PRICE_PER_MONTH'], Field(alias='pricingModel')]
|
|
143
|
+
trial_minutes: Annotated[int | None, Field(alias='trialMinutes')] = None
|
|
144
|
+
price_per_unit_usd: Annotated[Decimal, Field(alias='pricePerUnitUsd')]
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class PricePerDatasetItemActorPricingInfo(BaseModel):
|
|
148
|
+
pricing_model: Annotated[Literal['PRICE_PER_DATASET_ITEM'], Field(alias='pricingModel')]
|
|
149
|
+
unit_name: Annotated[str | None, Field(alias='unitName')] = None
|
|
150
|
+
price_per_unit_usd: Annotated[Decimal, Field(alias='pricePerUnitUsd')]
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class ActorChargeEvent(BaseModel):
|
|
154
|
+
event_price_usd: Annotated[Decimal, Field(alias='eventPriceUsd')]
|
|
155
|
+
event_title: Annotated[str, Field(alias='eventTitle')]
|
|
156
|
+
event_description: Annotated[str | None, Field(alias='eventDescription')] = None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class PricingPerEvent(BaseModel):
|
|
160
|
+
actor_charge_events: Annotated[dict[str, ActorChargeEvent], Field(alias='actorChargeEvents')]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class PayPerEventActorPricingInfo(BaseModel):
|
|
164
|
+
pricing_model: Annotated[Literal['PAY_PER_EVENT'], Field(alias='pricingModel')]
|
|
165
|
+
pricing_per_event: Annotated[PricingPerEvent, Field(alias='pricingPerEvent')]
|
|
166
|
+
minimal_max_total_charge_usd: Annotated[Decimal | None, Field(alias='minimalMaxTotalChargeUsd')] = None
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
PricingModel: TypeAlias = Literal[
|
|
170
|
+
'FREE',
|
|
171
|
+
'FLAT_PRICE_PER_MONTH',
|
|
172
|
+
'PRICE_PER_DATASET_ITEM',
|
|
173
|
+
'PAY_PER_EVENT',
|
|
174
|
+
]
|
apify/_utils.py
CHANGED
|
@@ -27,7 +27,7 @@ def is_running_in_ipython() -> bool:
|
|
|
27
27
|
return getattr(builtins, '__IPYTHON__', False)
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Errors', 'Functions']
|
|
30
|
+
GroupName = Literal['Classes', 'Abstract classes', 'Interfaces', 'Data structures', 'Errors', 'Functions']
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001
|
apify/scrapy/__init__.py
CHANGED
|
@@ -1,11 +1,32 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
3
|
-
|
|
1
|
+
from crawlee._utils.try_import import install_import_hook as _install_import_hook
|
|
2
|
+
from crawlee._utils.try_import import try_import as _try_import
|
|
3
|
+
|
|
4
|
+
_install_import_hook(__name__)
|
|
5
|
+
|
|
6
|
+
# The following imports use try_import to handle optional dependencies, as they may not always be available.
|
|
7
|
+
|
|
8
|
+
with _try_import(__name__, 'run_scrapy_actor'):
|
|
9
|
+
from ._actor_runner import run_scrapy_actor
|
|
10
|
+
|
|
11
|
+
with _try_import(__name__, 'initialize_logging'):
|
|
12
|
+
from ._logging_config import initialize_logging
|
|
13
|
+
|
|
14
|
+
with _try_import(__name__, 'to_apify_request', 'to_scrapy_request'):
|
|
15
|
+
from .requests import to_apify_request, to_scrapy_request
|
|
16
|
+
|
|
17
|
+
with _try_import(__name__, 'ApifyScheduler'):
|
|
18
|
+
from .scheduler import ApifyScheduler
|
|
19
|
+
|
|
20
|
+
with _try_import(__name__, 'apply_apify_settings', 'get_basic_auth_header'):
|
|
21
|
+
from .utils import apply_apify_settings, get_basic_auth_header
|
|
22
|
+
|
|
4
23
|
|
|
5
24
|
__all__ = [
|
|
6
25
|
'ApifyScheduler',
|
|
26
|
+
'apply_apify_settings',
|
|
7
27
|
'get_basic_auth_header',
|
|
8
|
-
'
|
|
28
|
+
'initialize_logging',
|
|
29
|
+
'run_scrapy_actor',
|
|
9
30
|
'to_apify_request',
|
|
10
31
|
'to_scrapy_request',
|
|
11
32
|
]
|
|
apify/scrapy/_actor_runner.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from twisted.internet.defer import Deferred, ensureDeferred
|
|
7
|
+
from twisted.internet.task import react
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Coroutine
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
async def _run_coro_as_deferred(coro: Coroutine) -> None:
|
|
14
|
+
"""Wrap the given asyncio coroutine in a Task and await its result as a Twisted Deferred."""
|
|
15
|
+
task = asyncio.ensure_future(coro)
|
|
16
|
+
await Deferred.fromFuture(task)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def run_scrapy_actor(coro: Coroutine) -> None:
|
|
20
|
+
"""Start Twisted's reactor and execute the provided Actor coroutine.
|
|
21
|
+
|
|
22
|
+
This function initiates the Twisted reactor and runs the given asyncio coroutine (typically the
|
|
23
|
+
Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted event loops,
|
|
24
|
+
enabling the Apify and Scrapy integration to work together.
|
|
25
|
+
"""
|
|
26
|
+
react(lambda _: ensureDeferred(_run_coro_as_deferred(coro)))
|