apify 3.4.2b5__tar.gz → 3.4.2b7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {apify-3.4.2b5 → apify-3.4.2b7}/CHANGELOG.md +2 -0
  2. {apify-3.4.2b5 → apify-3.4.2b7}/PKG-INFO +1 -1
  3. {apify-3.4.2b5 → apify-3.4.2b7}/pyproject.toml +1 -1
  4. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_actor.py +9 -1
  5. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_charging.py +33 -20
  6. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/events/_types.py +28 -9
  7. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/request_loaders/_apify_request_list.py +6 -3
  8. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/_logging_config.py +24 -6
  9. apify-3.4.2b7/src/apify/scrapy/_serialization.py +138 -0
  10. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/extensions/_httpcache.py +49 -33
  11. apify-3.4.2b7/src/apify/scrapy/requests.py +216 -0
  12. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/scheduler.py +12 -8
  13. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_models.py +25 -11
  14. apify-3.4.2b5/src/apify/scrapy/requests.py +0 -164
  15. {apify-3.4.2b5 → apify-3.4.2b7}/.gitignore +0 -0
  16. {apify-3.4.2b5 → apify-3.4.2b7}/CONTRIBUTING.md +0 -0
  17. {apify-3.4.2b5 → apify-3.4.2b7}/LICENSE +0 -0
  18. {apify-3.4.2b5 → apify-3.4.2b7}/README.md +0 -0
  19. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/__init__.py +0 -0
  20. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_configuration.py +0 -0
  21. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_consts.py +0 -0
  22. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_crypto.py +0 -0
  23. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_proxy_configuration.py +0 -0
  24. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_utils.py +0 -0
  25. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/_webhook.py +0 -0
  26. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/events/__init__.py +0 -0
  27. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/events/_apify_event_manager.py +0 -0
  28. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/events/py.typed +0 -0
  29. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/log.py +0 -0
  30. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/py.typed +0 -0
  31. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/request_loaders/__init__.py +0 -0
  32. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/request_loaders/py.typed +0 -0
  33. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/__init__.py +0 -0
  34. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/_actor_runner.py +0 -0
  35. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/_async_thread.py +0 -0
  36. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/extensions/__init__.py +0 -0
  37. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/middlewares/__init__.py +0 -0
  38. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
  39. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/middlewares/py.typed +0 -0
  40. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/pipelines/__init__.py +0 -0
  41. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
  42. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/pipelines/py.typed +0 -0
  43. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/py.typed +0 -0
  44. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/scrapy/utils.py +0 -0
  45. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/__init__.py +0 -0
  46. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/__init__.py +0 -0
  47. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_alias_resolving.py +0 -0
  48. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_api_client_creation.py +0 -0
  49. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
  50. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
  51. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_request_queue_client.py +0 -0
  52. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +0 -0
  53. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_request_queue_single_client.py +0 -0
  54. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
  55. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/_utils.py +0 -0
  56. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_apify/py.typed +0 -0
  57. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_file_system/__init__.py +0 -0
  58. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
  59. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
  60. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
  61. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
  62. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
  63. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
  64. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storage_clients/py.typed +0 -0
  65. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storages/__init__.py +0 -0
  66. {apify-3.4.2b5 → apify-3.4.2b7}/src/apify/storages/py.typed +0 -0
@@ -9,6 +9,8 @@ All notable changes to this project will be documented in this file.
9
9
 
10
10
  - **scrapy:** Correct proxy middleware exception log and import ([#953](https://github.com/apify/apify-sdk-python/pull/953)) ([5bd6eb9](https://github.com/apify/apify-sdk-python/commit/5bd6eb9843d90844cec083372e932413bceedec9)) by [@vdusek](https://github.com/vdusek)
11
11
  - **scrapy:** Skip a request that fails to convert instead of crashing the run ([#952](https://github.com/apify/apify-sdk-python/pull/952)) ([db9444f](https://github.com/apify/apify-sdk-python/commit/db9444faeb0158c29aa394121cf733ff2e843f28)) by [@vdusek](https://github.com/vdusek)
12
+ - **scrapy:** [**breaking**] Serialize requests and HTTP cache as JSON instead of pickle ([#951](https://github.com/apify/apify-sdk-python/pull/951)) ([a87e8d1](https://github.com/apify/apify-sdk-python/commit/a87e8d1597478b4f12fd5bb9b379f65f637d8e96)) by [@vdusek](https://github.com/vdusek)
13
+ - **scrapy:** Make logging configuration idempotent ([#954](https://github.com/apify/apify-sdk-python/pull/954)) ([2cc5602](https://github.com/apify/apify-sdk-python/commit/2cc5602b741b93c81f264d4e09e0d9bcfc7200f2)) by [@vdusek](https://github.com/vdusek)
12
14
 
13
15
  ### 🚜 Refactor
14
16
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 3.4.2b5
3
+ Version: 3.4.2b7
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "apify"
7
- version = "3.4.2b5"
7
+ version = "3.4.2b7"
8
8
  description = "Apify SDK for Python"
9
9
  authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
10
10
  license = { file = "LICENSE" }
@@ -699,7 +699,15 @@ class _ActorType:
699
699
 
700
700
  @_ensure_context
701
701
  async def get_input(self) -> Any:
702
- """Get the Actor input value from the default key-value store associated with the current Actor run."""
702
+ """Get the Actor input value from the default key-value store associated with the current Actor run.
703
+
704
+ The input is the deserialized contents of the input record (the `INPUT` key by default), so it is typically
705
+ a `dict` keyed by the fields declared in the Actor's input schema. Any secret input fields are decrypted to
706
+ plaintext before being returned.
707
+
708
+ Returns:
709
+ The Actor input, usually a `dict` of input fields, or `None` if the Actor has no input.
710
+ """
703
711
  input_value = await self.get_value(self.configuration.input_key)
704
712
  input_secrets_private_key = self.configuration.input_secrets_private_key_file
705
713
  input_secrets_key_passphrase = self.configuration.input_secrets_private_key_passphrase
@@ -5,9 +5,10 @@ from contextvars import ContextVar
5
5
  from dataclasses import dataclass
6
6
  from datetime import UTC, datetime
7
7
  from decimal import Decimal
8
- from typing import TYPE_CHECKING, Annotated, Literal, Protocol, TypedDict
8
+ from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
9
9
 
10
- from pydantic import Field
10
+ from pydantic import ConfigDict
11
+ from pydantic.alias_generators import to_camel
11
12
 
12
13
  import apify_client._models as _client_models
13
14
  from apify_client._models import ActorChargeEvent as ClientActorChargeEvent
@@ -56,7 +57,9 @@ _ensure_context = ensure_context('active')
56
57
  class ActorChargeEvent(ClientActorChargeEvent):
57
58
  """Definition of a single chargeable event in the pay-per-event pricing model."""
58
59
 
59
- event_description: Annotated[str | None, Field(alias='eventDescription')] = None
60
+ model_config = ConfigDict(alias_generator=to_camel)
61
+
62
+ event_description: str | None = None
60
63
  """Human-readable description of the event.
61
64
 
62
65
  Required in apify-client but omitted from the env var, so it is relaxed to optional.
@@ -67,7 +70,9 @@ class ActorChargeEvent(ClientActorChargeEvent):
67
70
  class PricingPerEvent(ClientPricingPerEvent):
68
71
  """Pay-per-event pricing details - the chargeable events and their prices."""
69
72
 
70
- actor_charge_events: Annotated[dict[str, ActorChargeEvent] | None, Field(alias='actorChargeEvents')] = None
73
+ model_config = ConfigDict(alias_generator=to_camel)
74
+
75
+ actor_charge_events: dict[str, ActorChargeEvent] | None = None
71
76
  """Mapping of event name to its charge definition."""
72
77
 
73
78
 
@@ -75,13 +80,15 @@ class PricingPerEvent(ClientPricingPerEvent):
75
80
  class FreeActorPricingInfo(ClientFree):
76
81
  """Pricing info for an Actor offered free of charge."""
77
82
 
78
- apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
83
+ model_config = ConfigDict(alias_generator=to_camel)
84
+
85
+ apify_margin_percentage: float | None = None
79
86
  """Apify's margin on the price, as a percentage."""
80
87
 
81
- created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
88
+ created_at: datetime | None = None
82
89
  """Timestamp when this pricing info was created."""
83
90
 
84
- started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
91
+ started_at: datetime | None = None
85
92
  """Timestamp when this pricing became effective."""
86
93
 
87
94
 
@@ -89,19 +96,21 @@ class FreeActorPricingInfo(ClientFree):
89
96
  class FlatPricePerMonthActorPricingInfo(ClientFlatPricePerMonth):
90
97
  """Pricing info for an Actor billed at a flat monthly price."""
91
98
 
92
- apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
99
+ model_config = ConfigDict(alias_generator=to_camel)
100
+
101
+ apify_margin_percentage: float | None = None
93
102
  """Apify's margin on the price, as a percentage."""
94
103
 
95
- created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
104
+ created_at: datetime | None = None
96
105
  """Timestamp when this pricing info was created."""
97
106
 
98
- started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
107
+ started_at: datetime | None = None
99
108
  """Timestamp when this pricing became effective."""
100
109
 
101
- trial_minutes: Annotated[int | None, Field(alias='trialMinutes')] = None
110
+ trial_minutes: int | None = None
102
111
  """Length of the free trial period, in minutes."""
103
112
 
104
- price_per_unit_usd: Annotated[float | None, Field(alias='pricePerUnitUsd')] = None
113
+ price_per_unit_usd: float | None = None
105
114
  """Price per unit, in USD."""
106
115
 
107
116
 
@@ -109,16 +118,18 @@ class FlatPricePerMonthActorPricingInfo(ClientFlatPricePerMonth):
109
118
  class PricePerDatasetItemActorPricingInfo(ClientPricePerDatasetItem):
110
119
  """Pricing info for an Actor billed per dataset item produced."""
111
120
 
112
- apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
121
+ model_config = ConfigDict(alias_generator=to_camel)
122
+
123
+ apify_margin_percentage: float | None = None
113
124
  """Apify's margin on the price, as a percentage."""
114
125
 
115
- created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
126
+ created_at: datetime | None = None
116
127
  """Timestamp when this pricing info was created."""
117
128
 
118
- started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
129
+ started_at: datetime | None = None
119
130
  """Timestamp when this pricing became effective."""
120
131
 
121
- unit_name: Annotated[str | None, Field(alias='unitName')] = None
132
+ unit_name: str | None = None
122
133
  """Name of the billed unit."""
123
134
 
124
135
 
@@ -126,16 +137,18 @@ class PricePerDatasetItemActorPricingInfo(ClientPricePerDatasetItem):
126
137
  class PayPerEventActorPricingInfo(ClientPayPerEvent):
127
138
  """Pricing info for an Actor billed per charged event."""
128
139
 
129
- apify_margin_percentage: Annotated[float | None, Field(alias='apifyMarginPercentage')] = None
140
+ model_config = ConfigDict(alias_generator=to_camel)
141
+
142
+ apify_margin_percentage: float | None = None
130
143
  """Apify's margin on the price, as a percentage."""
131
144
 
132
- created_at: Annotated[datetime | None, Field(alias='createdAt')] = None
145
+ created_at: datetime | None = None
133
146
  """Timestamp when this pricing info was created."""
134
147
 
135
- started_at: Annotated[datetime | None, Field(alias='startedAt')] = None
148
+ started_at: datetime | None = None
136
149
  """Timestamp when this pricing became effective."""
137
150
 
138
- pricing_per_event: Annotated[PricingPerEvent, Field(alias='pricingPerEvent')]
151
+ pricing_per_event: PricingPerEvent
139
152
  """The pay-per-event pricing details."""
140
153
 
141
154
 
@@ -3,7 +3,8 @@ from __future__ import annotations
3
3
  from datetime import datetime
4
4
  from typing import Annotated, Any, Literal
5
5
 
6
- from pydantic import BaseModel, Field
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+ from pydantic.alias_generators import to_camel
7
8
 
8
9
  from crawlee.events._types import (
9
10
  Event,
@@ -29,28 +30,30 @@ This is the Apify-specific subset of [`Event`][crawlee.events.Event] — for the
29
30
  class SystemInfoEventData(BaseModel):
30
31
  """Resource usage metrics carried by a `systemInfo` event."""
31
32
 
32
- mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
33
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
34
+
35
+ mem_avg_bytes: float
33
36
  """Average memory usage over the measured interval, in bytes."""
34
37
 
35
- mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
38
+ mem_current_bytes: float
36
39
  """Current memory usage, in bytes."""
37
40
 
38
- mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
41
+ mem_max_bytes: float
39
42
  """Peak memory usage observed so far, in bytes."""
40
43
 
41
- cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
44
+ cpu_avg_usage: float
42
45
  """Average CPU usage over the measured interval, in percent."""
43
46
 
44
- cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
47
+ cpu_max_usage: float
45
48
  """Peak CPU usage observed so far, in percent."""
46
49
 
47
- cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
50
+ cpu_current_usage: float
48
51
  """Current CPU usage, in percent."""
49
52
 
50
- is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
53
+ is_cpu_overloaded: bool
51
54
  """Whether the CPU is currently overloaded."""
52
55
 
53
- created_at: Annotated[datetime, Field(alias='createdAt')]
56
+ created_at: datetime
54
57
  """Timestamp when the metrics were collected."""
55
58
 
56
59
  def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
@@ -73,6 +76,8 @@ class SystemInfoEventData(BaseModel):
73
76
  class PersistStateEvent(BaseModel):
74
77
  """A `persistState` event instructing the Actor to persist its state."""
75
78
 
79
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
80
+
76
81
  name: Literal[Event.PERSIST_STATE]
77
82
  """The event name."""
78
83
 
@@ -84,6 +89,8 @@ class PersistStateEvent(BaseModel):
84
89
  class SystemInfoEvent(BaseModel):
85
90
  """A `systemInfo` event carrying the Actor's resource usage metrics."""
86
91
 
92
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
93
+
87
94
  name: Literal[Event.SYSTEM_INFO]
88
95
  """The event name."""
89
96
 
@@ -95,6 +102,8 @@ class SystemInfoEvent(BaseModel):
95
102
  class MigratingEvent(BaseModel):
96
103
  """A `migrating` event signalling the Actor is about to be migrated to another host."""
97
104
 
105
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
106
+
98
107
  name: Literal[Event.MIGRATING]
99
108
  """The event name."""
100
109
 
@@ -106,6 +115,8 @@ class MigratingEvent(BaseModel):
106
115
  class AbortingEvent(BaseModel):
107
116
  """An `aborting` event signalling the Actor run is being aborted."""
108
117
 
118
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
119
+
109
120
  name: Literal[Event.ABORTING]
110
121
  """The event name."""
111
122
 
@@ -117,6 +128,8 @@ class AbortingEvent(BaseModel):
117
128
  class ExitEvent(BaseModel):
118
129
  """An `exit` event signalling the Actor process is about to exit."""
119
130
 
131
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
132
+
120
133
  name: Literal[Event.EXIT]
121
134
  """The event name."""
122
135
 
@@ -128,6 +141,8 @@ class ExitEvent(BaseModel):
128
141
  class EventWithoutData(BaseModel):
129
142
  """A framework-level event that carries no payload (e.g. browser and page lifecycle events)."""
130
143
 
144
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
145
+
131
146
  name: Literal[
132
147
  Event.SESSION_RETIRED,
133
148
  Event.BROWSER_LAUNCHED,
@@ -146,6 +161,8 @@ class EventWithoutData(BaseModel):
146
161
  class DeprecatedEvent(BaseModel):
147
162
  """A deprecated event kept for backward compatibility (e.g. `cpuInfo`)."""
148
163
 
164
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
165
+
149
166
  name: Literal['cpuInfo']
150
167
  """The event name."""
151
168
 
@@ -157,6 +174,8 @@ class DeprecatedEvent(BaseModel):
157
174
  class UnknownEvent(BaseModel):
158
175
  """A fallback for any event whose name is not recognized by the SDK."""
159
176
 
177
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
178
+
160
179
  name: str
161
180
  """The event name."""
162
181
 
@@ -5,7 +5,8 @@ import re
5
5
  from itertools import chain
6
6
  from typing import Annotated, Any
7
7
 
8
- from pydantic import BaseModel, Field, TypeAdapter
8
+ from pydantic import BaseModel, ConfigDict, Field, TypeAdapter
9
+ from pydantic.alias_generators import to_camel
9
10
 
10
11
  from crawlee._types import HttpMethod
11
12
  from crawlee.http_clients import HttpClient, ImpitHttpClient
@@ -20,14 +21,16 @@ URL_NO_COMMAS_REGEX = re.compile(
20
21
 
21
22
 
22
23
  class _RequestDetails(BaseModel):
24
+ model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
25
+
23
26
  method: HttpMethod = 'GET'
24
27
  payload: str = ''
25
28
  headers: Annotated[dict[str, str], Field(default_factory=dict)]
26
- user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')]
29
+ user_data: Annotated[dict[str, str], Field(default_factory=dict)]
27
30
 
28
31
 
29
32
  class _RequestsFromUrlInput(_RequestDetails):
30
- requests_from_url: str = Field(alias='requestsFromUrl')
33
+ requests_from_url: str
31
34
 
32
35
 
33
36
  class _SimpleUrlInput(_RequestDetails):
@@ -13,6 +13,11 @@ _PRIMARY_LOGGERS = ['apify', 'apify_client', 'scrapy']
13
13
  _SUPPLEMENTAL_LOGGERS = ['filelock', 'hpack', 'httpcore', 'protego', 'twisted']
14
14
  _ALL_LOGGERS = _PRIMARY_LOGGERS + _SUPPLEMENTAL_LOGGERS
15
15
 
16
+ # Mutable state shared with the Scrapy monkey-patch below. `initialize_logging` refreshes
17
+ # `level`/`handler` on each call; the patch (installed once) reads them so it always applies the
18
+ # latest configuration rather than values captured the first time it ran.
19
+ _state: dict[str, Any] = {'level': 'INFO', 'handler': None, 'patched': False}
20
+
16
21
 
17
22
  def _configure_logger(name: str | None, logging_level: str, handler: logging.Handler) -> None:
18
23
  """Clear and reconfigure the logger."""
@@ -23,26 +28,39 @@ def _configure_logger(name: str | None, logging_level: str, handler: logging.Han
23
28
  logger.propagate = False
24
29
 
25
30
 
31
+ def _configure_all_loggers() -> None:
32
+ """Apply the Apify handler and level to the root logger and all defined loggers."""
33
+ handler = _state['handler']
34
+ if handler is None:
35
+ return
36
+ for logger_name in [None, *_ALL_LOGGERS]:
37
+ _configure_logger(logger_name, _state['level'], handler)
38
+
39
+
26
40
  def initialize_logging() -> None:
27
41
  """Configure logging for Apify Actors and adjust Scrapy's logging settings."""
28
42
  # Retrieve Scrapy project settings and determine the logging level.
29
43
  settings = get_project_settings()
30
- logging_level = settings.get('LOG_LEVEL', 'INFO') # Default to INFO.
44
+ _state['level'] = settings.get('LOG_LEVEL', 'INFO') # Default to INFO.
31
45
 
32
46
  # Create a custom handler with the Apify log formatter.
33
47
  handler = logging.StreamHandler()
34
48
  handler.setFormatter(ActorLogFormatter(include_logger_name=True))
49
+ _state['handler'] = handler
35
50
 
36
51
  # Configure the root logger and all other defined loggers.
37
- for logger_name in [None, *_ALL_LOGGERS]:
38
- _configure_logger(logger_name, logging_level, handler)
52
+ _configure_all_loggers()
53
+
54
+ # Monkey-patch Scrapy's logging to re-apply our settings whenever it reconfigures logging.
55
+ # Install the wrapper at most once, otherwise repeated calls would nest wrappers.
56
+ if _state['patched']:
57
+ return
39
58
 
40
- # Monkey-patch Scrapy's logging configuration to re-apply our settings.
41
59
  original_configure_logging = scrapy_logging.configure_logging
42
60
 
43
61
  def new_configure_logging(*args: Any, **kwargs: Any) -> None:
44
62
  original_configure_logging(*args, **kwargs)
45
- for logger_name in [None, *_ALL_LOGGERS]:
46
- _configure_logger(logger_name, logging_level, handler)
63
+ _configure_all_loggers()
47
64
 
48
65
  scrapy_logging.configure_logging = new_configure_logging # ty: ignore[invalid-assignment]
66
+ _state['patched'] = True
@@ -0,0 +1,138 @@
1
+ """JSON serialization of Scrapy requests and cached responses for storage on the Apify platform.
2
+
3
+ Scrapy requests and cached responses are stored in the Apify request queue and key-value store which hold JSON,
4
+ so they are serialized as JSON here rather than pickled.
5
+
6
+ Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable; both sit at fixed keys
7
+ and are base64-encoded in place. A `str` `body` is encoded as its UTF-8 bytes and comes back as `bytes`, matching
8
+ Scrapy, which always stores `body` as `bytes`. Pydantic models such as Crawlee's `UserData` are dumped via
9
+ `model_dump()`. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise
10
+ serialization fails with a clear error naming the offending value. No in-band sentinel is used, so no user value
11
+ can collide with the encoding.
12
+
13
+ Known limitations of the pickle -> JSON switch (a documented breaking change): JSON has fewer types than pickle,
14
+ so values in `meta`/`cb_kwargs` are subject to JSON's coercions. A `tuple` round-trips as a `list` and non-string
15
+ `dict` keys round-trip as strings (e.g. `{1: 'a'}` becomes `{'1': 'a'}`). Values JSON cannot represent at all
16
+ (`datetime`, `set`, `Decimal`, arbitrary objects, ...) are not coerced silently: serialization raises and the request
17
+ is skipped loudly rather than stored in a corrupted form.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import base64
23
+ import json
24
+ from typing import Any
25
+
26
+ from pydantic import BaseModel
27
+
28
+ # Cap the offending value's repr in a serialization error message so a huge value cannot bloat the log.
29
+ _MAX_ERROR_VALUE_REPR_LEN = 200
30
+
31
+
32
+ def encode_to_json(data: dict[str, Any]) -> str:
33
+ """Serialize a Scrapy request/response dict to a JSON string.
34
+
35
+ The `body` and `headers` fields are base64-encoded in place (a `str` `body` via its UTF-8 bytes); pydantic
36
+ models are dumped to plain dicts. A `TypeError` is raised if any other value cannot be JSON-encoded.
37
+
38
+ Args:
39
+ data: The dict to serialize, e.g. the output of `scrapy.Request.to_dict()`.
40
+
41
+ Returns:
42
+ The JSON-encoded string.
43
+ """
44
+ if not isinstance(data, dict):
45
+ raise TypeError(f'Expected a dict to serialize, got {type(data)}')
46
+
47
+ safe = dict(data)
48
+
49
+ # `body` is base64-encoded so binary payloads survive; a `str` body is taken as its UTF-8 bytes, which keeps
50
+ # encode/decode symmetric (decode always base64-decodes `body` back to `bytes`).
51
+ body = safe.get('body')
52
+ if isinstance(body, (bytes, str)):
53
+ raw_body = body.encode('utf-8') if isinstance(body, str) else body
54
+ safe['body'] = base64.b64encode(raw_body).decode('ascii')
55
+
56
+ if isinstance(safe.get('headers'), dict):
57
+ safe['headers'] = _encode_headers(safe['headers'])
58
+
59
+ try:
60
+ # `ensure_ascii=False` keeps non-ASCII URLs/meta as their UTF-8 form instead of `\uXXXX` escapes, which
61
+ # would otherwise roughly double the size of non-Latin text in storage.
62
+ return json.dumps(safe, default=_json_default, ensure_ascii=False)
63
+ except TypeError as exc:
64
+ raise TypeError(
65
+ 'Failed to JSON-serialize a Scrapy request/response for storage on the Apify platform. '
66
+ 'All values in `meta` and `cb_kwargs` must be JSON-serializable (str, int, float, bool, None, '
67
+ 'list, dict, or a pydantic model).'
68
+ ) from exc
69
+
70
+
71
+ def decode_from_json(text: str) -> Any:
72
+ """Reconstruct a Scrapy request/response dict from a string produced by `encode_to_json`.
73
+
74
+ The base64-encoded `body` and `headers` fields are decoded back to their `bytes` representation.
75
+
76
+ Args:
77
+ text: The JSON-encoded string.
78
+
79
+ Returns:
80
+ The decoded object (a dict for valid request/response payloads).
81
+ """
82
+ data = json.loads(text)
83
+ if not isinstance(data, dict):
84
+ return data
85
+
86
+ # `validate=True` makes a non-base64 body raise loudly instead of silently decoding to garbage.
87
+ if isinstance(data.get('body'), str):
88
+ data['body'] = base64.b64decode(data['body'], validate=True)
89
+
90
+ if isinstance(data.get('headers'), dict):
91
+ data['headers'] = _decode_headers(data['headers'])
92
+
93
+ return data
94
+
95
+
96
+ def _json_default(obj: Any) -> Any:
97
+ """Fallback for values `json.dumps` cannot serialize: pydantic models are dumped, anything else raises.
98
+
99
+ The error names the offending value (type and a truncated repr) so a failed serialization points straight
100
+ at the bad `meta`/`cb_kwargs` entry instead of just reporting that something failed.
101
+ """
102
+ if isinstance(obj, BaseModel):
103
+ return obj.model_dump(by_alias=True)
104
+ value_repr = repr(obj)
105
+ if len(value_repr) > _MAX_ERROR_VALUE_REPR_LEN:
106
+ value_repr = value_repr[:_MAX_ERROR_VALUE_REPR_LEN] + '...'
107
+ raise TypeError(f'Object of type {type(obj).__name__} is not JSON-serializable: {value_repr}')
108
+
109
+
110
+ def _encode_headers(headers: dict[Any, Any]) -> dict[str, list[str]]:
111
+ """Encode a Scrapy `{bytes: [bytes]}` headers mapping to a JSON-safe `{str: [base64-str]}`."""
112
+ encoded: dict[str, list[str]] = {}
113
+ for key, value in headers.items():
114
+ str_key = key.decode('latin-1') if isinstance(key, bytes) else key
115
+ values = value if isinstance(value, (list, tuple)) else [value]
116
+ encoded[str_key] = [_b64encode_value(item) for item in values]
117
+ return encoded
118
+
119
+
120
+ def _decode_headers(headers: dict[str, Any]) -> dict[bytes, list[bytes]]:
121
+ """Reverse `_encode_headers`, restoring the `{bytes: [bytes]}` mapping Scrapy expects."""
122
+ decoded: dict[bytes, list[bytes]] = {}
123
+ for key, value in headers.items():
124
+ bytes_key = key.encode('latin-1') if isinstance(key, str) else key
125
+ values = value if isinstance(value, list) else [value]
126
+ decoded[bytes_key] = [base64.b64decode(item, validate=True) for item in values]
127
+ return decoded
128
+
129
+
130
+ def _b64encode_value(value: Any) -> str:
131
+ """Base64-encode a single header value.
132
+
133
+ Scrapy stores header values as `bytes`; a `str` is encoded as its UTF-8 bytes. Any other type is coerced with
134
+ `str()` as a lenient last resort. That coercion is lossy (e.g. `5` becomes `b'5'`), but Scrapy does not produce
135
+ non-`bytes`/`str` header values, so it is not hit on the real path.
136
+ """
137
+ raw = value if isinstance(value, bytes) else str(value).encode('utf-8')
138
+ return base64.b64encode(raw).decode('ascii')