apify 3.4.2b5__tar.gz → 3.4.2b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {apify-3.4.2b5 → apify-3.4.2b6}/CHANGELOG.md +1 -0
  2. {apify-3.4.2b5 → apify-3.4.2b6}/PKG-INFO +1 -1
  3. {apify-3.4.2b5 → apify-3.4.2b6}/pyproject.toml +1 -1
  4. apify-3.4.2b6/src/apify/scrapy/_serialization.py +138 -0
  5. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/extensions/_httpcache.py +49 -33
  6. apify-3.4.2b6/src/apify/scrapy/requests.py +216 -0
  7. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/scheduler.py +12 -8
  8. apify-3.4.2b5/src/apify/scrapy/requests.py +0 -164
  9. {apify-3.4.2b5 → apify-3.4.2b6}/.gitignore +0 -0
  10. {apify-3.4.2b5 → apify-3.4.2b6}/CONTRIBUTING.md +0 -0
  11. {apify-3.4.2b5 → apify-3.4.2b6}/LICENSE +0 -0
  12. {apify-3.4.2b5 → apify-3.4.2b6}/README.md +0 -0
  13. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/__init__.py +0 -0
  14. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_actor.py +0 -0
  15. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_charging.py +0 -0
  16. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_configuration.py +0 -0
  17. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_consts.py +0 -0
  18. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_crypto.py +0 -0
  19. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_proxy_configuration.py +0 -0
  20. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_utils.py +0 -0
  21. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/_webhook.py +0 -0
  22. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/events/__init__.py +0 -0
  23. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/events/_apify_event_manager.py +0 -0
  24. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/events/_types.py +0 -0
  25. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/events/py.typed +0 -0
  26. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/log.py +0 -0
  27. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/py.typed +0 -0
  28. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/request_loaders/__init__.py +0 -0
  29. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/request_loaders/_apify_request_list.py +0 -0
  30. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/request_loaders/py.typed +0 -0
  31. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/__init__.py +0 -0
  32. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/_actor_runner.py +0 -0
  33. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/_async_thread.py +0 -0
  34. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/_logging_config.py +0 -0
  35. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/extensions/__init__.py +0 -0
  36. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/middlewares/__init__.py +0 -0
  37. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
  38. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/middlewares/py.typed +0 -0
  39. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/pipelines/__init__.py +0 -0
  40. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
  41. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/pipelines/py.typed +0 -0
  42. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/py.typed +0 -0
  43. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/scrapy/utils.py +0 -0
  44. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/__init__.py +0 -0
  45. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/__init__.py +0 -0
  46. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_alias_resolving.py +0 -0
  47. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_api_client_creation.py +0 -0
  48. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
  49. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
  50. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_models.py +0 -0
  51. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_request_queue_client.py +0 -0
  52. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +0 -0
  53. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_request_queue_single_client.py +0 -0
  54. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
  55. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/_utils.py +0 -0
  56. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_apify/py.typed +0 -0
  57. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/__init__.py +0 -0
  58. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
  59. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
  60. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
  61. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
  62. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
  63. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
  64. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storage_clients/py.typed +0 -0
  65. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storages/__init__.py +0 -0
  66. {apify-3.4.2b5 → apify-3.4.2b6}/src/apify/storages/py.typed +0 -0
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
9
9
 
10
10
  - **scrapy:** Correct proxy middleware exception log and import ([#953](https://github.com/apify/apify-sdk-python/pull/953)) ([5bd6eb9](https://github.com/apify/apify-sdk-python/commit/5bd6eb9843d90844cec083372e932413bceedec9)) by [@vdusek](https://github.com/vdusek)
11
11
  - **scrapy:** Skip a request that fails to convert instead of crashing the run ([#952](https://github.com/apify/apify-sdk-python/pull/952)) ([db9444f](https://github.com/apify/apify-sdk-python/commit/db9444faeb0158c29aa394121cf733ff2e843f28)) by [@vdusek](https://github.com/vdusek)
12
+ - **scrapy:** [**breaking**] Serialize requests and HTTP cache as JSON instead of pickle ([#951](https://github.com/apify/apify-sdk-python/pull/951)) ([a87e8d1](https://github.com/apify/apify-sdk-python/commit/a87e8d1597478b4f12fd5bb9b379f65f637d8e96)) by [@vdusek](https://github.com/vdusek)
12
13
 
13
14
  ### 🚜 Refactor
14
15
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 3.4.2b5
3
+ Version: 3.4.2b6
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "apify"
7
- version = "3.4.2b5"
7
+ version = "3.4.2b6"
8
8
  description = "Apify SDK for Python"
9
9
  authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
10
10
  license = { file = "LICENSE" }
@@ -0,0 +1,138 @@
1
+ """JSON serialization of Scrapy requests and cached responses for storage on the Apify platform.
2
+
3
+ Scrapy requests and cached responses are stored in the Apify request queue and key-value store which hold JSON,
4
+ so they are serialized as JSON here rather than pickled.
5
+
6
+ Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable; both sit at fixed keys
7
+ and are base64-encoded in place. A `str` `body` is encoded as its UTF-8 bytes and comes back as `bytes`, matching
8
+ Scrapy, which always stores `body` as `bytes`. Pydantic models such as Crawlee's `UserData` are dumped via
9
+ `model_dump()`. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise
10
+ serialization fails with a clear error naming the offending value. No in-band sentinel is used, so no user value
11
+ can collide with the encoding.
12
+
13
+ Known limitations of the pickle -> JSON switch (a documented breaking change): JSON has fewer types than pickle,
14
+ so values in `meta`/`cb_kwargs` are subject to JSON's coercions. A `tuple` round-trips as a `list` and non-string
15
+ `dict` keys round-trip as strings (e.g. `{1: 'a'}` becomes `{'1': 'a'}`). Values JSON cannot represent at all
16
+ (`datetime`, `set`, `Decimal`, arbitrary objects, ...) are not coerced silently: serialization raises and the request
17
+ is skipped loudly rather than stored in a corrupted form.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import base64
23
+ import json
24
+ from typing import Any
25
+
26
+ from pydantic import BaseModel
27
+
28
+ # Cap the offending value's repr in a serialization error message so a huge value cannot bloat the log.
29
+ _MAX_ERROR_VALUE_REPR_LEN = 200
30
+
31
+
32
+ def encode_to_json(data: dict[str, Any]) -> str:
33
+ """Serialize a Scrapy request/response dict to a JSON string.
34
+
35
+ The `body` and `headers` fields are base64-encoded in place (a `str` `body` via its UTF-8 bytes); pydantic
36
+ models are dumped to plain dicts. A `TypeError` is raised if any other value cannot be JSON-encoded.
37
+
38
+ Args:
39
+ data: The dict to serialize, e.g. the output of `scrapy.Request.to_dict()`.
40
+
41
+ Returns:
42
+ The JSON-encoded string.
43
+ """
44
+ if not isinstance(data, dict):
45
+ raise TypeError(f'Expected a dict to serialize, got {type(data)}')
46
+
47
+ safe = dict(data)
48
+
49
+ # `body` is base64-encoded so binary payloads survive; a `str` body is taken as its UTF-8 bytes, which keeps
50
+ # encode/decode symmetric (decode always base64-decodes `body` back to `bytes`).
51
+ body = safe.get('body')
52
+ if isinstance(body, (bytes, str)):
53
+ raw_body = body.encode('utf-8') if isinstance(body, str) else body
54
+ safe['body'] = base64.b64encode(raw_body).decode('ascii')
55
+
56
+ if isinstance(safe.get('headers'), dict):
57
+ safe['headers'] = _encode_headers(safe['headers'])
58
+
59
+ try:
60
+ # `ensure_ascii=False` keeps non-ASCII URLs/meta as their UTF-8 form instead of `\uXXXX` escapes, which
61
+ # would otherwise roughly double the size of non-Latin text in storage.
62
+ return json.dumps(safe, default=_json_default, ensure_ascii=False)
63
+ except TypeError as exc:
64
+ raise TypeError(
65
+ 'Failed to JSON-serialize a Scrapy request/response for storage on the Apify platform. '
66
+ 'All values in `meta` and `cb_kwargs` must be JSON-serializable (str, int, float, bool, None, '
67
+ 'list, dict, or a pydantic model).'
68
+ ) from exc
69
+
70
+
71
+ def decode_from_json(text: str) -> Any:
72
+ """Reconstruct a Scrapy request/response dict from a string produced by `encode_to_json`.
73
+
74
+ The base64-encoded `body` and `headers` fields are decoded back to their `bytes` representation.
75
+
76
+ Args:
77
+ text: The JSON-encoded string.
78
+
79
+ Returns:
80
+ The decoded object (a dict for valid request/response payloads).
81
+ """
82
+ data = json.loads(text)
83
+ if not isinstance(data, dict):
84
+ return data
85
+
86
+ # `validate=True` makes a non-base64 body raise loudly instead of silently decoding to garbage.
87
+ if isinstance(data.get('body'), str):
88
+ data['body'] = base64.b64decode(data['body'], validate=True)
89
+
90
+ if isinstance(data.get('headers'), dict):
91
+ data['headers'] = _decode_headers(data['headers'])
92
+
93
+ return data
94
+
95
+
96
+ def _json_default(obj: Any) -> Any:
97
+ """Fallback for values `json.dumps` cannot serialize: pydantic models are dumped, anything else raises.
98
+
99
+ The error names the offending value (type and a truncated repr) so a failed serialization points straight
100
+ at the bad `meta`/`cb_kwargs` entry instead of just reporting that something failed.
101
+ """
102
+ if isinstance(obj, BaseModel):
103
+ return obj.model_dump(by_alias=True)
104
+ value_repr = repr(obj)
105
+ if len(value_repr) > _MAX_ERROR_VALUE_REPR_LEN:
106
+ value_repr = value_repr[:_MAX_ERROR_VALUE_REPR_LEN] + '...'
107
+ raise TypeError(f'Object of type {type(obj).__name__} is not JSON-serializable: {value_repr}')
108
+
109
+
110
+ def _encode_headers(headers: dict[Any, Any]) -> dict[str, list[str]]:
111
+ """Encode a Scrapy `{bytes: [bytes]}` headers mapping to a JSON-safe `{str: [base64-str]}`."""
112
+ encoded: dict[str, list[str]] = {}
113
+ for key, value in headers.items():
114
+ str_key = key.decode('latin-1') if isinstance(key, bytes) else key
115
+ values = value if isinstance(value, (list, tuple)) else [value]
116
+ encoded[str_key] = [_b64encode_value(item) for item in values]
117
+ return encoded
118
+
119
+
120
+ def _decode_headers(headers: dict[str, Any]) -> dict[bytes, list[bytes]]:
121
+ """Reverse `_encode_headers`, restoring the `{bytes: [bytes]}` mapping Scrapy expects."""
122
+ decoded: dict[bytes, list[bytes]] = {}
123
+ for key, value in headers.items():
124
+ bytes_key = key.encode('latin-1') if isinstance(key, str) else key
125
+ values = value if isinstance(value, list) else [value]
126
+ decoded[bytes_key] = [base64.b64decode(item, validate=True) for item in values]
127
+ return decoded
128
+
129
+
130
+ def _b64encode_value(value: Any) -> str:
131
+ """Base64-encode a single header value.
132
+
133
+ Scrapy stores header values as `bytes`; a `str` is encoded as its UTF-8 bytes. Any other type is coerced with
134
+ `str()` as a lenient last resort. That coercion is lossy (e.g. `5` becomes `b'5'`), but Scrapy does not produce
135
+ non-`bytes`/`str` header values, so it is not hit on the real path.
136
+ """
137
+ raw = value if isinstance(value, bytes) else str(value).encode('utf-8')
138
+ return base64.b64encode(raw).decode('ascii')
@@ -2,7 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  import gzip
4
4
  import io
5
- import pickle
6
5
  import re
7
6
  import struct
8
7
  from logging import getLogger
@@ -14,6 +13,7 @@ from scrapy.responsetypes import responsetypes
14
13
 
15
14
  from apify import Configuration
16
15
  from apify.scrapy._async_thread import AsyncThread
16
+ from apify.scrapy._serialization import decode_from_json, encode_to_json
17
17
  from apify.storage_clients import ApifyStorageClient
18
18
  from apify.storages import KeyValueStore
19
19
 
@@ -29,14 +29,14 @@ logger = getLogger(__name__)
29
29
  class ApifyCacheStorage:
30
30
  """A Scrapy cache storage that uses the Apify `KeyValueStore` to store responses.
31
31
 
32
- It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches
33
- responses to requests. See HTTPCache middleware settings (prefixed with `HTTPCACHE_`)
34
- in the Scrapy documentation for more information. Requires the asyncio Twisted reactor
35
- to be installed.
32
+ It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches responses to requests.
33
+ See HTTPCache middleware settings (prefixed with `HTTPCACHE_`) in the Scrapy documentation for more information.
34
+ Requires the asyncio Twisted reactor to be installed.
36
35
  """
37
36
 
38
37
  def __init__(self, settings: BaseSettings) -> None:
39
- self._expiration_max_items = 100
38
+ # Upper bound on how many keys the per-spider-close cleanup sweeps (best-effort; `close_spider`).
39
+ self._expiration_max_items: int = settings.getint('APIFY_HTTPCACHE_EXPIRATION_MAX_ITEMS', 100)
40
40
  self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
41
41
  self._spider: Spider | None = None
42
42
  self._kvs: KeyValueStore | None = None
@@ -79,23 +79,26 @@ class ApifyCacheStorage:
79
79
  async def expire_kvs() -> None:
80
80
  if self._kvs is None:
81
81
  raise ValueError('Key value store not initialized')
82
- i = 0
82
+ # Best-effort cleanup: at most `_expiration_max_items` keys per close, in no guaranteed order,
83
+ # so stale entries may linger. This only reclaims storage; `retrieve_response` already treats
84
+ # an expired entry as a cache miss.
85
+ processed = 0
83
86
  async for item in self._kvs.iterate_keys():
87
+ if processed >= self._expiration_max_items:
88
+ break
89
+ processed += 1
84
90
  value = await self._kvs.get_value(item.key)
85
91
  try:
86
92
  gzip_time = read_gzip_time(value)
87
93
  except Exception as e:
88
94
  logger.warning(f'Malformed cache item {item.key}: {e}')
89
- await self._kvs.set_value(item.key, None)
95
+ await self._kvs.delete_value(item.key)
90
96
  else:
91
97
  if self._expiration_secs < current_time - gzip_time:
92
98
  logger.debug(f'Expired cache item {item.key}')
93
- await self._kvs.set_value(item.key, None)
99
+ await self._kvs.delete_value(item.key)
94
100
  else:
95
101
  logger.debug(f'Valid cache item {item.key}')
96
- if i == self._expiration_max_items:
97
- break
98
- i += 1
99
102
 
100
103
  self._async_thread.run_coro(expire_kvs())
101
104
 
@@ -127,17 +130,25 @@ class ApifyCacheStorage:
127
130
 
128
131
  if current_time is None:
129
132
  current_time = int(time())
130
- if 0 < self._expiration_secs < current_time - read_gzip_time(value):
131
- logger.debug('Cache expired', extra={'request': request})
133
+
134
+ # A malformed or legacy cache entry must not crash retrieval; treat it as a cache miss so Scrapy re-fetches
135
+ # and re-stores it in the current format. The field reads stay inside the `try` as well: a value that decodes
136
+ # to a dict missing any expected key (a forward/older format, or a truncated-but-valid JSON payload) must
137
+ # also degrade to a miss rather than raising an uncaught `KeyError`.
138
+ try:
139
+ if 0 < self._expiration_secs < current_time - read_gzip_time(value):
140
+ logger.debug('Cache expired', extra={'request': request})
141
+ return None
142
+ data = from_gzip(value)
143
+ url = data['url']
144
+ status = data['status']
145
+ headers = Headers(data['headers'])
146
+ body = data['body']
147
+ except Exception as exc:
148
+ logger.warning(f'Ignoring malformed cache entry {key!r}: {exc}', extra={'request': request})
132
149
  return None
133
150
 
134
- data = from_gzip(value)
135
- url = data['url']
136
- status = data['status']
137
- headers = Headers(data['headers'])
138
- body = data['body']
139
151
  respcls = responsetypes.from_args(headers=headers, url=url, body=body)
140
-
141
152
  logger.debug('Cache hit', extra={'request': request})
142
153
  return respcls(url=url, headers=headers, status=status, body=body)
143
154
 
@@ -162,18 +173,25 @@ class ApifyCacheStorage:
162
173
 
163
174
 
164
175
  def to_gzip(data: dict, mtime: int | None = None) -> bytes:
165
- """Dump a dictionary to a gzip-compressed byte stream."""
176
+ """Dump a dictionary to a gzip-compressed JSON byte stream.
177
+
178
+ Cache entries live in the Apify key-value store, which holds JSON, so they are serialized as JSON rather
179
+ than pickled. See `apify.scrapy._serialization` for the encoding.
180
+ """
181
+ payload = encode_to_json(data).encode('utf-8')
166
182
  with io.BytesIO() as byte_stream:
167
183
  with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file:
168
- pickle.dump(data, gzip_file, protocol=4)
184
+ gzip_file.write(payload)
169
185
  return byte_stream.getvalue()
170
186
 
171
187
 
172
188
  def from_gzip(gzip_bytes: bytes) -> dict:
173
- """Load a dictionary from a gzip-compressed byte stream."""
189
+ """Load a dictionary from a gzip-compressed JSON byte stream."""
174
190
  with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='rb') as gzip_file:
175
- data: dict = pickle.load(gzip_file)
176
- return data
191
+ data = decode_from_json(gzip_file.read().decode('utf-8'))
192
+ if not isinstance(data, dict):
193
+ raise TypeError(f'Expected a dict from the cached payload, got {type(data)}')
194
+ return data
177
195
 
178
196
 
179
197
  def read_gzip_time(gzip_bytes: bytes) -> int:
@@ -187,17 +205,15 @@ def read_gzip_time(gzip_bytes: bytes) -> int:
187
205
  def get_kvs_name(spider_name: str, max_length: int = 60) -> str:
188
206
  """Get the key value store name for a spider.
189
207
 
190
- The key value store name is derived from the spider name by replacing all special characters
191
- with hyphens and trimming leading and trailing hyphens. The resulting name is prefixed with
192
- 'httpcache-' and truncated to the maximum length.
208
+ The key value store name is derived from the spider name by replacing all special characters with hyphens
209
+ and trimming leading and trailing hyphens. The resulting name is prefixed with 'httpcache-' and truncated
210
+ to the maximum length.
193
211
 
194
- The documentation
195
- [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
212
+ The documentation [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
196
213
  mentions that names can be up to 63 characters long, so the default max length is set to 60.
197
214
 
198
- Such naming isn't unique per spider, but should be sufficiently unique for most use cases.
199
- The name of the key value store should indicate to which spider it belongs, e.g. in
200
- the listing in the Apify's console.
215
+ Such naming isn't unique per spider, but should be sufficiently unique for most use cases. The name
216
+ of the key-value store should indicate to which spider it belongs, e.g. in the listing in the Apify's console.
201
217
 
202
218
  Args:
203
219
  spider_name: Value of the Spider instance's name attribute.
@@ -0,0 +1,216 @@
1
+ from __future__ import annotations
2
+
3
+ from logging import getLogger
4
+ from typing import Any, cast
5
+
6
+ from scrapy import Request as ScrapyRequest
7
+ from scrapy import Spider
8
+ from scrapy.http.headers import Headers
9
+ from scrapy.utils.misc import load_object
10
+ from scrapy.utils.request import request_from_dict
11
+
12
+ from crawlee._request import UserData
13
+ from crawlee._types import HttpHeaders
14
+
15
+ from ._serialization import decode_from_json, encode_to_json
16
+ from apify import Request as ApifyRequest
17
+
18
+ logger = getLogger(__name__)
19
+
20
+
21
+ def _ensure_known_request_class(request_dict: dict[str, Any]) -> None:
22
+ """Validate the optional `_class` entry before `request_from_dict` instantiates it.
23
+
24
+ `request_from_dict` resolves `_class` with `load_object` and calls it with the request kwargs. The dotted path
25
+ is resolved here first and rejected unless it is a `scrapy.Request` subclass, so a payload can never coerce
26
+ reconstruction into instantiating an arbitrary callable. Resolving may import the class's module (the same
27
+ import `request_from_dict` would do, and far safer than the arbitrary code execution the previous pickle
28
+ format allowed). That import is what lets a custom `Request` subclass be rebuilt in a fresh process after
29
+ an Actor migration, before the spider has lazily imported it.
30
+ """
31
+ class_path = request_dict.get('_class')
32
+ if class_path is None:
33
+ return
34
+
35
+ if not isinstance(class_path, str):
36
+ raise TypeError(f'Invalid scrapy_request `_class`, expected a string, got {type(class_path)}')
37
+
38
+ try:
39
+ request_cls = load_object(class_path)
40
+ except (ImportError, AttributeError, ValueError, NameError) as exc:
41
+ raise TypeError(f'Refusing to reconstruct a Scrapy request: cannot resolve `_class` {class_path!r}.') from exc
42
+
43
+ if not (isinstance(request_cls, type) and issubclass(request_cls, ScrapyRequest)):
44
+ raise TypeError(
45
+ f'Refusing to reconstruct a Scrapy request of type {class_path!r}: it is not a scrapy.Request subclass.'
46
+ )
47
+
48
+
49
+ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
50
+ """Convert a Scrapy request to an Apify request.
51
+
52
+ Args:
53
+ scrapy_request: The Scrapy request to be converted.
54
+ spider: The Scrapy spider that the request is associated with.
55
+
56
+ Returns:
57
+ The converted Apify request if the conversion was successful, otherwise None.
58
+ """
59
+ if not isinstance(scrapy_request, ScrapyRequest):
60
+ logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.')
61
+ return None
62
+
63
+ logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...')
64
+
65
+ # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
66
+ #
67
+ # The body is stored twice on purpose: as `payload` (used for the extended unique key) and inside the serialized
68
+ # Scrapy request below (used to reconstruct it). Both come from `scrapy_request.body`.
69
+ request_kwargs: dict[str, Any] = {
70
+ 'url': scrapy_request.url,
71
+ 'method': scrapy_request.method,
72
+ 'payload': scrapy_request.body,
73
+ 'use_extended_unique_key': True,
74
+ 'keep_url_fragment': False,
75
+ }
76
+
77
+ try:
78
+ if scrapy_request.dont_filter:
79
+ request_kwargs['always_enqueue'] = True
80
+ elif scrapy_request.meta.get('apify_request_unique_key'):
81
+ request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
82
+
83
+ user_data = scrapy_request.meta.get('userData', {})
84
+
85
+ # Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects from leaking
86
+ # into Request.from_url() during Scrapy-Apify roundtrips.
87
+ if isinstance(user_data, UserData):
88
+ user_data = user_data.model_dump(by_alias=True)
89
+
90
+ # Remove internal Crawlee data since it's managed by Request.from_url() and values from previous roundtrips
91
+ # cause incorrect state.
92
+ if isinstance(user_data, dict):
93
+ user_data.pop('__crawlee', None)
94
+
95
+ request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
96
+
97
+ # Store an Apify-platform view of the headers. The authoritative copy with exact bytes travels in
98
+ # the serialized scrapy_request below, so non-UTF-8 headers (which make `to_unicode_dict()` raise) are
99
+ # tolerated rather than dropping the whole request.
100
+ #
101
+ # Trade-off: with `use_extended_unique_key=True` the unique key includes the headers, so when non-UTF-8
102
+ # headers are omitted here two requests differing only in those headers share a unique key and one is
103
+ # deduplicated away. This is rare (header values are normally ASCII/UTF-8) and still strictly better than
104
+ # the old behavior, which dropped such requests entirely.
105
+ if isinstance(scrapy_request.headers, Headers):
106
+ try:
107
+ headers = cast('dict[str, str]', dict(scrapy_request.headers.to_unicode_dict()))
108
+ request_kwargs['headers'] = HttpHeaders(headers)
109
+ except UnicodeDecodeError:
110
+ logger.warning(
111
+ 'Could not represent Scrapy request headers as Apify request headers (non-UTF-8 values); '
112
+ 'they are preserved in the serialized request instead.'
113
+ )
114
+ else:
115
+ logger.warning(
116
+ f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
117
+ )
118
+
119
+ apify_request = ApifyRequest.from_url(**request_kwargs)
120
+ scrapy_request_dict = scrapy_request.to_dict(spider=spider)
121
+
122
+ except Exception as exc:
123
+ logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
124
+ return None
125
+
126
+ # Serialize the Scrapy request as JSON under 'scrapy_request'. Kept outside the broad except above so
127
+ # a non-JSON-serializable `meta`/`cb_kwargs` is logged with a traceback and the request skipped (returning
128
+ # None per this function's contract), rather than crashing the crawl.
129
+ try:
130
+ scrapy_request_json = encode_to_json(scrapy_request_dict)
131
+ except TypeError:
132
+ logger.exception(
133
+ f'Failed to serialize Scrapy request {scrapy_request} for storage on the Apify platform; skipping it. '
134
+ 'Ensure all values in `meta` and `cb_kwargs` are JSON-serializable.'
135
+ )
136
+ return None
137
+
138
+ # `scrapy_request_json` is already JSON-safe text (binary fields are base64-encoded inside it), so it is stored
139
+ # as-is. The request queue serializes `user_data` to JSON, which escapes the string correctly; wrapping it in
140
+ # a second base64 layer would only add ~33% overhead on the enqueue path.
141
+ apify_request.user_data['scrapy_request'] = scrapy_request_json
142
+
143
+ logger.debug(f'scrapy_request was converted to the apify_request={apify_request}')
144
+ return apify_request
145
+
146
+
147
+ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
148
+ """Convert an Apify request to a Scrapy request.
149
+
150
+ Args:
151
+ apify_request: The Apify request to be converted.
152
+ spider: The Scrapy spider that the request is associated with.
153
+
154
+ Raises:
155
+ TypeError: If `apify_request` is not an `ApifyRequest`, if the stored Scrapy request payload is malformed,
156
+ or if its `_class` cannot be resolved to a `scrapy.Request` subclass.
157
+
158
+ Returns:
159
+ The converted Scrapy request.
160
+ """
161
+ if not isinstance(cast('Any', apify_request), ApifyRequest):
162
+ raise TypeError('apify_request must be an apify.Request instance')
163
+
164
+ logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')
165
+
166
+ # If the apify_request comes from the Scrapy
167
+ if 'scrapy_request' in apify_request.user_data:
168
+ # Deserialize the Scrapy ScrapyRequest from the apify_request by parsing the stored JSON and reconstructing
169
+ # the Scrapy ScrapyRequest object from its dictionary representation.
170
+ logger.debug('Restoring the Scrapy ScrapyRequest from the apify_request...')
171
+
172
+ scrapy_request_json = apify_request.user_data['scrapy_request']
173
+ if not isinstance(scrapy_request_json, str):
174
+ raise TypeError('the stored scrapy_request must be a string')
175
+
176
+ scrapy_request_dict = decode_from_json(scrapy_request_json)
177
+ if not isinstance(scrapy_request_dict, dict):
178
+ raise TypeError('scrapy_request_dict must be a dictionary')
179
+
180
+ # Validate any `_class` entry before request_from_dict resolves and imports it.
181
+ _ensure_known_request_class(scrapy_request_dict)
182
+
183
+ scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
184
+ if not isinstance(scrapy_request, ScrapyRequest):
185
+ raise TypeError('scrapy_request must be an instance of the ScrapyRequest class')
186
+
187
+ logger.debug(f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={scrapy_request})...')
188
+
189
+ # Update the meta field with the meta field from the apify_request
190
+ meta = scrapy_request.meta or {}
191
+ meta.update({'apify_request_unique_key': apify_request.unique_key})
192
+ # scrapy_request.meta is a property, so we have to set it like this
193
+ scrapy_request._meta = meta # noqa: SLF001
194
+
195
+ # If the apify_request comes directly from the Scrapy, typically start URLs.
196
+ else:
197
+ logger.debug('Gonna create a new Scrapy ScrapyRequest (cannot be restored)')
198
+
199
+ scrapy_request = ScrapyRequest(
200
+ url=apify_request.url,
201
+ method=apify_request.method,
202
+ meta={
203
+ 'apify_request_unique_key': apify_request.unique_key,
204
+ },
205
+ )
206
+
207
+ # Add optional 'headers' field
208
+ if apify_request.headers:
209
+ scrapy_request.headers |= Headers(apify_request.headers)
210
+
211
+ # Add optional 'userData' field
212
+ if apify_request.user_data:
213
+ scrapy_request.meta['userData'] = apify_request.user_data
214
+
215
+ logger.debug(f'an apify_request was converted to the scrapy_request={scrapy_request}')
216
+ return scrapy_request
@@ -162,20 +162,24 @@ class ApifyScheduler(BaseScheduler):
162
162
  if not isinstance(self.spider, Spider):
163
163
  raise TypeError('self.spider must be an instance of the Spider class')
164
164
 
165
- # Let the request queue know that the request is being handled. Every request should
166
- # be marked as handled, retrying is handled by the Scrapy's RetryMiddleware.
165
+ # Reconstruct the Scrapy request before consuming the queue entry. A malformed entry must not crash
166
+ # the whole run, so on failure it is logged and skipped (None) rather than propagating.
167
+ try:
168
+ scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
169
+ except Exception:
170
+ logger.exception(f'Failed to convert Apify request {apify_request} to a Scrapy request; skipping it.')
171
+ scrapy_request = None
172
+
173
+ # Mark the request as handled. This runs even when reconstruction failed above: an unrecoverable entry
174
+ # (a corrupt or legacy payload) must still be consumed, otherwise the queue would keep handing it back
175
+ # forever. Retrying genuine failures is the RetryMiddleware's job.
167
176
  try:
168
177
  self._async_thread.run_coro(self._rq.mark_request_as_handled(apify_request))
169
178
  except Exception:
170
179
  traceback.print_exc()
171
180
  raise
172
181
 
173
- # Reconstruct the Scrapy request. A malformed queue entry must not crash the whole run: it
174
- # has already been marked handled above, so log it and skip it instead of propagating.
175
- try:
176
- scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
177
- except Exception:
178
- logger.exception(f'Failed to convert Apify request {apify_request} to a Scrapy request; skipping it.')
182
+ if scrapy_request is None:
179
183
  return None
180
184
 
181
185
  logger.debug(f'Converted to scrapy_request: {scrapy_request}')
@@ -1,164 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import codecs
4
- import pickle
5
- from logging import getLogger
6
- from typing import Any, cast
7
-
8
- from scrapy import Request as ScrapyRequest
9
- from scrapy import Spider
10
- from scrapy.http.headers import Headers
11
- from scrapy.utils.request import request_from_dict
12
-
13
- from crawlee._request import UserData
14
- from crawlee._types import HttpHeaders
15
-
16
- from apify import Request as ApifyRequest
17
-
18
- logger = getLogger(__name__)
19
-
20
-
21
- def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
22
- """Convert a Scrapy request to an Apify request.
23
-
24
- Args:
25
- scrapy_request: The Scrapy request to be converted.
26
- spider: The Scrapy spider that the request is associated with.
27
-
28
- Returns:
29
- The converted Apify request if the conversion was successful, otherwise None.
30
- """
31
- if not isinstance(scrapy_request, ScrapyRequest):
32
- logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.')
33
- return None
34
-
35
- logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...')
36
-
37
- # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter.
38
- request_kwargs: dict[str, Any] = {
39
- 'url': scrapy_request.url,
40
- 'method': scrapy_request.method,
41
- 'payload': scrapy_request.body,
42
- 'use_extended_unique_key': True,
43
- 'keep_url_fragment': False,
44
- }
45
-
46
- try:
47
- if scrapy_request.dont_filter:
48
- request_kwargs['always_enqueue'] = True
49
- else:
50
- if scrapy_request.meta.get('apify_request_unique_key'):
51
- request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']
52
-
53
- if scrapy_request.meta.get('apify_request_id'):
54
- request_kwargs['id'] = scrapy_request.meta['apify_request_id']
55
-
56
- user_data = scrapy_request.meta.get('userData', {})
57
-
58
- # Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects
59
- # from leaking into Request.from_url() during Scrapy-Apify roundtrips.
60
- if isinstance(user_data, UserData):
61
- user_data = user_data.model_dump(by_alias=True)
62
-
63
- # Remove internal Crawlee data since it's managed by Request.from_url() and values
64
- # from previous roundtrips cause incorrect state.
65
- if isinstance(user_data, dict):
66
- user_data.pop('__crawlee', None)
67
-
68
- request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
69
-
70
- # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
71
- if isinstance(scrapy_request.headers, Headers):
72
- headers = cast('dict[str, str]', dict(scrapy_request.headers.to_unicode_dict()))
73
- request_kwargs['headers'] = HttpHeaders(headers)
74
- else:
75
- logger.warning(
76
- f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
77
- )
78
-
79
- apify_request = ApifyRequest.from_url(**request_kwargs)
80
-
81
- # Serialize the Scrapy ScrapyRequest and store it in the apify_request.
82
- # - This process involves converting the Scrapy ScrapyRequest object into a dictionary, encoding it to base64,
83
- # and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
84
- # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
85
- scrapy_request_dict = scrapy_request.to_dict(spider=spider)
86
- scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
87
- apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded
88
-
89
- except Exception as exc:
90
- logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
91
- return None
92
-
93
- logger.debug(f'scrapy_request was converted to the apify_request={apify_request}')
94
- return apify_request
95
-
96
-
97
- def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequest:
98
- """Convert an Apify request to a Scrapy request.
99
-
100
- Args:
101
- apify_request: The Apify request to be converted.
102
- spider: The Scrapy spider that the request is associated with.
103
-
104
- Raises:
105
- TypeError: If the Apify request is not an instance of the `ApifyRequest` class.
106
- ValueError: If the Apify request does not contain the required keys.
107
-
108
- Returns:
109
- The converted Scrapy request.
110
- """
111
- if not isinstance(cast('Any', apify_request), ApifyRequest):
112
- raise TypeError('apify_request must be a crawlee.ScrapyRequest instance')
113
-
114
- logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')
115
-
116
- # If the apify_request comes from the Scrapy
117
- if 'scrapy_request' in apify_request.user_data:
118
- # Deserialize the Scrapy ScrapyRequest from the apify_request.
119
- # - This process involves decoding the base64-encoded request data and reconstructing
120
- # the Scrapy ScrapyRequest object from its dictionary representation.
121
- logger.debug('Restoring the Scrapy ScrapyRequest from the apify_request...')
122
-
123
- scrapy_request_dict_encoded = apify_request.user_data['scrapy_request']
124
- if not isinstance(scrapy_request_dict_encoded, str):
125
- raise TypeError('scrapy_request_dict_encoded must be a string')
126
-
127
- scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
128
- if not isinstance(scrapy_request_dict, dict):
129
- raise TypeError('scrapy_request_dict must be a dictionary')
130
-
131
- scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
132
- if not isinstance(scrapy_request, ScrapyRequest):
133
- raise TypeError('scrapy_request must be an instance of the ScrapyRequest class')
134
-
135
- logger.debug(f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={scrapy_request})...')
136
-
137
- # Update the meta field with the meta field from the apify_request
138
- meta = scrapy_request.meta or {}
139
- meta.update({'apify_request_unique_key': apify_request.unique_key})
140
- # scrapy_request.meta is a property, so we have to set it like this
141
- scrapy_request._meta = meta # noqa: SLF001
142
-
143
- # If the apify_request comes directly from the Scrapy, typically start URLs.
144
- else:
145
- logger.debug('Gonna create a new Scrapy ScrapyRequest (cannot be restored)')
146
-
147
- scrapy_request = ScrapyRequest(
148
- url=apify_request.url,
149
- method=apify_request.method,
150
- meta={
151
- 'apify_request_unique_key': apify_request.unique_key,
152
- },
153
- )
154
-
155
- # Add optional 'headers' field
156
- if apify_request.headers:
157
- scrapy_request.headers |= Headers(apify_request.headers)
158
-
159
- # Add optional 'userData' field
160
- if apify_request.user_data:
161
- scrapy_request.meta['userData'] = apify_request.user_data
162
-
163
- logger.debug(f'an apify_request was converted to the scrapy_request={scrapy_request}')
164
- return scrapy_request
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes