apify 2.4.0b6__py3-none-any.whl → 2.5.0b1__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

"""Public re-exports for the Apify Scrapy extensions package."""

from apify.scrapy.extensions._httpcache import ApifyCacheStorage

__all__ = ['ApifyCacheStorage']
@@ -0,0 +1,212 @@
1
+ from __future__ import annotations
2
+
3
+ import gzip
4
+ import io
5
+ import pickle
6
+ import re
7
+ import struct
8
+ from logging import getLogger
9
+ from time import time
10
+ from typing import TYPE_CHECKING
11
+
12
+ from scrapy.http.headers import Headers
13
+ from scrapy.responsetypes import responsetypes
14
+
15
+ from apify import Configuration
16
+ from apify.apify_storage_client import ApifyStorageClient
17
+ from apify.scrapy._async_thread import AsyncThread
18
+ from apify.storages import KeyValueStore
19
+
20
+ if TYPE_CHECKING:
21
+ from scrapy import Request, Spider
22
+ from scrapy.http.response import Response
23
+ from scrapy.settings import BaseSettings
24
+ from scrapy.utils.request import RequestFingerprinterProtocol
25
+
26
+ logger = getLogger(__name__)
27
+
28
+
29
class ApifyCacheStorage:
    """A Scrapy cache storage that uses the Apify `KeyValueStore` to store responses.

    It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches
    responses to requests. See HTTPCache middleware settings (prefixed with `HTTPCACHE_`)
    in the Scrapy documentation for more information. Requires the asyncio Twisted reactor
    to be installed.

    All key-value store access happens on a dedicated background thread running its own
    asyncio event loop (`AsyncThread`), because Scrapy's cache-storage interface is
    synchronous while the Apify storage client is async.
    """

    def __init__(self, settings: BaseSettings) -> None:
        # Upper bound on how many cached items are examined for expiration on close.
        self._expiration_max_items = 100
        # Standard Scrapy setting; 0 means cached responses never expire.
        self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        # The following are populated in open_spider(); methods guard against None.
        self._spider: Spider | None = None
        self._kvs: KeyValueStore | None = None
        self._fingerprinter: RequestFingerprinterProtocol | None = None
        self._async_thread: AsyncThread | None = None

    def open_spider(self, spider: Spider) -> None:
        """Open the cache storage for a spider."""
        logger.debug('Using Apify key value cache storage', extra={'spider': spider})
        self._spider = spider
        self._fingerprinter = spider.crawler.request_fingerprinter
        # KVS name is derived from the spider name (see get_kvs_name below).
        kvs_name = get_kvs_name(spider.name)

        async def open_kvs() -> KeyValueStore:
            config = Configuration.get_global_configuration()
            if config.is_at_home:
                # On the Apify platform, use the platform storage client explicitly.
                storage_client = ApifyStorageClient.from_config(config)
                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
            return await KeyValueStore.open(name=kvs_name)

        logger.debug("Starting background thread for cache storage's event loop")
        self._async_thread = AsyncThread()
        logger.debug(f"Opening cache storage's {kvs_name!r} key value store")
        # Blocks until the store is opened on the background event loop.
        self._kvs = self._async_thread.run_coro(open_kvs())

    def close_spider(self, _: Spider, current_time: int | None = None) -> None:
        """Close the cache storage for a spider.

        Before shutting down the background thread, scans up to
        `_expiration_max_items` cached entries and deletes expired or
        malformed ones (only when `HTTPCACHE_EXPIRATION_SECS` > 0).

        Args:
            _: The spider (unused; kept for the Scrapy storage interface).
            current_time: Override of "now" in epoch seconds, for testing.

        Raises:
            ValueError: If the storage was never opened.
        """
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')

        # NOTE(review): this message is logged even when expiration is disabled
        # (_expiration_secs == 0), in which case no cleanup actually runs.
        logger.info(f'Cleaning up cache items (max {self._expiration_max_items})')
        if self._expiration_secs > 0:
            if current_time is None:
                current_time = int(time())

            async def expire_kvs() -> None:
                if self._kvs is None:
                    raise ValueError('Key value store not initialized')
                i = 0
                async for item in self._kvs.iterate_keys():
                    # Fetch the full value just to read its gzip header mtime;
                    # the API offers no cheaper metadata-only read.
                    value = await self._kvs.get_value(item.key)
                    try:
                        gzip_time = read_gzip_time(value)
                    except Exception as e:
                        # Unreadable entries are dropped rather than left to
                        # poison future retrievals (setting None deletes the key).
                        logger.warning(f'Malformed cache item {item.key}: {e}')
                        await self._kvs.set_value(item.key, None)
                    else:
                        if self._expiration_secs < current_time - gzip_time:
                            logger.debug(f'Expired cache item {item.key}')
                            await self._kvs.set_value(item.key, None)
                        else:
                            logger.debug(f'Valid cache item {item.key}')
                    # NOTE(review): the break runs after processing index i, so up to
                    # _expiration_max_items + 1 entries are examined — confirm intent.
                    if i == self._expiration_max_items:
                        break
                    i += 1

            self._async_thread.run_coro(expire_kvs())

        logger.debug('Closing cache storage')
        try:
            self._async_thread.close()
        except KeyboardInterrupt:
            logger.warning('Shutdown interrupted by KeyboardInterrupt!')
        except Exception:
            logger.exception('Exception occurred while shutting down cache storage')
        finally:
            logger.debug('Cache storage closed')

    def retrieve_response(self, _: Spider, request: Request, current_time: int | None = None) -> Response | None:
        """Retrieve a response from the cache storage.

        Args:
            _: The spider (unused; kept for the Scrapy storage interface).
            request: The request whose cached response should be looked up.
            current_time: Override of "now" in epoch seconds, for testing.

        Returns:
            The cached response, or None on a cache miss or expired entry.

        Raises:
            ValueError: If the storage was never opened.
        """
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')
        if self._kvs is None:
            raise ValueError('Key value store not initialized')
        if self._fingerprinter is None:
            raise ValueError('Request fingerprinter not initialized')

        # Cache key is the hex of Scrapy's request fingerprint (same as store_response).
        key = self._fingerprinter.fingerprint(request).hex()
        value = self._async_thread.run_coro(self._kvs.get_value(key))

        if value is None:
            logger.debug('Cache miss', extra={'request': request})
            return None

        if current_time is None:
            current_time = int(time())
        # Age check via the gzip header mtime; skipped entirely when expiration is 0.
        # NOTE(review): a malformed cached value makes read_gzip_time raise here
        # (unlike close_spider, which catches it) — confirm this is acceptable.
        if 0 < self._expiration_secs < current_time - read_gzip_time(value):
            logger.debug('Cache expired', extra={'request': request})
            return None

        data = from_gzip(value)
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        # Let Scrapy pick the most specific Response subclass for the payload.
        respcls = responsetypes.from_args(headers=headers, url=url, body=body)

        logger.debug('Cache hit', extra={'request': request})
        return respcls(url=url, headers=headers, status=status, body=body)

    def store_response(self, _: Spider, request: Request, response: Response) -> None:
        """Store a response in the cache storage.

        Args:
            _: The spider (unused; kept for the Scrapy storage interface).
            request: The request the response belongs to (used for the cache key).
            response: The response to serialize and store.

        Raises:
            ValueError: If the storage was never opened.
        """
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')
        if self._kvs is None:
            raise ValueError('Key value store not initialized')
        if self._fingerprinter is None:
            raise ValueError('Request fingerprinter not initialized')

        key = self._fingerprinter.fingerprint(request).hex()
        # Only the fields needed to rebuild the response in retrieve_response.
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        value = to_gzip(data)
        self._async_thread.run_coro(self._kvs.set_value(key, value))
158
+
159
+
160
+ def to_gzip(data: dict, mtime: int | None = None) -> bytes:
161
+ """Dump a dictionary to a gzip-compressed byte stream."""
162
+ with io.BytesIO() as byte_stream:
163
+ with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file:
164
+ pickle.dump(data, gzip_file, protocol=4)
165
+ return byte_stream.getvalue()
166
+
167
+
168
def from_gzip(gzip_bytes: bytes) -> dict:
    """Decompress a gzip byte stream and unpickle the dictionary it contains.

    Args:
        gzip_bytes: Output of `to_gzip` (gzip-wrapped pickle bytes).

    Returns:
        The deserialized dictionary.
    """
    with gzip.GzipFile(fileobj=io.BytesIO(gzip_bytes), mode='rb') as archive:
        loaded: dict = pickle.load(archive)
    return loaded
173
+
174
+
175
def read_gzip_time(gzip_bytes: bytes) -> int:
    """Read the modification time from a gzip-compressed byte stream without decompressing the data.

    Args:
        gzip_bytes: A gzip member; only its 10-byte header is inspected.

    Returns:
        The MTIME field of the gzip header as epoch seconds.
    """
    # RFC 1952 member header: magic (2B), CM (1B), FLG (1B), MTIME (4B LE), XFL, OS.
    fields = struct.unpack_from('<HBBI2B', gzip_bytes, 0)
    timestamp: int = fields[3]
    return timestamp
181
+
182
+
183
def get_kvs_name(spider_name: str, max_length: int = 60) -> str:
    """Get the key value store name for a spider.

    The name is built by slugifying the spider name: every character outside
    `[a-zA-Z0-9-]` becomes a hyphen, runs of hyphens collapse to one, and
    leading/trailing hyphens are stripped. The slug is then prefixed with
    'httpcache-' and the whole name truncated to `max_length`.

    The documentation
    [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
    mentions that names can be up to 63 characters long, so the default max length is set to 60.

    Such naming isn't unique per spider, but should be sufficiently unique for most use cases.
    The name of the key value store should indicate to which spider it belongs, e.g. in
    the listing in the Apify's console.

    Args:
        spider_name: Value of the Spider instance's name attribute.
        max_length: Maximum length of the key value store name.

    Returns: Key value store name.

    Raises:
        ValueError: If the spider name contains only special characters.
    """
    slug = re.sub(r'-+', '-', re.sub(r'[^a-zA-Z0-9-]', '-', spider_name)).strip('-')
    if not slug:
        raise ValueError(f'Unsupported spider name: {spider_name!r} (slug: {slug!r})')
    return f'httpcache-{slug}'[:max_length]
apify/scrapy/utils.py CHANGED
@@ -44,6 +44,9 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
44
44
  settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
45
45
  settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750
46
46
 
47
+ # Set the default HTTPCache middleware storage backend to ApifyCacheStorage
48
+ settings['HTTPCACHE_STORAGE'] = 'apify.scrapy.extensions.ApifyCacheStorage'
49
+
47
50
  # Store the proxy configuration
48
51
  settings['APIFY_PROXY_SETTINGS'] = proxy_config
49
52
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 2.4.0b6
3
+ Version: 2.5.0b1
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Homepage, https://docs.apify.com/sdk/python/
6
6
  Project-URL: Apify homepage, https://apify.com
@@ -26,7 +26,9 @@ apify/scrapy/_logging_config.py,sha256=hFq90fNtZyjjJA7w2k-mtuEC8xCFiBMTalbwPDcai
26
26
  apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
27
  apify/scrapy/requests.py,sha256=vZEU1IwNotCkqZ-b-LfM15iAyr1LnZ_fF8oWyMFVVFI,6553
28
28
  apify/scrapy/scheduler.py,sha256=-r1wZjMmeRDPxZKGHO-EYDYpGdDgSPAdNgMFViqUK8E,6019
29
- apify/scrapy/utils.py,sha256=5cka33PWc_at14yjhnLkCvY4h-ySUgVVhhDLxTy39ZI,1965
29
+ apify/scrapy/utils.py,sha256=Ssfa-P9-g9XYP1suDce6dQ8ta7PfijiPoMl2iplE6Ow,2126
30
+ apify/scrapy/extensions/__init__.py,sha256=cVQ8CCtOsJsRP28YKZWSUsi4FBwxI-yPJRNSXPFSa_o,98
31
+ apify/scrapy/extensions/_httpcache.py,sha256=OOz3eia6LRHakLcfS3GGSlVEOV83BDbAUhCg9GApXvU,8771
30
32
  apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
31
33
  apify/scrapy/middlewares/apify_proxy.py,sha256=CDAOXS3bcVDZHM3B0GvhXbxEikMIadLF_0P73WL_nI4,5550
32
34
  apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,7 +38,7 @@ apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
36
38
  apify/storages/__init__.py,sha256=FW-z6ubuPnHGM-Wp15T8mR5q6lnpDGrCW-IkgZd5L30,177
37
39
  apify/storages/_request_list.py,sha256=7WpcdWvT3QxEBthynBpTVCSNDLXq6UbpQQmfUVyJ1jE,5849
38
40
  apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- apify-2.4.0b6.dist-info/METADATA,sha256=WG5egGrX4ImAVFO265if3fAAL0KllSnyfay3pcdwpiY,21558
40
- apify-2.4.0b6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- apify-2.4.0b6.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
42
- apify-2.4.0b6.dist-info/RECORD,,
41
+ apify-2.5.0b1.dist-info/METADATA,sha256=S9YTlXwiL8olRKz5HGUS1fXDLOzvTNJ-1O81U6OFn3s,21558
42
+ apify-2.5.0b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
43
+ apify-2.5.0b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
44
+ apify-2.5.0b1.dist-info/RECORD,,