apify 1.4.2b3__tar.gz → 1.5.0b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

Files changed (46)
  1. {apify-1.4.2b3 → apify-1.5.0b1}/PKG-INFO +1 -1
  2. {apify-1.4.2b3 → apify-1.5.0b1}/pyproject.toml +1 -1
  3. apify-1.5.0b1/src/apify/scrapy/__init__.py +3 -0
  4. apify-1.5.0b1/src/apify/scrapy/middlewares/__init__.py +2 -0
  5. apify-1.5.0b1/src/apify/scrapy/middlewares/apify_proxy.py +145 -0
  6. apify-1.4.2b3/src/apify/scrapy/middlewares.py → apify-1.5.0b1/src/apify/scrapy/middlewares/apify_retry.py +3 -3
  7. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/scrapy/utils.py +11 -0
  8. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify.egg-info/PKG-INFO +1 -1
  9. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify.egg-info/SOURCES.txt +3 -1
  10. apify-1.4.2b3/src/apify/scrapy/__init__.py +0 -4
  11. {apify-1.4.2b3 → apify-1.5.0b1}/LICENSE +0 -0
  12. {apify-1.4.2b3 → apify-1.5.0b1}/README.md +0 -0
  13. {apify-1.4.2b3 → apify-1.5.0b1}/setup.cfg +0 -0
  14. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/__init__.py +0 -0
  15. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_crypto.py +0 -0
  16. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/__init__.py +0 -0
  17. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/file_storage_utils.py +0 -0
  18. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/memory_storage_client.py +0 -0
  19. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/__init__.py +0 -0
  20. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/base_resource_client.py +0 -0
  21. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -0
  22. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/dataset.py +0 -0
  23. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/dataset_collection.py +0 -0
  24. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/key_value_store.py +0 -0
  25. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -0
  26. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/request_queue.py +0 -0
  27. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_memory_storage/resource_clients/request_queue_collection.py +0 -0
  28. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/_utils.py +0 -0
  29. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/actor.py +0 -0
  30. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/config.py +0 -0
  31. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/consts.py +0 -0
  32. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/event_manager.py +0 -0
  33. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/log.py +0 -0
  34. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/proxy_configuration.py +0 -0
  35. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/py.typed +0 -0
  36. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/scrapy/pipelines.py +0 -0
  37. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/scrapy/scheduler.py +0 -0
  38. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/storages/__init__.py +0 -0
  39. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/storages/base_storage.py +0 -0
  40. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/storages/dataset.py +0 -0
  41. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/storages/key_value_store.py +0 -0
  42. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/storages/request_queue.py +0 -0
  43. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify/storages/storage_client_manager.py +0 -0
  44. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify.egg-info/dependency_links.txt +0 -0
  45. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify.egg-info/requires.txt +0 -0
  46. {apify-1.4.2b3 → apify-1.5.0b1}/src/apify.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: apify
3
- Version: 1.4.2b3
3
+ Version: 1.5.0b1
4
4
  Summary: Apify SDK for Python
5
5
  Author-email: "Apify Technologies s.r.o." <support@apify.com>
6
6
  License: Apache Software License
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "apify"
3
- version = "1.4.2b3"
3
+ version = "1.5.0b1"
4
4
  description = "Apify SDK for Python"
5
5
  readme = "README.md"
6
6
  license = { text = "Apache Software License" }
@@ -0,0 +1,3 @@
1
+ from .pipelines import ActorDatasetPushPipeline
2
+ from .scheduler import ApifyScheduler
3
+ from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
@@ -0,0 +1,2 @@
1
+ from .apify_proxy import ApifyHttpProxyMiddleware
2
+ from .apify_retry import ApifyRetryMiddleware
@@ -0,0 +1,145 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+ from urllib.parse import ParseResult, urlparse
5
+
6
+ from scrapy.core.downloader.handlers.http11 import TunnelError
7
+ from scrapy.exceptions import NotConfigured
8
+
9
+ from ...actor import Actor
10
+ from ...proxy_configuration import ProxyConfiguration
11
+ from ..utils import get_basic_auth_header
12
+
13
+ if TYPE_CHECKING:
14
+ from scrapy import Request, Spider
15
+ from scrapy.crawler import Crawler
16
+
17
+
18
+ class ApifyHttpProxyMiddleware:
19
+ """Apify HTTP proxy middleware for Scrapy.
20
+
21
+ This middleware enhances request processing by adding a 'proxy' field to the request's meta and an authentication
22
+ header. It draws inspiration from the `HttpProxyMiddleware` included by default in Scrapy projects. The proxy URL
23
+ is sourced from the settings under the `APIFY_PROXY_SETTINGS` key. The value of this key, a dictionary, should be
24
+ provided by the Actor input. An example of the proxy settings:
25
+
26
+ proxy_settings = {'useApifyProxy': True, 'apifyProxyGroups': []}
27
+ """
28
+
29
+ def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
30
+ """Create a new instance.
31
+
32
+ Args:
33
+ proxy_settings: Dictionary containing proxy settings, provided by the Actor input.
34
+ auth_encoding: Encoding for basic authentication (default is 'latin-1').
35
+ """
36
+ self._proxy_settings = proxy_settings
37
+ self._proxy_cfg_internal: ProxyConfiguration | None = None
38
+
39
+ @classmethod
40
+ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
41
+ """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.
42
+
43
+ Args:
44
+ cls: Class type.
45
+ crawler: Scrapy Crawler object.
46
+
47
+ Returns:
48
+ ApifyHttpProxyMiddleware: Instance of the class.
49
+ """
50
+ proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')
51
+
52
+ if proxy_settings is None:
53
+ Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
54
+ raise NotConfigured
55
+
56
+ use_apify_proxy = proxy_settings.get('useApifyProxy', False)
57
+
58
+ if use_apify_proxy is not True:
59
+ Actor.log.warning(
60
+ 'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
61
+ )
62
+ raise NotConfigured
63
+
64
+ return cls(proxy_settings)
65
+
66
+ async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
67
+ """Process a Scrapy request by assigning a new proxy.
68
+
69
+ Args:
70
+ request: Scrapy Request object.
71
+ spider: Scrapy Spider object.
72
+
73
+ Raises:
74
+ ValueError: If username and password are not provided in the proxy URL.
75
+
76
+ Returns:
77
+ None: The request is processed and middleware pipeline can continue.
78
+ """
79
+ Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
80
+ url = await self._get_new_proxy_url()
81
+
82
+ if not (url.username and url.password):
83
+ raise ValueError('Username and password must be provided in the proxy URL.')
84
+
85
+ request.meta['proxy'] = url.geturl()
86
+ basic_auth_header = get_basic_auth_header(url.username, url.password)
87
+ request.headers[b'Proxy-Authorization'] = basic_auth_header
88
+
89
+ Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')
90
+
91
+ def process_exception(
92
+ self: ApifyHttpProxyMiddleware,
93
+ request: Request,
94
+ exception: Exception,
95
+ spider: Spider,
96
+ ) -> None | Request:
97
+ """Process an exception that occurs during request processing.
98
+
99
+ Args:
100
+ request: Scrapy Request object.
101
+ exception: Exception object.
102
+ spider: Scrapy Spider object.
103
+
104
+ Returns:
105
+ If a TunnelError occurs, return the request object so that it is rescheduled; this stops further process_exception() calls in the middleware chain.
106
+ Return None otherwise to allow the continuation of request processing.
107
+ """
108
+ Actor.log.debug(
109
+ f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
110
+ )
111
+
112
+ if isinstance(exception, TunnelError):
113
+ Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...')
114
+ return request
115
+
116
+ return None
117
+
118
+ async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
119
+ """Get a new proxy URL.
120
+
121
+ Raises:
122
+ NotConfigured: If creation of the proxy configuration fails.
123
+
124
+ Returns:
125
+ ParseResult: New proxy URL.
126
+ """
127
+ # Get proxy configuration, creating it if necessary
128
+ proxy_cfg = (
129
+ self._proxy_cfg_internal
130
+ if isinstance(self._proxy_cfg_internal, ProxyConfiguration)
131
+ else await Actor.create_proxy_configuration(actor_proxy_input=self._proxy_settings)
132
+ )
133
+
134
+ # If the proxy configuration is still not available, raise an error. However, this should not happen due
135
+ # to the checks in the `from_crawler` method.
136
+ if proxy_cfg is None:
137
+ Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
138
+ raise NotConfigured
139
+
140
+ # Store the proxy configuration for future use
141
+ self._proxy_cfg_internal = proxy_cfg
142
+
143
+ # Get a new proxy URL and return it
144
+ new_url = await proxy_cfg.new_url()
145
+ return urlparse(new_url)
@@ -11,14 +11,14 @@ except ImportError as exc:
11
11
  'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
12
12
  ) from exc
13
13
 
14
- from ..actor import Actor
15
- from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
14
+ from ...actor import Actor
15
+ from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
16
16
 
17
17
  if TYPE_CHECKING:
18
18
  from scrapy import Spider
19
19
  from scrapy.http import Request, Response
20
20
 
21
- from ..storages import RequestQueue
21
+ from ...storages import RequestQueue
22
22
 
23
23
 
24
24
  class ApifyRetryMiddleware(RetryMiddleware):
@@ -3,6 +3,10 @@ from __future__ import annotations
3
3
  import asyncio
4
4
  import codecs
5
5
  import pickle
6
+ from base64 import b64encode
7
+ from urllib.parse import unquote
8
+
9
+ from scrapy.utils.python import to_bytes
6
10
 
7
11
  try:
8
12
  from scrapy import Request, Spider
@@ -19,6 +23,13 @@ from ..storages import RequestQueue, StorageClientManager
19
23
  nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
20
24
 
21
25
 
26
+ def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
27
+ """Generate a basic authentication header for the given username and password."""
28
+ string = f'{unquote(username)}:{unquote(password)}'
29
+ user_pass = to_bytes(string, encoding=auth_encoding)
30
+ return b'Basic ' + b64encode(user_pass)
31
+
32
+
22
33
  def get_running_event_loop_id() -> int:
23
34
  """Get the ID of the currently running event loop.
24
35
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: apify
3
- Version: 1.4.2b3
3
+ Version: 1.5.0b1
4
4
  Summary: Apify SDK for Python
5
5
  Author-email: "Apify Technologies s.r.o." <support@apify.com>
6
6
  License: Apache Software License
@@ -29,10 +29,12 @@ src/apify/_memory_storage/resource_clients/key_value_store_collection.py
29
29
  src/apify/_memory_storage/resource_clients/request_queue.py
30
30
  src/apify/_memory_storage/resource_clients/request_queue_collection.py
31
31
  src/apify/scrapy/__init__.py
32
- src/apify/scrapy/middlewares.py
33
32
  src/apify/scrapy/pipelines.py
34
33
  src/apify/scrapy/scheduler.py
35
34
  src/apify/scrapy/utils.py
35
+ src/apify/scrapy/middlewares/__init__.py
36
+ src/apify/scrapy/middlewares/apify_proxy.py
37
+ src/apify/scrapy/middlewares/apify_retry.py
36
38
  src/apify/storages/__init__.py
37
39
  src/apify/storages/base_storage.py
38
40
  src/apify/storages/dataset.py
@@ -1,4 +0,0 @@
1
- from .middlewares import ApifyRetryMiddleware
2
- from .pipelines import ActorDatasetPushPipeline
3
- from .scheduler import ApifyScheduler
4
- from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes