apify 2.0.2b7.tar.gz → 2.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of apify might be problematic.

Files changed (40)
  1. {apify-2.0.2b7 → apify-2.1.0}/PKG-INFO +3 -3
  2. {apify-2.0.2b7 → apify-2.1.0}/pyproject.toml +20 -16
  3. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_actor.py +5 -2
  4. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_configuration.py +4 -2
  5. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_models.py +8 -1
  6. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_platform_event_manager.py +12 -1
  7. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_proxy_configuration.py +4 -1
  8. apify-2.1.0/src/apify/_utils.py +58 -0
  9. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_apify_storage_client.py +2 -0
  10. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/apify_proxy.py +4 -2
  11. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/actor_dataset_push.py +4 -1
  12. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/requests.py +7 -5
  13. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/scheduler.py +4 -1
  14. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/utils.py +4 -1
  15. apify-2.1.0/src/apify/storages/__init__.py +5 -0
  16. apify-2.1.0/src/apify/storages/_request_list.py +150 -0
  17. apify-2.0.2b7/src/apify/_utils.py +0 -26
  18. apify-2.0.2b7/src/apify/storages/__init__.py +0 -3
  19. {apify-2.0.2b7 → apify-2.1.0}/LICENSE +0 -0
  20. {apify-2.0.2b7 → apify-2.1.0}/README.md +0 -0
  21. {apify-2.0.2b7 → apify-2.1.0}/src/apify/__init__.py +1 -1
  22. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_consts.py +0 -0
  23. {apify-2.0.2b7 → apify-2.1.0}/src/apify/_crypto.py +0 -0
  24. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/__init__.py +0 -0
  25. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_dataset_client.py +0 -0
  26. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_dataset_collection_client.py +0 -0
  27. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_key_value_store_client.py +0 -0
  28. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_key_value_store_collection_client.py +0 -0
  29. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_request_queue_client.py +0 -0
  30. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_request_queue_collection_client.py +0 -0
  31. {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/py.typed +0 -0
  32. {apify-2.0.2b7 → apify-2.1.0}/src/apify/log.py +0 -0
  33. {apify-2.0.2b7 → apify-2.1.0}/src/apify/py.typed +0 -0
  34. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/__init__.py +2 -2
  35. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/__init__.py +0 -0
  36. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/py.typed +0 -0
  37. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/__init__.py +0 -0
  38. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/py.typed +0 -0
  39. {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/py.typed +0 -0
  40. {apify-2.0.2b7 → apify-2.1.0}/src/apify/storages/py.typed +0 -0

{apify-2.0.2b7 → apify-2.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 2.0.2b7
+Version: 2.1.0
 Summary: Apify SDK for Python
 License: Apache-2.0
 Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -21,9 +21,9 @@ Classifier: Topic :: Software Development :: Libraries
 Provides-Extra: scrapy
 Requires-Dist: apify-client (>=1.8.1)
 Requires-Dist: apify-shared (>=1.1.2)
-Requires-Dist: crawlee (>=0.3.9)
+Requires-Dist: crawlee (>=0.4.0,<0.5.0)
 Requires-Dist: cryptography (>=42.0.0)
-Requires-Dist: httpx (>=0.27.0)
+Requires-Dist: httpx (>=0.27.0,<0.28.0)
 Requires-Dist: lazy-object-proxy (>=1.10.0)
 Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
 Requires-Dist: typing-extensions (>=4.1.0)

{apify-2.0.2b7 → apify-2.1.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify"
-version = "2.0.2b7"
+version = "2.1.0"
 description = "Apify SDK for Python"
 authors = ["Apify Technologies s.r.o. <support@apify.com>"]
 license = "Apache-2.0"
@@ -41,16 +41,15 @@ keywords = [
 "Issue Tracker" = "https://github.com/apify/apify-sdk-python/issues"
 "Repository" = "https://github.com/apify/apify-sdk-python"
 
-# We use inclusive ordered comparison clauses for external packages intentionally in order to enhance SDK's
-# compatibility with external packages. This decision was discussed in detail in the following PR:
-# https://github.com/apify/apify-sdk-python/pull/154.
 [tool.poetry.dependencies]
 python = "^3.9"
 apify-client = ">=1.8.1"
 apify-shared = ">=1.1.2"
-crawlee = ">=0.3.9"
+crawlee = "~0.4.0"
 cryptography = ">=42.0.0"
-httpx = ">=0.27.0"
+# TODO: relax the upper bound once the issue is resolved:
+# https://github.com/apify/apify-sdk-python/issues/348
+httpx = "~0.27.0"
 lazy-object-proxy = ">=1.10.0"
 scrapy = { version = ">=2.11.0", optional = true }
 typing-extensions = ">=4.1.0"
@@ -65,13 +64,13 @@ pre-commit = "~4.0.0"
 pydoc-markdown = "~4.8.0"
 pytest = "~8.3.0"
 pytest-asyncio = "~0.24.0"
-pytest-cov = "~5.0.0"
+pytest-cov = "~6.0.0"
 pytest-only = "~2.1.0"
 pytest-timeout = "~2.3.0"
 pytest-xdist = "~3.6.0"
 respx = "~0.21.0"
-ruff = "~0.7.0"
-setuptools = "~75.0.0" # setuptools are used by pytest but not explicitly required
+ruff = "~0.8.0"
+setuptools = "~75.6.0" # setuptools are used by pytest but not explicitly required
 
 [tool.poetry.extras]
 scrapy = ["scrapy"]
@@ -82,8 +81,6 @@ line-length = 120
 [tool.ruff.lint]
 select = ["ALL"]
 ignore = [
-    "ANN101", # Missing type annotation for `self` in method
-    "ANN102", # Missing type annotation for `{name}` in classmethod
     "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename}
     "ASYNC109", # Async function definition with a `timeout` parameter
     "BLE001", # Do not catch blind exception
@@ -96,7 +93,6 @@ ignore = [
     "G004", # Logging statement uses f-string
     "ISC001", # This rule may cause conflicts when used with the formatter
     "FIX", # flake8-fixme
-    "PGH003", # Use specific rule codes when ignoring type issues
     "PLR0911", # Too many return statements
     "PLR0913", # Too many arguments in function definition
     "PLR0915", # Too many statements
@@ -141,6 +137,12 @@ indent-style = "space"
 docstring-quotes = "double"
 inline-quotes = "single"
 
+[tool.ruff.lint.flake8-type-checking]
+runtime-evaluated-base-classes = [
+    "pydantic.BaseModel",
+    "crawlee.configuration.Configuration",
+]
+
 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["id"]
 
@@ -180,15 +182,17 @@ exclude = []
 module = ['scrapy', 'scrapy.*', 'lazy_object_proxy']
 ignore_missing_imports = true
 
+[tool.basedpyright]
+pythonVersion = "3.9"
+typeCheckingMode = "standard"
+include = ["src", "tests"]
+
 [tool.coverage.report]
 exclude_lines = [
     "pragma: no cover",
     "if TYPE_CHECKING:",
-    "assert_never()"
+    "assert_never()",
 ]
 
-[tool.basedpyright]
-typeCheckingMode = "standard"
-
 [tool.ipdb]
 context = 7
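
The new `[tool.ruff.lint.flake8-type-checking]` table tells Ruff that Pydantic-based classes evaluate their annotations at runtime, so imports used only in those annotations must not be relocated under `if TYPE_CHECKING:`. It also explains why the file-level `# ruff: noqa: TCH00x` pragmas are dropped later in this diff. A minimal sketch of the failure the setting prevents (the `RetryPolicy` model is hypothetical):

```python
from __future__ import annotations

from datetime import timedelta  # must stay a real runtime import

from pydantic import BaseModel


class RetryPolicy(BaseModel):
    # Pydantic resolves the string annotation when the class is built, so
    # `timedelta` has to be importable at runtime; hiding the import behind
    # `if TYPE_CHECKING:` would make model creation fail.
    backoff: timedelta = timedelta(seconds=30)
```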

{apify-2.0.2b7 → apify-2.1.0}/src/apify/_actor.py
@@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
 
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -22,7 +21,7 @@ from apify._crypto import decrypt_input_secrets, load_private_key
 from apify._models import ActorRun
 from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
 from apify._proxy_configuration import ProxyConfiguration
-from apify._utils import get_system_info, is_running_in_ipython
+from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
 from apify.apify_storage_client import ApifyStorageClient
 from apify.log import _configure_logging, logger
 from apify.storages import Dataset, KeyValueStore, RequestQueue
@@ -31,6 +30,8 @@ if TYPE_CHECKING:
     import logging
     from types import TracebackType
 
+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction
 
     from apify._models import Webhook
@@ -39,6 +40,8 @@ if TYPE_CHECKING:
 MainReturnType = TypeVar('MainReturnType')
 
 
+@docs_name('Actor')
+@docs_group('Classes')
 class _ActorType:
     """The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
 

{apify-2.0.2b7 → apify-2.1.0}/src/apify/_configuration.py
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
@@ -11,7 +10,10 @@ from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 from crawlee.configuration import Configuration as CrawleeConfiguration
 
+from apify._utils import docs_group
 
+
+@docs_group('Classes')
 class Configuration(CrawleeConfiguration):
     """A class for specifying the configuration of an Actor.
 
@@ -321,4 +323,4 @@
 
 
 # Monkey-patch the base class so that it works with the extended configuration
-CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration  # type: ignore
+CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration  # type: ignore[method-assign]
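
The monkey-patch above means that crawlee-internal code asking the base class for the global configuration receives the Apify-extended `Configuration`. A rough illustration of the effect (assumed behaviour, not part of the diff):

```python
from apify._configuration import Configuration
from crawlee.configuration import Configuration as CrawleeConfiguration

# After the patch, the base-class accessor resolves to the Apify subclass,
# so platform-specific fields are available wherever crawlee looks them up.
config = CrawleeConfiguration.get_global_configuration()
assert isinstance(config, Configuration)
```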

{apify-2.0.2b7 → apify-2.1.0}/src/apify/_models.py
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
@@ -10,7 +9,10 @@ from apify_shared.consts import ActorJobStatus, MetaOrigin, WebhookEventType
 from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 
+from apify._utils import docs_group
 
+
+@docs_group('Data structures')
 class Webhook(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -29,12 +31,14 @@ class Webhook(BaseModel):
     ] = None
 
 
+@docs_group('Data structures')
 class ActorRunMeta(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
     origin: Annotated[MetaOrigin, Field()]
 
 
+@docs_group('Data structures')
 class ActorRunStats(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -55,6 +59,7 @@ class ActorRunStats(BaseModel):
     compute_units: Annotated[float, Field(alias='computeUnits')]
 
 
+@docs_group('Data structures')
 class ActorRunOptions(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -64,6 +69,7 @@ class ActorRunOptions(BaseModel):
     disk_mbytes: Annotated[int, Field(alias='diskMbytes')]
 
 
+@docs_group('Data structures')
 class ActorRunUsage(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -81,6 +87,7 @@ class ActorRunUsage(BaseModel):
     proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
 
 
+@docs_group('Data structures')
 class ActorRun(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 

{apify-2.0.2b7 → apify-2.1.0}/src/apify/_platform_event_manager.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime  # noqa: TCH003
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
 
 import websockets.client
@@ -19,6 +19,7 @@ from crawlee.events._types import (
     EventSystemInfoData,
 )
 
+from apify._utils import docs_group
 from apify.log import logger
 
 if TYPE_CHECKING:
@@ -30,11 +31,13 @@ if TYPE_CHECKING:
 __all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager']
 
 
+@docs_group('Data structures')
 class PersistStateEvent(BaseModel):
     name: Literal[Event.PERSIST_STATE]
     data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
 
 
+@docs_group('Data structures')
 class SystemInfoEventData(BaseModel):
     mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
     mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
@@ -61,26 +64,31 @@ class SystemInfoEventData(BaseModel):
         )
 
 
+@docs_group('Data structures')
 class SystemInfoEvent(BaseModel):
     name: Literal[Event.SYSTEM_INFO]
     data: SystemInfoEventData
 
 
+@docs_group('Data structures')
 class MigratingEvent(BaseModel):
     name: Literal[Event.MIGRATING]
     data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
 
 
+@docs_group('Data structures')
 class AbortingEvent(BaseModel):
     name: Literal[Event.ABORTING]
     data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
 
 
+@docs_group('Data structures')
 class ExitEvent(BaseModel):
     name: Literal[Event.EXIT]
     data: Annotated[EventExitData, Field(default_factory=EventExitData)]
 
 
+@docs_group('Data structures')
 class EventWithoutData(BaseModel):
     name: Literal[
         Event.SESSION_RETIRED,
@@ -93,11 +101,13 @@ class EventWithoutData(BaseModel):
     ]
     data: Any = None
 
 
+@docs_group('Data structures')
 class DeprecatedEvent(BaseModel):
     name: Literal['cpuInfo']
     data: Annotated[dict[str, Any], Field(default_factory=dict)]
 
 
+@docs_group('Data structures')
 class UnknownEvent(BaseModel):
     name: str
     data: Annotated[dict[str, Any], Field(default_factory=dict)]
@@ -125,6 +135,7 @@ event_data_adapter: TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent] =
 )
 
 
+@docs_group('Classes')
 class PlatformEventManager(EventManager):
     """A class for managing Actor events.
 

{apify-2.0.2b7 → apify-2.1.0}/src/apify/_proxy_configuration.py
@@ -16,6 +16,7 @@ from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
 from crawlee.proxy_configuration import _NewUrlFunction
 
 from apify._configuration import Configuration
+from apify._utils import docs_group
 from apify.log import logger
 
 if TYPE_CHECKING:
@@ -68,6 +69,7 @@ def _check(
         raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
 
 
+@docs_group('Classes')
 @dataclass
 class ProxyInfo(CrawleeProxyInfo):
     """Provides information about a proxy connection that is used for requests."""
@@ -87,6 +89,7 @@ class ProxyInfo(CrawleeProxyInfo):
     """
 
 
+@docs_group('Classes')
 class ProxyConfiguration(CrawleeProxyConfiguration):
     """Configures a connection to a proxy server with the provided options.
 
@@ -277,7 +280,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
             return
 
         status = None
-        async with httpx.AsyncClient(proxies=proxy_info.url, timeout=10) as client:
+        async with httpx.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
             for _ in range(2):
                 try:
                     response = await client.get(proxy_status_url)
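
The `proxies=` to `proxy=` rename follows the httpx API: recent httpx releases accept a single `proxy=` argument and deprecate the plural `proxies=` mapping, which is also why the dependency is pinned to `~0.27.0` above. A hedged, self-contained sketch (the URLs are placeholders):

```python
import asyncio

import httpx


async def check_via_proxy(proxy_url: str) -> int:
    # A single proxy URL is passed via `proxy=`; the older `proxies=` dict
    # form is deprecated in current httpx versions.
    async with httpx.AsyncClient(proxy=proxy_url, timeout=10) as client:
        response = await client.get('https://example.com')
        return response.status_code


# asyncio.run(check_via_proxy('http://user:password@proxy.example.com:8000'))
```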

apify-2.1.0/src/apify/_utils.py (new file)
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import builtins
+import sys
+from importlib import metadata
+from typing import Callable, Literal
+
+
+def get_system_info() -> dict:
+    python_version = '.'.join([str(x) for x in sys.version_info[:3]])
+
+    system_info: dict[str, str | bool] = {
+        'apify_sdk_version': metadata.version('apify'),
+        'apify_client_version': metadata.version('apify-client'),
+        'crawlee_version': metadata.version('crawlee'),
+        'python_version': python_version,
+        'os': sys.platform,
+    }
+
+    if is_running_in_ipython():
+        system_info['is_running_in_ipython'] = True
+
+    return system_info
+
+
+def is_running_in_ipython() -> bool:
+    return getattr(builtins, '__IPYTHON__', False)
+
+
+GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Errors', 'Functions']
+
+
+def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001
+    """Decorator to mark symbols for rendering and grouping in documentation.
+
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
+    """
+
+    def wrapper(func: Callable) -> Callable:
+        return func
+
+    return wrapper
+
+
+def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
+    """Decorator for renaming symbols in documentation.
+
+    This changes the rendered name of the symbol only in the rendered web documentation.
+
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
+    """
+
+    def wrapper(func: Callable) -> Callable:
+        return func
+
+    return wrapper
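
Both helpers are documentation-only markers: they take the group or display name, ignore it at runtime, and hand the decorated object back unchanged. Usage mirrors what `_actor.py` does elsewhere in this diff (sketch):

```python
from apify._utils import docs_group, docs_name


@docs_name('Actor')      # name rendered in the web documentation
@docs_group('Classes')   # section the symbol is grouped under
class _ActorType:
    """Behaves exactly as if it were undecorated - the wrappers are no-ops."""
```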

{apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_apify_storage_client.py
@@ -5,6 +5,7 @@ from crawlee._utils.crypto import crypto_random_object_id
 from crawlee.base_storage_client import BaseStorageClient
 
 from apify._configuration import Configuration
+from apify._utils import docs_group
 from apify.apify_storage_client._dataset_client import DatasetClient
 from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
 from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
@@ -13,6 +14,7 @@ from apify.apify_storage_client._request_queue_client import RequestQueueClient
 from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
 
 
+@docs_group('Classes')
 class ApifyStorageClient(BaseStorageClient):
     """A storage client implementation based on the Apify platform storage."""
 

{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/apify_proxy.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
-    from scrapy import Request, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(

{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/actor_dataset_push.py
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter
 
 try:
-    from scrapy import Item, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',

{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/requests.py
@@ -42,8 +42,10 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
     Returns:
         The converted Apify request if the conversion was successful, otherwise None.
     """
-    if not isinstance(cast(Any, scrapy_request), Request):
-        Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')
+    if not isinstance(scrapy_request, Request):
+        Actor.log.warning(  # type: ignore[unreachable]
+            'Failed to convert to Apify request: Scrapy request must be a Request instance.'
+        )
         return None
 
     call_id = crypto_random_object_id(8)
@@ -53,7 +55,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
     if _is_request_produced_by_middleware(scrapy_request):
         unique_key = compute_unique_key(
             url=scrapy_request.url,
-            method=scrapy_request.method,
+            method=scrapy_request.method,  # type: ignore[arg-type]  # str vs literal
             payload=scrapy_request.body,
             use_extended_unique_key=True,
         )
@@ -80,9 +82,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
 
     # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
-        apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
+        apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
     else:
-        Actor.log.warning(
+        Actor.log.warning(  # type: ignore[unreachable]
             f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
         )
 

{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/scheduler.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import traceback
+from typing import TYPE_CHECKING
 
 from apify._configuration import Configuration
 from apify.apify_storage_client import ApifyStorageClient
@@ -8,8 +9,10 @@ from apify.apify_storage_client import ApifyStorageClient
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
-    from scrapy.http.request import Request  # noqa: TCH002
     from scrapy.utils.reactor import is_asyncio_reactor_installed
+
+    if TYPE_CHECKING:
+        from scrapy.http.request import Request
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',

{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/utils.py
@@ -2,14 +2,17 @@ from __future__ import annotations
 
 import asyncio
 from base64 import b64encode
+from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
 from apify_shared.utils import ignore_docs
 
 try:
-    from scrapy.settings import Settings  # noqa: TCH002
     from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
+
+    if TYPE_CHECKING:
+        from scrapy.settings import Settings
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
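
All of the Scrapy integration modules in this release share the same refactor: imports needed only for type annotations move under `if TYPE_CHECKING:` (replacing the old `# noqa: TCH002` pragmas), while at least one genuine import stays in the `try:` block so a missing "scrapy" extra still raises the friendly `ImportError`. The shape of the pattern, reduced to a sketch (the `load_settings` helper is hypothetical):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

try:
    # A real runtime import keeps the ImportError guard working.
    from scrapy.utils.project import get_project_settings

    if TYPE_CHECKING:
        # Needed only for annotations; skipped entirely at runtime.
        from scrapy.settings import Settings
except ImportError as exc:
    raise ImportError(
        'To use this module, you need to install the "scrapy" extra.',
    ) from exc


def load_settings() -> Settings:
    return get_project_settings()
```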

apify-2.1.0/src/apify/storages/__init__.py (new file)
@@ -0,0 +1,5 @@
+from crawlee.storages import Dataset, KeyValueStore, RequestQueue
+
+from ._request_list import RequestList
+
+__all__ = ['Dataset', 'KeyValueStore', 'RequestList', 'RequestQueue']

apify-2.1.0/src/apify/storages/_request_list.py (new file)
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import asyncio
+import re
+from asyncio import Task
+from functools import partial
+from typing import Annotated, Any, Union
+
+from pydantic import BaseModel, Field, TypeAdapter
+
+from crawlee import Request
+from crawlee._types import HttpMethod
+from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
+from crawlee.storages import RequestList as CrawleeRequestList
+
+from apify._utils import docs_group
+
+URL_NO_COMMAS_REGEX = re.compile(
+    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
+)
+
+
+class _RequestDetails(BaseModel):
+    method: HttpMethod = 'GET'
+    payload: str = ''
+    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
+    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}
+
+
+class _RequestsFromUrlInput(_RequestDetails):
+    requests_from_url: str = Field(alias='requestsFromUrl')
+
+
+class _SimpleUrlInput(_RequestDetails):
+    url: str
+
+
+url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+
+
+@docs_group('Classes')
+class RequestList(CrawleeRequestList):
+    """Extends crawlee RequestList.
+
+    Method open is used to create RequestList from actor's requestListSources input.
+    """
+
+    @staticmethod
+    async def open(
+        name: str | None = None,
+        request_list_sources_input: list[dict[str, Any]] | None = None,
+        http_client: BaseHttpClient | None = None,
+    ) -> RequestList:
+        """Creates RequestList from Actor input requestListSources.
+
+        Args:
+            name: Name of the returned RequestList.
+            request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
+            http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys.
+
+        Returns:
+            RequestList created from request_list_sources_input.
+
+        ### Usage
+
+        ```python
+        example_input = [
+            # Gather urls from response body.
+            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+            # Directly include this url.
+            {'url': 'https://crawlee.dev', 'method': 'GET'}
+        ]
+        request_list = await RequestList.open(request_list_sources_input=example_input)
+        ```
+        """
+        request_list_sources_input = request_list_sources_input or []
+        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+
+    @staticmethod
+    async def _create_request_list(
+        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
+    ) -> RequestList:
+        if not http_client:
+            http_client = HttpxHttpClient()
+
+        url_inputs = url_input_adapter.validate_python(request_list_sources_input)
+
+        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
+        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
+
+        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+
+        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+
+    @staticmethod
+    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
+        return [
+            Request.from_url(
+                method=request_input.method,
+                url=request_input.url,
+                payload=request_input.payload.encode('utf-8'),
+                headers=request_input.headers,
+                user_data=request_input.user_data,
+            )
+            for request_input in simple_url_inputs
+        ]
+
+    @staticmethod
+    async def _fetch_requests_from_url(
+        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
+    ) -> list[Request]:
+        """Crete list of requests from url.
+
+        Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting
+        callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from
+        collected links and additional inputs stored in other attributes of each remote_url_requests_inputs.
+        """
+        created_requests: list[Request] = []
+
+        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+            """Callback to scrape response body with regexp and create Requests from matches."""
+            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            created_requests.extend(
+                [
+                    Request.from_url(
+                        match.group(0),
+                        method=request_input.method,
+                        payload=request_input.payload.encode('utf-8'),
+                        headers=request_input.headers,
+                        user_data=request_input.user_data,
+                    )
+                    for match in matches
+                ]
+            )
+
+        remote_url_requests = []
+        for remote_url_requests_input in remote_url_requests_inputs:
+            get_response_task = asyncio.create_task(
+                http_client.send_request(
+                    method='GET',
+                    url=remote_url_requests_input.requests_from_url,
+                )
+            )
+
+            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            remote_url_requests.append(get_response_task)
+
+        await asyncio.gather(*remote_url_requests)
+        return created_requests
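
A hedged sketch of wiring the new `RequestList` to Actor input; the `requestListSources` field name follows the class docstring above, and the surrounding input schema is an assumption:

```python
from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # Plain {'url': ...} entries are used directly; {'requestsFromUrl': ...}
        # entries are fetched and scanned for URLs before requests are created.
        request_list = await RequestList.open(
            name='start-urls',
            request_list_sources_input=actor_input.get('requestListSources', []),
        )
        Actor.log.info(f'Request list ready: {request_list}')
```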

apify-2.0.2b7/src/apify/_utils.py (removed)
@@ -1,26 +0,0 @@
-from __future__ import annotations
-
-import builtins
-import sys
-from importlib import metadata
-
-
-def get_system_info() -> dict:
-    python_version = '.'.join([str(x) for x in sys.version_info[:3]])
-
-    system_info: dict[str, str | bool] = {
-        'apify_sdk_version': metadata.version('apify'),
-        'apify_client_version': metadata.version('apify-client'),
-        'crawlee_version': metadata.version('crawlee'),
-        'python_version': python_version,
-        'os': sys.platform,
-    }
-
-    if is_running_in_ipython():
-        system_info['is_running_in_ipython'] = True
-
-    return system_info
-
-
-def is_running_in_ipython() -> bool:
-    return getattr(builtins, '__IPYTHON__', False)

apify-2.0.2b7/src/apify/storages/__init__.py (removed)
@@ -1,3 +0,0 @@
-from crawlee.storages import Dataset, KeyValueStore, RequestQueue
-
-__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']

{apify-2.0.2b7 → apify-2.1.0}/src/apify/__init__.py
@@ -13,8 +13,8 @@ __version__ = metadata.version('apify')
 
 __all__ = [
     'Actor',
-    'Event',
     'Configuration',
+    'Event',
     'ProxyConfiguration',
     'ProxyInfo',
     'Request',

{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/__init__.py
@@ -3,9 +3,9 @@ from apify.scrapy.scheduler import ApifyScheduler
 from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
 
 __all__ = [
-    'to_apify_request',
-    'to_scrapy_request',
     'ApifyScheduler',
     'get_basic_auth_header',
     'get_running_event_loop_id',
+    'to_apify_request',
+    'to_scrapy_request',
 ]