apify 2.0.2b7__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {apify-2.0.2b7 → apify-2.1.0}/PKG-INFO +3 -3
- {apify-2.0.2b7 → apify-2.1.0}/pyproject.toml +20 -16
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_actor.py +5 -2
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_configuration.py +4 -2
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_models.py +8 -1
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_platform_event_manager.py +12 -1
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_proxy_configuration.py +4 -1
- apify-2.1.0/src/apify/_utils.py +58 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_apify_storage_client.py +2 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/apify_proxy.py +4 -2
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/actor_dataset_push.py +4 -1
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/requests.py +7 -5
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/scheduler.py +4 -1
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/utils.py +4 -1
- apify-2.1.0/src/apify/storages/__init__.py +5 -0
- apify-2.1.0/src/apify/storages/_request_list.py +150 -0
- apify-2.0.2b7/src/apify/_utils.py +0 -26
- apify-2.0.2b7/src/apify/storages/__init__.py +0 -3
- {apify-2.0.2b7 → apify-2.1.0}/LICENSE +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/README.md +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/__init__.py +1 -1
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_consts.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/_crypto.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/__init__.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_dataset_client.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_dataset_collection_client.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_key_value_store_client.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_key_value_store_collection_client.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_request_queue_client.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_request_queue_collection_client.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/py.typed +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/log.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/py.typed +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/__init__.py +2 -2
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/py.typed +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/py.typed +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/py.typed +0 -0
- {apify-2.0.2b7 → apify-2.1.0}/src/apify/storages/py.typed +0 -0
{apify-2.0.2b7 → apify-2.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 2.0.2b7
+Version: 2.1.0
 Summary: Apify SDK for Python
 License: Apache-2.0
 Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -21,9 +21,9 @@ Classifier: Topic :: Software Development :: Libraries
 Provides-Extra: scrapy
 Requires-Dist: apify-client (>=1.8.1)
 Requires-Dist: apify-shared (>=1.1.2)
-Requires-Dist: crawlee (>=0.
+Requires-Dist: crawlee (>=0.4.0,<0.5.0)
 Requires-Dist: cryptography (>=42.0.0)
-Requires-Dist: httpx (>=0.27.0)
+Requires-Dist: httpx (>=0.27.0,<0.28.0)
 Requires-Dist: lazy-object-proxy (>=1.10.0)
 Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
 Requires-Dist: typing-extensions (>=4.1.0)
{apify-2.0.2b7 → apify-2.1.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "apify"
-version = "2.0.2b7"
+version = "2.1.0"
 description = "Apify SDK for Python"
 authors = ["Apify Technologies s.r.o. <support@apify.com>"]
 license = "Apache-2.0"
@@ -41,16 +41,15 @@ keywords = [
 "Issue Tracker" = "https://github.com/apify/apify-sdk-python/issues"
 "Repository" = "https://github.com/apify/apify-sdk-python"
 
-# We use inclusive ordered comparison clauses for external packages intentionally in order to enhance SDK's
-# compatibility with external packages. This decision was discussed in detail in the following PR:
-# https://github.com/apify/apify-sdk-python/pull/154.
 [tool.poetry.dependencies]
 python = "^3.9"
 apify-client = ">=1.8.1"
 apify-shared = ">=1.1.2"
-crawlee = "
+crawlee = "~0.4.0"
 cryptography = ">=42.0.0"
-httpx = ">=0.27.0"
+# TODO: relax the upper bound once the issue is resolved:
+# https://github.com/apify/apify-sdk-python/issues/348
+httpx = "~0.27.0"
 lazy-object-proxy = ">=1.10.0"
 scrapy = { version = ">=2.11.0", optional = true }
 typing-extensions = ">=4.1.0"
@@ -65,13 +64,13 @@ pre-commit = "~4.0.0"
 pydoc-markdown = "~4.8.0"
 pytest = "~8.3.0"
 pytest-asyncio = "~0.24.0"
-pytest-cov = "~
+pytest-cov = "~6.0.0"
 pytest-only = "~2.1.0"
 pytest-timeout = "~2.3.0"
 pytest-xdist = "~3.6.0"
 respx = "~0.21.0"
-ruff = "~0.
-setuptools = "~75.
+ruff = "~0.8.0"
+setuptools = "~75.6.0" # setuptools are used by pytest but not explicitly required
 
 [tool.poetry.extras]
 scrapy = ["scrapy"]
@@ -82,8 +81,6 @@ line-length = 120
 [tool.ruff.lint]
 select = ["ALL"]
 ignore = [
-    "ANN101", # Missing type annotation for `self` in method
-    "ANN102", # Missing type annotation for `{name}` in classmethod
     "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename}
     "ASYNC109", # Async function definition with a `timeout` parameter
     "BLE001", # Do not catch blind exception
@@ -96,7 +93,6 @@ ignore = [
     "G004", # Logging statement uses f-string
     "ISC001", # This rule may cause conflicts when used with the formatter
     "FIX", # flake8-fixme
-    "PGH003", # Use specific rule codes when ignoring type issues
     "PLR0911", # Too many return statements
     "PLR0913", # Too many arguments in function definition
     "PLR0915", # Too many statements
@@ -141,6 +137,12 @@ indent-style = "space"
 docstring-quotes = "double"
 inline-quotes = "single"
 
+[tool.ruff.lint.flake8-type-checking]
+runtime-evaluated-base-classes = [
+    "pydantic.BaseModel",
+    "crawlee.configuration.Configuration",
+]
+
 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["id"]
 
@@ -180,15 +182,17 @@ exclude = []
 module = ['scrapy', 'scrapy.*', 'lazy_object_proxy']
 ignore_missing_imports = true
 
+[tool.basedpyright]
+pythonVersion = "3.9"
+typeCheckingMode = "standard"
+include = ["src", "tests"]
+
 [tool.coverage.report]
 exclude_lines = [
     "pragma: no cover",
     "if TYPE_CHECKING:",
-    "assert_never()"
+    "assert_never()",
 ]
 
-[tool.basedpyright]
-typeCheckingMode = "standard"
-
 [tool.ipdb]
 context = 7
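The new `[tool.ruff.lint.flake8-type-checking]` table replaces the file-level `# ruff: noqa: TCH001 TCH002 TCH003` pragmas that are removed from `_configuration.py` and `_models.py` below: instead of silencing the rules, ruff is told that subclasses of `pydantic.BaseModel` and `crawlee.configuration.Configuration` need their annotation imports at runtime. A minimal sketch (not SDK code) of why that matters:

```python
# Minimal sketch, not SDK code: pydantic resolves field annotations at runtime,
# so the types they mention must be real runtime imports.
from __future__ import annotations

from datetime import datetime  # moving this under `if TYPE_CHECKING:` would break the model

from pydantic import BaseModel


class RunInfo(BaseModel):  # hypothetical model, analogous to those in src/apify/_models.py
    started_at: datetime


# Works because `datetime` exists at runtime; with the import guarded by TYPE_CHECKING,
# pydantic could not resolve the annotation when the model is defined or validated.
print(RunInfo(started_at=datetime(2024, 1, 1)))
```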
{apify-2.0.2b7 → apify-2.1.0}/src/apify/_actor.py

@@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
 
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -22,7 +21,7 @@ from apify._crypto import decrypt_input_secrets, load_private_key
 from apify._models import ActorRun
 from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
 from apify._proxy_configuration import ProxyConfiguration
-from apify._utils import get_system_info, is_running_in_ipython
+from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
 from apify.apify_storage_client import ApifyStorageClient
 from apify.log import _configure_logging, logger
 from apify.storages import Dataset, KeyValueStore, RequestQueue
@@ -31,6 +30,8 @@ if TYPE_CHECKING:
     import logging
     from types import TracebackType
 
+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction
 
     from apify._models import Webhook
@@ -39,6 +40,8 @@ if TYPE_CHECKING:
 MainReturnType = TypeVar('MainReturnType')
 
 
+@docs_name('Actor')
+@docs_group('Classes')
 class _ActorType:
     """The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
 
{apify-2.0.2b7 → apify-2.1.0}/src/apify/_configuration.py

@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
@@ -11,7 +10,10 @@ from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 from crawlee.configuration import Configuration as CrawleeConfiguration
 
+from apify._utils import docs_group
 
+
+@docs_group('Classes')
 class Configuration(CrawleeConfiguration):
     """A class for specifying the configuration of an Actor.
 
@@ -321,4 +323,4 @@ class Configuration(CrawleeConfiguration):
 
 
 # Monkey-patch the base class so that it works with the extended configuration
-CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration  # type: ignore
+CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration  # type: ignore[method-assign]
{apify-2.0.2b7 → apify-2.1.0}/src/apify/_models.py

@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
@@ -10,7 +9,10 @@ from apify_shared.consts import ActorJobStatus, MetaOrigin, WebhookEventType
 from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 
+from apify._utils import docs_group
 
+
+@docs_group('Data structures')
 class Webhook(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -29,12 +31,14 @@ class Webhook(BaseModel):
     ] = None
 
 
+@docs_group('Data structures')
 class ActorRunMeta(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
     origin: Annotated[MetaOrigin, Field()]
 
 
+@docs_group('Data structures')
 class ActorRunStats(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -55,6 +59,7 @@ class ActorRunStats(BaseModel):
     compute_units: Annotated[float, Field(alias='computeUnits')]
 
 
+@docs_group('Data structures')
 class ActorRunOptions(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -64,6 +69,7 @@ class ActorRunOptions(BaseModel):
     disk_mbytes: Annotated[int, Field(alias='diskMbytes')]
 
 
+@docs_group('Data structures')
 class ActorRunUsage(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -81,6 +87,7 @@ class ActorRunUsage(BaseModel):
     proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
 
 
+@docs_group('Data structures')
 class ActorRun(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
{apify-2.0.2b7 → apify-2.1.0}/src/apify/_platform_event_manager.py

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
 
 import websockets.client
@@ -19,6 +19,7 @@ from crawlee.events._types import (
     EventSystemInfoData,
 )
 
+from apify._utils import docs_group
 from apify.log import logger
 
 if TYPE_CHECKING:
@@ -30,11 +31,13 @@ if TYPE_CHECKING:
 __all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager']
 
 
+@docs_group('Data structures')
 class PersistStateEvent(BaseModel):
     name: Literal[Event.PERSIST_STATE]
     data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
 
 
+@docs_group('Data structures')
 class SystemInfoEventData(BaseModel):
     mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
     mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
@@ -61,26 +64,31 @@ class SystemInfoEventData(BaseModel):
         )
 
 
+@docs_group('Data structures')
 class SystemInfoEvent(BaseModel):
     name: Literal[Event.SYSTEM_INFO]
     data: SystemInfoEventData
 
 
+@docs_group('Data structures')
 class MigratingEvent(BaseModel):
     name: Literal[Event.MIGRATING]
     data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
 
 
+@docs_group('Data structures')
 class AbortingEvent(BaseModel):
     name: Literal[Event.ABORTING]
     data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
 
 
+@docs_group('Data structures')
 class ExitEvent(BaseModel):
     name: Literal[Event.EXIT]
     data: Annotated[EventExitData, Field(default_factory=EventExitData)]
 
 
+@docs_group('Data structures')
 class EventWithoutData(BaseModel):
     name: Literal[
         Event.SESSION_RETIRED,
@@ -93,11 +101,13 @@ class EventWithoutData(BaseModel):
     data: Any = None
 
 
+@docs_group('Data structures')
 class DeprecatedEvent(BaseModel):
     name: Literal['cpuInfo']
     data: Annotated[dict[str, Any], Field(default_factory=dict)]
 
 
+@docs_group('Data structures')
 class UnknownEvent(BaseModel):
     name: str
     data: Annotated[dict[str, Any], Field(default_factory=dict)]
@@ -125,6 +135,7 @@ event_data_adapter: TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent] =
 )
 
 
+@docs_group('Classes')
 class PlatformEventManager(EventManager):
     """A class for managing Actor events.
 
{apify-2.0.2b7 → apify-2.1.0}/src/apify/_proxy_configuration.py

@@ -16,6 +16,7 @@ from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
 from crawlee.proxy_configuration import _NewUrlFunction
 
 from apify._configuration import Configuration
+from apify._utils import docs_group
 from apify.log import logger
 
 if TYPE_CHECKING:
@@ -68,6 +69,7 @@ def _check(
         raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
 
 
+@docs_group('Classes')
 @dataclass
 class ProxyInfo(CrawleeProxyInfo):
     """Provides information about a proxy connection that is used for requests."""
@@ -87,6 +89,7 @@ class ProxyInfo(CrawleeProxyInfo):
     """
 
 
+@docs_group('Classes')
 class ProxyConfiguration(CrawleeProxyConfiguration):
     """Configures a connection to a proxy server with the provided options.
 
@@ -277,7 +280,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
             return
 
         status = None
-        async with httpx.AsyncClient(
+        async with httpx.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
            for _ in range(2):
                 try:
                     response = await client.get(proxy_status_url)
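The `proxy=` keyword in the updated call above is the single-proxy parameter available in the httpx 0.27 line that the SDK now pins (`httpx = "~0.27.0"`). A small standalone sketch of the same pattern; the proxy URL and target below are placeholders, not values from the SDK:

```python
# Standalone sketch mirroring the shape of the updated httpx call; URLs are placeholders.
import asyncio

import httpx


async def fetch_via_proxy(proxy_url: str, target_url: str) -> int:
    # One proxy for every request made by this client, with a 10-second timeout,
    # matching the call in _proxy_configuration.py.
    async with httpx.AsyncClient(proxy=proxy_url, timeout=10) as client:
        response = await client.get(target_url)
        return response.status_code


if __name__ == '__main__':
    status = asyncio.run(fetch_via_proxy('http://user:pass@proxy.example.com:8000', 'https://example.com'))
    print(status)
```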
apify-2.1.0/src/apify/_utils.py

@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import builtins
+import sys
+from importlib import metadata
+from typing import Callable, Literal
+
+
+def get_system_info() -> dict:
+    python_version = '.'.join([str(x) for x in sys.version_info[:3]])
+
+    system_info: dict[str, str | bool] = {
+        'apify_sdk_version': metadata.version('apify'),
+        'apify_client_version': metadata.version('apify-client'),
+        'crawlee_version': metadata.version('crawlee'),
+        'python_version': python_version,
+        'os': sys.platform,
+    }
+
+    if is_running_in_ipython():
+        system_info['is_running_in_ipython'] = True
+
+    return system_info
+
+
+def is_running_in_ipython() -> bool:
+    return getattr(builtins, '__IPYTHON__', False)
+
+
+GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Errors', 'Functions']
+
+
+def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001
+    """Decorator to mark symbols for rendering and grouping in documentation.
+
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
+    """
+
+    def wrapper(func: Callable) -> Callable:
+        return func
+
+    return wrapper
+
+
+def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
+    """Decorator for renaming symbols in documentation.
+
+    This changes the rendered name of the symbol only in the rendered web documentation.
+
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
+    """
+
+    def wrapper(func: Callable) -> Callable:
+        return func
+
+    return wrapper
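The two decorators are deliberately no-ops at runtime; they only tag symbols for the documentation pipeline. A small usage sketch, mirroring how `_actor.py` applies them in this release:

```python
# Usage sketch of the documentation markers added in apify._utils;
# both decorators return the decorated object unchanged.
from apify._utils import docs_group, docs_name


@docs_name('Actor')        # rendered name in the docs only
@docs_group('Classes')     # docs grouping only
class _ActorType:
    """Example stand-in for the real class in src/apify/_actor.py."""


# Runtime behavior is untouched: the class object comes back as-is.
assert _ActorType.__name__ == '_ActorType'
```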
{apify-2.0.2b7 → apify-2.1.0}/src/apify/apify_storage_client/_apify_storage_client.py

@@ -5,6 +5,7 @@ from crawlee._utils.crypto import crypto_random_object_id
 from crawlee.base_storage_client import BaseStorageClient
 
 from apify._configuration import Configuration
+from apify._utils import docs_group
 from apify.apify_storage_client._dataset_client import DatasetClient
 from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
 from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
@@ -13,6 +14,7 @@ from apify.apify_storage_client._request_queue_client import RequestQueueClient
 from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
 
 
+@docs_group('Classes')
 class ApifyStorageClient(BaseStorageClient):
     """A storage client implementation based on the Apify platform storage."""
 
{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/middlewares/apify_proxy.py

@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
-
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/pipelines/actor_dataset_push.py

@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter
 
 try:
-
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/requests.py

@@ -42,8 +42,10 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
     Returns:
         The converted Apify request if the conversion was successful, otherwise None.
     """
-    if not isinstance(
-        Actor.log.warning(
+    if not isinstance(scrapy_request, Request):
+        Actor.log.warning(  # type: ignore[unreachable]
+            'Failed to convert to Apify request: Scrapy request must be a Request instance.'
+        )
         return None
 
     call_id = crypto_random_object_id(8)
@@ -53,7 +55,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
     if _is_request_produced_by_middleware(scrapy_request):
         unique_key = compute_unique_key(
             url=scrapy_request.url,
-            method=scrapy_request.method,
+            method=scrapy_request.method,  # type: ignore[arg-type]  # str vs literal
             payload=scrapy_request.body,
             use_extended_unique_key=True,
         )
@@ -80,9 +82,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
 
     # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
-        apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
+        apify_request.headers = HttpHeaders(dict(scrapy_request.headers.to_unicode_dict()))
     else:
-        Actor.log.warning(
+        Actor.log.warning(  # type: ignore[unreachable]
             f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
         )
 
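The `dict(...)` wrapper added above converts Scrapy's case-insensitive header mapping into a plain dict before handing it to crawlee's `HttpHeaders` model. A rough sketch of the same conversion, assuming scrapy and crawlee are installed; the `HttpHeaders` import path is an assumption, since it is not shown in this hunk:

```python
# Rough sketch of the header conversion; the HttpHeaders import path is assumed.
from crawlee._types import HttpHeaders
from scrapy.http.headers import Headers

scrapy_headers = Headers({'Accept': 'text/html', 'User-Agent': 'example-agent'})

# to_unicode_dict() yields a case-insensitive mapping; dict() turns it into the plain
# mapping that the pydantic-based HttpHeaders model accepts.
apify_headers = HttpHeaders(dict(scrapy_headers.to_unicode_dict()))
print(apify_headers)
```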
{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/scheduler.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import traceback
+from typing import TYPE_CHECKING
 
 from apify._configuration import Configuration
 from apify.apify_storage_client import ApifyStorageClient
@@ -8,8 +9,10 @@ from apify.apify_storage_client import ApifyStorageClient
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
-    from scrapy.http.request import Request  # noqa: TCH002
     from scrapy.utils.reactor import is_asyncio_reactor_installed
+
+    if TYPE_CHECKING:
+        from scrapy.http.request import Request
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/utils.py

@@ -2,14 +2,17 @@ from __future__ import annotations
 
 import asyncio
 from base64 import b64encode
+from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
 from apify_shared.utils import ignore_docs
 
 try:
-    from scrapy.settings import Settings  # noqa: TCH002
     from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
+
+    if TYPE_CHECKING:
+        from scrapy.settings import Settings
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
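The scrapy modules above all switch from `# noqa: TCH002`-style suppressions to the standard `TYPE_CHECKING` guard: imports needed only for type annotations move into a block that is never executed at runtime. A generic, self-contained sketch of the pattern (not SDK code):

```python
# Generic sketch of the TYPE_CHECKING pattern; runs even without scrapy installed,
# because the guarded import never executes and the annotation stays a lazy string
# thanks to `from __future__ import annotations`.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers such as mypy.
    from scrapy.settings import Settings


def describe_settings(settings: Settings | None = None) -> str:
    return 'no settings' if settings is None else type(settings).__name__


print(describe_settings())
```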
apify-2.1.0/src/apify/storages/_request_list.py

@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import asyncio
+import re
+from asyncio import Task
+from functools import partial
+from typing import Annotated, Any, Union
+
+from pydantic import BaseModel, Field, TypeAdapter
+
+from crawlee import Request
+from crawlee._types import HttpMethod
+from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
+from crawlee.storages import RequestList as CrawleeRequestList
+
+from apify._utils import docs_group
+
+URL_NO_COMMAS_REGEX = re.compile(
+    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
+)
+
+
+class _RequestDetails(BaseModel):
+    method: HttpMethod = 'GET'
+    payload: str = ''
+    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
+    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}
+
+
+class _RequestsFromUrlInput(_RequestDetails):
+    requests_from_url: str = Field(alias='requestsFromUrl')
+
+
+class _SimpleUrlInput(_RequestDetails):
+    url: str
+
+
+url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+
+
+@docs_group('Classes')
+class RequestList(CrawleeRequestList):
+    """Extends crawlee RequestList.
+
+    Method open is used to create RequestList from actor's requestListSources input.
+    """
+
+    @staticmethod
+    async def open(
+        name: str | None = None,
+        request_list_sources_input: list[dict[str, Any]] | None = None,
+        http_client: BaseHttpClient | None = None,
+    ) -> RequestList:
+        """Creates RequestList from Actor input requestListSources.
+
+        Args:
+            name: Name of the returned RequestList.
+            request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
+            http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys.
+
+        Returns:
+            RequestList created from request_list_sources_input.
+
+        ### Usage
+
+        ```python
+        example_input = [
+            # Gather urls from response body.
+            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+            # Directly include this url.
+            {'url': 'https://crawlee.dev', 'method': 'GET'}
+        ]
+        request_list = await RequestList.open(request_list_sources_input=example_input)
+        ```
+        """
+        request_list_sources_input = request_list_sources_input or []
+        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+
+    @staticmethod
+    async def _create_request_list(
+        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
+    ) -> RequestList:
+        if not http_client:
+            http_client = HttpxHttpClient()
+
+        url_inputs = url_input_adapter.validate_python(request_list_sources_input)
+
+        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
+        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
+
+        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+
+        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+
+    @staticmethod
+    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
+        return [
+            Request.from_url(
+                method=request_input.method,
+                url=request_input.url,
+                payload=request_input.payload.encode('utf-8'),
+                headers=request_input.headers,
+                user_data=request_input.user_data,
+            )
+            for request_input in simple_url_inputs
+        ]
+
+    @staticmethod
+    async def _fetch_requests_from_url(
+        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
+    ) -> list[Request]:
+        """Crete list of requests from url.
+
+        Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting
+        callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from
+        collected links and additional inputs stored in other attributes of each remote_url_requests_inputs.
+        """
+        created_requests: list[Request] = []
+
+        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+            """Callback to scrape response body with regexp and create Requests from matches."""
+            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            created_requests.extend(
+                [
+                    Request.from_url(
+                        match.group(0),
+                        method=request_input.method,
+                        payload=request_input.payload.encode('utf-8'),
+                        headers=request_input.headers,
+                        user_data=request_input.user_data,
+                    )
+                    for match in matches
+                ]
+            )
+
+        remote_url_requests = []
+        for remote_url_requests_input in remote_url_requests_inputs:
+            get_response_task = asyncio.create_task(
+                http_client.send_request(
+                    method='GET',
+                    url=remote_url_requests_input.requests_from_url,
+                )
+            )
+
+            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            remote_url_requests.append(get_response_task)
+
+        await asyncio.gather(*remote_url_requests)
+        return created_requests
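A hedged end-to-end sketch of the new `RequestList` in an Actor run. It assumes the class is re-exported from `apify.storages` (that package's `__init__.py` gains five lines in this diff, not shown here), that the Actor input stores the sources under a `requestListSources` key as the class docstring describes, and that crawlee's `RequestList` base exposes `fetch_next_request()` / `mark_request_as_handled()`:

```python
# Hedged usage sketch; see the assumptions in the paragraph above.
import asyncio

from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        sources = actor_input.get('requestListSources') or [{'url': 'https://crawlee.dev'}]

        request_list = await RequestList.open(request_list_sources_input=sources)

        # Drain the list; fetch_next_request() returns None once it is exhausted.
        while request := await request_list.fetch_next_request():
            Actor.log.info(f'Would process {request.url}')
            await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```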
apify-2.0.2b7/src/apify/_utils.py

@@ -1,26 +0,0 @@
-from __future__ import annotations
-
-import builtins
-import sys
-from importlib import metadata
-
-
-def get_system_info() -> dict:
-    python_version = '.'.join([str(x) for x in sys.version_info[:3]])
-
-    system_info: dict[str, str | bool] = {
-        'apify_sdk_version': metadata.version('apify'),
-        'apify_client_version': metadata.version('apify-client'),
-        'crawlee_version': metadata.version('crawlee'),
-        'python_version': python_version,
-        'os': sys.platform,
-    }
-
-    if is_running_in_ipython():
-        system_info['is_running_in_ipython'] = True
-
-    return system_info
-
-
-def is_running_in_ipython() -> bool:
-    return getattr(builtins, '__IPYTHON__', False)
{apify-2.0.2b7 → apify-2.1.0}/src/apify/scrapy/__init__.py

@@ -3,9 +3,9 @@ from apify.scrapy.scheduler import ApifyScheduler
 from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id
 
 __all__ = [
-    'to_apify_request',
-    'to_scrapy_request',
     'ApifyScheduler',
     'get_basic_auth_header',
     'get_running_event_loop_id',
+    'to_apify_request',
+    'to_scrapy_request',
 ]