apify 2.1.0b1__py3-none-any.whl → 2.1.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

apify/_actor.py CHANGED
@@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast
 
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -22,7 +21,7 @@ from apify._crypto import decrypt_input_secrets, load_private_key
 from apify._models import ActorRun
 from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
 from apify._proxy_configuration import ProxyConfiguration
-from apify._utils import get_system_info, is_running_in_ipython
+from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
 from apify.apify_storage_client import ApifyStorageClient
 from apify.log import _configure_logging, logger
 from apify.storages import Dataset, KeyValueStore, RequestQueue
@@ -31,6 +30,8 @@ if TYPE_CHECKING:
     import logging
     from types import TracebackType
 
+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction
 
     from apify._models import Webhook
@@ -39,6 +40,8 @@ if TYPE_CHECKING:
 MainReturnType = TypeVar('MainReturnType')
 
 
+@docs_name('Actor')
+@docs_group('Classes')
 class _ActorType:
     """The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
 
apify/_configuration.py CHANGED
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
@@ -11,7 +10,10 @@ from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 from crawlee.configuration import Configuration as CrawleeConfiguration
 
+from apify._utils import docs_group
 
+
+@docs_group('Classes')
 class Configuration(CrawleeConfiguration):
     """A class for specifying the configuration of an Actor.
 
@@ -251,6 +253,7 @@ class Configuration(CrawleeConfiguration):
             ),
             description='Date when the Actor will time out',
         ),
+        BeforeValidator(lambda val: val if val != '' else None),  # We should accept empty environment variables as well
     ] = None
 
     standby_port: Annotated[
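
The `BeforeValidator` added to `timeout_at` runs before the `datetime` parsing, so an empty environment variable is treated as unset instead of failing validation. A minimal standalone sketch of the same pattern (the `ExampleConfig` model below is hypothetical, not part of the package):

```python
from datetime import datetime
from typing import Annotated, Optional

from pydantic import BaseModel, BeforeValidator


class ExampleConfig(BaseModel):
    # Map '' (an empty environment variable) to None before datetime parsing runs.
    timeout_at: Annotated[
        Optional[datetime],
        BeforeValidator(lambda val: val if val != '' else None),
    ] = None


print(ExampleConfig(timeout_at='').timeout_at)                     # None
print(ExampleConfig(timeout_at='2024-01-01T00:00:00').timeout_at)  # 2024-01-01 00:00:00
```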
apify/_models.py CHANGED
@@ -1,4 +1,3 @@
-# ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
 from __future__ import annotations
 
 from datetime import datetime, timedelta
@@ -10,7 +9,10 @@ from apify_shared.consts import ActorJobStatus, MetaOrigin, WebhookEventType
 from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
 
+from apify._utils import docs_group
 
+
+@docs_group('Data structures')
 class Webhook(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -29,12 +31,14 @@ class Webhook(BaseModel):
     ] = None
 
 
+@docs_group('Data structures')
 class ActorRunMeta(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
     origin: Annotated[MetaOrigin, Field()]
 
 
+@docs_group('Data structures')
 class ActorRunStats(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -55,6 +59,7 @@ class ActorRunStats(BaseModel):
     compute_units: Annotated[float, Field(alias='computeUnits')]
 
 
+@docs_group('Data structures')
 class ActorRunOptions(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -64,6 +69,7 @@ class ActorRunOptions(BaseModel):
     disk_mbytes: Annotated[int, Field(alias='diskMbytes')]
 
 
+@docs_group('Data structures')
 class ActorRunUsage(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
@@ -81,6 +87,7 @@ class ActorRunUsage(BaseModel):
     proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
 
 
+@docs_group('Data structures')
 class ActorRun(BaseModel):
     __model_config__ = ConfigDict(populate_by_name=True)
 
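These run models map the platform's camelCase API payloads onto snake_case attributes through field aliases. A standalone sketch of that pattern (the `ExampleRunStats` model below is hypothetical, not the SDK's own):

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class ExampleRunStats(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    # Parsed from the API's camelCase key, exposed as a snake_case attribute.
    compute_units: Annotated[float, Field(alias='computeUnits')]


print(ExampleRunStats.model_validate({'computeUnits': 1.5}).compute_units)  # 1.5
print(ExampleRunStats(compute_units=2.0).compute_units)                     # populate_by_name also accepts the field name
```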
apify/_platform_event_manager.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import asyncio
-from datetime import datetime  # noqa: TCH003
+from datetime import datetime
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union
 
 import websockets.client
@@ -19,6 +19,7 @@ from crawlee.events._types import (
     EventSystemInfoData,
 )
 
+from apify._utils import docs_group
 from apify.log import logger
 
 if TYPE_CHECKING:
@@ -30,11 +31,13 @@ if TYPE_CHECKING:
 __all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager']
 
 
+@docs_group('Data structures')
 class PersistStateEvent(BaseModel):
     name: Literal[Event.PERSIST_STATE]
     data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
 
 
+@docs_group('Data structures')
 class SystemInfoEventData(BaseModel):
     mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
     mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
@@ -49,7 +52,7 @@ class SystemInfoEventData(BaseModel):
         return EventSystemInfoData.model_validate(
             {
                 'cpu_info': {
-                    'used_ratio': self.cpu_current_usage,
+                    'used_ratio': self.cpu_current_usage / 100,
                     'created_at': self.created_at,
                 },
                 'memory_info': {
@@ -61,26 +64,31 @@ class SystemInfoEventData(BaseModel):
         )
 
 
+@docs_group('Data structures')
 class SystemInfoEvent(BaseModel):
     name: Literal[Event.SYSTEM_INFO]
     data: SystemInfoEventData
 
 
+@docs_group('Data structures')
 class MigratingEvent(BaseModel):
     name: Literal[Event.MIGRATING]
     data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
 
 
+@docs_group('Data structures')
 class AbortingEvent(BaseModel):
     name: Literal[Event.ABORTING]
     data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
 
 
+@docs_group('Data structures')
 class ExitEvent(BaseModel):
     name: Literal[Event.EXIT]
     data: Annotated[EventExitData, Field(default_factory=EventExitData)]
 
 
+@docs_group('Data structures')
 class EventWithoutData(BaseModel):
     name: Literal[
         Event.SESSION_RETIRED,
@@ -93,11 +101,13 @@ class EventWithoutData(BaseModel):
     data: Any = None
 
 
+@docs_group('Data structures')
 class DeprecatedEvent(BaseModel):
     name: Literal['cpuInfo']
     data: Annotated[dict[str, Any], Field(default_factory=dict)]
 
 
+@docs_group('Data structures')
 class UnknownEvent(BaseModel):
     name: str
     data: Annotated[dict[str, Any], Field(default_factory=dict)]
@@ -125,6 +135,7 @@ event_data_adapter: TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent] =
 )
 
 
+@docs_group('Classes')
 class PlatformEventManager(EventManager):
     """A class for managing Actor events.
 
apify/_proxy_configuration.py CHANGED
@@ -16,6 +16,7 @@ from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
 from crawlee.proxy_configuration import _NewUrlFunction
 
 from apify._configuration import Configuration
+from apify._utils import docs_group
 from apify.log import logger
 
 if TYPE_CHECKING:
@@ -68,6 +69,7 @@ def _check(
         raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
 
 
+@docs_group('Classes')
 @dataclass
 class ProxyInfo(CrawleeProxyInfo):
     """Provides information about a proxy connection that is used for requests."""
@@ -87,6 +89,7 @@ class ProxyInfo(CrawleeProxyInfo):
     """
 
 
+@docs_group('Classes')
 class ProxyConfiguration(CrawleeProxyConfiguration):
     """Configures a connection to a proxy server with the provided options.
 
apify/_utils.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import builtins
 import sys
 from importlib import metadata
+from typing import Callable, Literal
 
 
 def get_system_info() -> dict:
@@ -11,6 +12,7 @@ def get_system_info() -> dict:
     system_info: dict[str, str | bool] = {
         'apify_sdk_version': metadata.version('apify'),
         'apify_client_version': metadata.version('apify-client'),
+        'crawlee_version': metadata.version('crawlee'),
         'python_version': python_version,
         'os': sys.platform,
     }
@@ -23,3 +25,34 @@
 
 def is_running_in_ipython() -> bool:
     return getattr(builtins, '__IPYTHON__', False)
+
+
+GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Errors', 'Functions']
+
+
+def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001
+    """Decorator to mark symbols for rendering and grouping in documentation.
+
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
+    """
+
+    def wrapper(func: Callable) -> Callable:
+        return func
+
+    return wrapper
+
+
+def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
+    """Decorator for renaming symbols in documentation.
+
+    This changes the rendered name of the symbol only in the rendered web documentation.
+
+    This decorator is used purely for documentation purposes and does not alter the behavior
+    of the decorated callable.
+    """
+
+    def wrapper(func: Callable) -> Callable:
+        return func
+
+    return wrapper
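
Both decorators are intentionally no-ops at runtime; they only tag symbols for the documentation build. A minimal sketch of how they are applied, mirroring the `_ActorType` hunk earlier in this diff (`_ExampleType` is a hypothetical class used only for illustration):

```python
from apify._utils import docs_group, docs_name


@docs_name('Actor')      # rendered as "Actor" in the web documentation
@docs_group('Classes')   # listed under the "Classes" group
class _ExampleType:
    """Behaves exactly as if it were undecorated."""


# The decorators return the decorated object unchanged.
assert _ExampleType.__name__ == '_ExampleType'
```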
apify/apify_storage_client/_apify_storage_client.py CHANGED
@@ -5,6 +5,7 @@ from crawlee._utils.crypto import crypto_random_object_id
 from crawlee.base_storage_client import BaseStorageClient
 
 from apify._configuration import Configuration
+from apify._utils import docs_group
 from apify.apify_storage_client._dataset_client import DatasetClient
 from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
 from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
@@ -13,6 +14,7 @@ from apify.apify_storage_client._request_queue_client import RequestQueueClient
 from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
 
 
+@docs_group('Classes')
 class ApifyStorageClient(BaseStorageClient):
     """A storage client implementation based on the Apify platform storage."""
 
apify/apify_storage_client/_dataset_client.py CHANGED
@@ -91,7 +91,7 @@ class DatasetClient(BaseDatasetClient):
         skip_empty: bool = False,
         skip_hidden: bool = False,
     ) -> AsyncIterator[dict]:
-        return self._client.iterate_items(
+        async for item in self._client.iterate_items(
             offset=offset,
             limit=limit,
             clean=clean,
@@ -101,7 +101,8 @@ class DatasetClient(BaseDatasetClient):
             unwind=unwind,
             skip_empty=skip_empty,
             skip_hidden=skip_hidden,
-        )
+        ):
+            yield item
 
     @override
     async def get_items_as_bytes(
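
This change makes `iterate_items` a real async generator: Python has no `yield from` for async generators, so delegating to the underlying client's iterator needs an explicit `async for ... yield` loop. A standalone sketch of the pattern:

```python
import asyncio
from collections.abc import AsyncIterator


async def inner() -> AsyncIterator[int]:
    for i in range(3):
        yield i


async def outer() -> AsyncIterator[int]:
    # `return inner()` would make `outer` a plain coroutine that returns an
    # iterator; the loop below makes `outer` itself an async generator.
    async for item in inner():
        yield item


async def main() -> None:
    print([item async for item in outer()])  # [0, 1, 2]


asyncio.run(main())
```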
apify/log.py CHANGED
@@ -17,7 +17,7 @@ logger = logging.getLogger(logger_name)
 
 
 @ignore_docs
-class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
+class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from parent class)
     pass
 
 
apify/scrapy/middlewares/apify_proxy.py CHANGED
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
 from urllib.parse import ParseResult, urlparse
 
 try:
-    from scrapy import Request, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Request, Spider
+        from scrapy.crawler import Crawler
     from scrapy.core.downloader.handlers.http11 import TunnelError
-    from scrapy.crawler import Crawler  # noqa: TCH002
     from scrapy.exceptions import NotConfigured
 except ImportError as exc:
     raise ImportError(
@@ -27,7 +29,7 @@ class ApifyHttpProxyMiddleware:
         proxy_settings = {'useApifyProxy': true, 'apifyProxyGroups': []}
     """
 
-    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
+    def __init__(self, proxy_settings: dict) -> None:
         """Create a new instance.
 
         Args:
@@ -66,7 +68,7 @@ class ApifyHttpProxyMiddleware:
 
         return cls(proxy_settings)
 
-    async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
+    async def process_request(self, request: Request, spider: Spider) -> None:
         """Process a Scrapy request by assigning a new proxy.
 
         Args:
@@ -89,7 +91,7 @@ class ApifyHttpProxyMiddleware:
         Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')
 
     def process_exception(
-        self: ApifyHttpProxyMiddleware,
+        self,
         request: Request,
         exception: Exception,
         spider: Spider,
@@ -116,7 +118,7 @@ class ApifyHttpProxyMiddleware:
             'reason="{exception}", skipping...'
         )
 
-    async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
+    async def _get_new_proxy_url(self) -> ParseResult:
         """Get a new proxy URL.
 
         Raises:
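
The recurring change in these Scrapy modules replaces `# noqa: TCH002` suppressions with an explicit `if TYPE_CHECKING:` guard, so annotation-only imports are no longer executed at runtime. A standalone sketch of the pattern (the `describe` function is hypothetical):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers; never imported at runtime, so the
    # module loads even where the optional dependency is missing.
    from scrapy import Spider


def describe(spider: Spider) -> str:
    # With `from __future__ import annotations`, the annotation above stays a
    # string and does not require Spider to exist at import time.
    return f'spider={spider.name}'
```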
apify/scrapy/pipelines/actor_dataset_push.py CHANGED
@@ -1,9 +1,12 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from itemadapter.adapter import ItemAdapter
 
 try:
-    from scrapy import Item, Spider  # noqa: TCH002
+    if TYPE_CHECKING:
+        from scrapy import Item, Spider
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
@@ -19,7 +22,7 @@ class ActorDatasetPushPipeline:
     """
 
     async def process_item(
-        self: ActorDatasetPushPipeline,
+        self,
         item: Item,
         spider: Spider,
     ) -> Item:
apify/scrapy/scheduler.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import traceback
+from typing import TYPE_CHECKING
 
 from apify._configuration import Configuration
 from apify.apify_storage_client import ApifyStorageClient
@@ -8,8 +9,10 @@ from apify.apify_storage_client import ApifyStorageClient
 try:
     from scrapy import Spider
     from scrapy.core.scheduler import BaseScheduler
-    from scrapy.http.request import Request  # noqa: TCH002
     from scrapy.utils.reactor import is_asyncio_reactor_installed
+
+    if TYPE_CHECKING:
+        from scrapy.http.request import Request
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
@@ -29,7 +32,7 @@ class ApifyScheduler(BaseScheduler):
     This scheduler requires the asyncio Twisted reactor to be installed.
     """
 
-    def __init__(self: ApifyScheduler) -> None:
+    def __init__(self) -> None:
        """Create a new instance."""
        if not is_asyncio_reactor_installed():
            raise ValueError(
@@ -40,7 +43,7 @@ class ApifyScheduler(BaseScheduler):
        self._rq: RequestQueue | None = None
        self.spider: Spider | None = None
 
-    def open(self: ApifyScheduler, spider: Spider) -> None:  # this has to be named "open"
+    def open(self, spider: Spider) -> None:  # this has to be named "open"
        """Open the scheduler.
 
        Args:
@@ -58,7 +61,7 @@ class ApifyScheduler(BaseScheduler):
            traceback.print_exc()
            raise
 
-    def has_pending_requests(self: ApifyScheduler) -> bool:
+    def has_pending_requests(self) -> bool:
        """Check if the scheduler has any pending requests.
 
        Returns:
@@ -75,7 +78,7 @@ class ApifyScheduler(BaseScheduler):
 
        return not is_finished
 
-    def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
+    def enqueue_request(self, request: Request) -> bool:
        """Add a request to the scheduler.
 
        This could be called from either from a spider or a downloader middleware (e.g. redirect, retry, ...).
@@ -111,7 +114,7 @@ class ApifyScheduler(BaseScheduler):
        Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
        return bool(result.was_already_present)
 
-    def next_request(self: ApifyScheduler) -> Request | None:
+    def next_request(self) -> Request | None:
        """Fetch the next request from the scheduler.
 
        Returns:
apify/scrapy/utils.py CHANGED
@@ -2,14 +2,17 @@ from __future__ import annotations
 
 import asyncio
 from base64 import b64encode
+from typing import TYPE_CHECKING
 from urllib.parse import unquote
 
 from apify_shared.utils import ignore_docs
 
 try:
-    from scrapy.settings import Settings  # noqa: TCH002
     from scrapy.utils.project import get_project_settings
     from scrapy.utils.python import to_bytes
+
+    if TYPE_CHECKING:
+        from scrapy.settings import Settings
 except ImportError as exc:
     raise ImportError(
         'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run '
apify/storages/__init__.py CHANGED
@@ -1,3 +1,5 @@
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
-__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue']
+from ._request_list import RequestList
+
+__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue', 'RequestList']
apify/storages/_request_list.py ADDED
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+import asyncio
+import re
+from asyncio import Task
+from functools import partial
+from typing import Annotated, Any, Union
+
+from pydantic import BaseModel, Field, TypeAdapter
+
+from crawlee import Request
+from crawlee._types import HttpMethod
+from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
+from crawlee.storages import RequestList as CrawleeRequestList
+
+from apify._utils import docs_group
+
+URL_NO_COMMAS_REGEX = re.compile(
+    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
+)
+
+
+class _RequestDetails(BaseModel):
+    method: HttpMethod = 'GET'
+    payload: str = ''
+    headers: Annotated[dict[str, str], Field(default_factory=dict)] = {}
+    user_data: Annotated[dict[str, str], Field(default_factory=dict, alias='userData')] = {}
+
+
+class _RequestsFromUrlInput(_RequestDetails):
+    requests_from_url: str = Field(alias='requestsFromUrl')
+
+
+class _SimpleUrlInput(_RequestDetails):
+    url: str
+
+
+url_input_adapter = TypeAdapter(list[Union[_RequestsFromUrlInput, _SimpleUrlInput]])
+
+
+@docs_group('Classes')
+class RequestList(CrawleeRequestList):
+    """Extends crawlee RequestList.
+
+    Method open is used to create RequestList from actor's requestListSources input.
+    """
+
+    @staticmethod
+    async def open(
+        name: str | None = None,
+        request_list_sources_input: list[dict[str, Any]] | None = None,
+        http_client: BaseHttpClient | None = None,
+    ) -> RequestList:
+        """Creates RequestList from Actor input requestListSources.
+
+        Args:
+            name: Name of the returned RequestList.
+            request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
+            http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys.
+
+        Returns:
+            RequestList created from request_list_sources_input.
+
+        ### Usage
+
+        ```python
+        example_input = [
+            # Gather urls from response body.
+            {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
+            # Directly include this url.
+            {'url': 'https://crawlee.dev', 'method': 'GET'}
+        ]
+        request_list = await RequestList.open(request_list_sources_input=example_input)
+        ```
+        """
+        request_list_sources_input = request_list_sources_input or []
+        return await RequestList._create_request_list(name, request_list_sources_input, http_client)
+
+    @staticmethod
+    async def _create_request_list(
+        name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: BaseHttpClient | None
+    ) -> RequestList:
+        if not http_client:
+            http_client = HttpxHttpClient()
+
+        url_inputs = url_input_adapter.validate_python(request_list_sources_input)
+
+        simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
+        remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
+
+        simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs)
+        remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client)
+
+        return RequestList(name=name, requests=simple_url_requests + remote_url_requests)
+
+    @staticmethod
+    def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
+        return [
+            Request.from_url(
+                method=request_input.method,
+                url=request_input.url,
+                payload=request_input.payload.encode('utf-8'),
+                headers=request_input.headers,
+                user_data=request_input.user_data,
+            )
+            for request_input in simple_url_inputs
+        ]

+    @staticmethod
+    async def _fetch_requests_from_url(
+        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
+    ) -> list[Request]:
+        """Create list of requests from url.
+
+        Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting
+        callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from
+        collected links and additional inputs stored in other attributes of each remote_url_requests_inputs.
+        """
+        created_requests: list[Request] = []
+
+        def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
+            """Callback to scrape response body with regexp and create Requests from matches."""
+            matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+            created_requests.extend(
+                [
+                    Request.from_url(
+                        match.group(0),
+                        method=request_input.method,
+                        payload=request_input.payload.encode('utf-8'),
+                        headers=request_input.headers,
+                        user_data=request_input.user_data,
+                    )
+                    for match in matches
+                ]
+            )
+
+        remote_url_requests = []
+        for remote_url_requests_input in remote_url_requests_inputs:
+            get_response_task = asyncio.create_task(
+                http_client.send_request(
+                    method='GET',
+                    url=remote_url_requests_input.requests_from_url,
+                )
+            )
+
+            get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input))
+            remote_url_requests.append(get_response_task)
+
+        await asyncio.gather(*remote_url_requests)
+        return created_requests
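
`RequestList.open` mirrors the `requestListSources` Actor input: plain `url` entries become requests directly, while `requestsFromUrl` entries are fetched and scanned for links with `URL_NO_COMMAS_REGEX`. A hedged usage sketch inside an Actor run (the `requestListSources` input field and the `is_finished` / `fetch_next_request` / `mark_request_as_handled` calls assume crawlee's request-provider interface as of crawlee 0.3.x; treat them as assumptions, not guarantees):

```python
import asyncio

from apify import Actor
from apify.storages import RequestList


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # Assumption: the Actor input schema exposes a `requestListSources` array.
        request_list = await RequestList.open(
            request_list_sources_input=actor_input.get('requestListSources', []),
        )

        # Drain the list through the request-provider interface (assumed API).
        while not await request_list.is_finished():
            request = await request_list.fetch_next_request()
            if request is None:
                break
            Actor.log.info(f'Processing {request.url}')
            await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```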
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apify
-Version: 2.1.0b1
+Version: 2.1.0b2
 Summary: Apify SDK for Python
 License: Apache-2.0
 Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -21,13 +21,13 @@ Classifier: Topic :: Software Development :: Libraries
 Provides-Extra: scrapy
 Requires-Dist: apify-client (>=1.8.1)
 Requires-Dist: apify-shared (>=1.1.2)
-Requires-Dist: crawlee (>=0.3.8)
+Requires-Dist: crawlee (>=0.3.9)
 Requires-Dist: cryptography (>=42.0.0)
 Requires-Dist: httpx (>=0.27.0)
 Requires-Dist: lazy-object-proxy (>=1.10.0)
 Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
 Requires-Dist: typing-extensions (>=4.1.0)
-Requires-Dist: websockets (>=10.0)
+Requires-Dist: websockets (>=10.0,<14.0.0)
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
 Project-URL: Documentation, https://docs.apify.com/sdk/python/
@@ -1,37 +1,38 @@
 apify/__init__.py,sha256=ikoi2EpDYl6y-XSVtlU8UsdQdMEyOiIJCRRAaZFDOP8,550
-apify/_actor.py,sha256=oPgQ3rxxIEzVcZ9XtI3lf1a_6gwIMgxihNuYGjJpGww,41816
-apify/_configuration.py,sha256=fK-BmsdctMJJT93SsupG98_DmknPnfifmIp5pwehYlA,9508
+apify/_actor.py,sha256=AUviY4qrX4UoN7fSZtXXSHqEk4rrQwBymMLjkgb4Mzg,41887
+apify/_configuration.py,sha256=mzxBrz9eocZiGcCWuDV3YYyKGd_3-A4hMu63qRN8Ep4,9618
 apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
 apify/_crypto.py,sha256=e0_aM3l9_5Osk-jszYOOjrAKK60OggSHbiw5c30QnsU,5638
-apify/_models.py,sha256=oYlTEr-DyQAE-V2rrYD5PhUxTXVPdAig7QV-u6CJw3E,5571
-apify/_platform_event_manager.py,sha256=K4cHabbQ7_ex7vkX-c-VhAOp8Efw3HDn5Wp4lfA-qAU,7571
-apify/_proxy_configuration.py,sha256=2z4VV_NrnIp6pDpgQKlKpcHM2pPyXiOpFedpPWje48A,13087
-apify/_utils.py,sha256=x4lnR9RNulySiEQTft-GeQqUcJsRr0k8p0Sv9NTeWFg,638
+apify/_models.py,sha256=Btlz-23obKY5tJ75JnUwkVNC2lmU1IEBbdU3HvWaVhg,5748
+apify/_platform_event_manager.py,sha256=44xyV0Lpzf4h4VZ0rkyYg_nhbQkEONNor8_Z9gIKO40,7899
+apify/_proxy_configuration.py,sha256=-QaBrO5FmEy78Ylrry15VgBdu3Jpkz4oHAlCD5-9MBk,13169
+apify/_utils.py,sha256=CCLkpAsZKp00ykm88Z_Fbck5PNT0j6mJYOuD0RxzZUs,1620
 apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
-apify/apify_storage_client/_apify_storage_client.py,sha256=xi4OFchxhe-1-sykanH6Zcya4OcBhn2uf7OQ1pV4Ins,2338
-apify/apify_storage_client/_dataset_client.py,sha256=j9seF2OKvbSMD9R9XF9fpa1vtr_1w4JcRV--WCmvU4E,5501
+apify/apify_storage_client/_apify_storage_client.py,sha256=NsZBleJNHLBXVyG__bVjdCGEI30cnmVZngCbQaVekfk,2397
+apify/apify_storage_client/_dataset_client.py,sha256=FfXew6tBiZRkpovyPaQ__xhtZZ-rZvjijwBIIyRahH8,5536
 apify/apify_storage_client/_dataset_collection_client.py,sha256=fkYvYGQCigHD2CDzpWk0swNAkfvAinAhMGpYqllle3E,1445
 apify/apify_storage_client/_key_value_store_client.py,sha256=uyeQgb75sGFsqIS4sq4hEZ3QP81COLfS3tmTqHc0tso,3340
 apify/apify_storage_client/_key_value_store_collection_client.py,sha256=vCtMTI-jx89Qp5WHILDNkCthwLuv0MAwm1J_5E4aypU,1519
 apify/apify_storage_client/_request_queue_client.py,sha256=jAiFkaJ38_myHFGTw-Rk21wmpbN0UCR2w2SFoimFGFc,5826
 apify/apify_storage_client/_request_queue_collection_client.py,sha256=NnO73UJ9ZrjV8xoudo30wfaM-SojRkG0guhxDyB-K1g,1527
 apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify/log.py,sha256=zIVjrqQ1DNWNQQOAmdmR9oAbf4nJH7CSMB6u4OOUf6I,1448
+apify/log.py,sha256=zElFyEp2RJN0kiHEwJhcjSCAuHrba5zYiq4pK2xsL_o,1450
 apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/__init__.py,sha256=qDPV_zTRFaUqoFOyS5g4uBfz-UCkmWYJ82VXQ_3Cw6k,348
 apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
-apify/scrapy/middlewares/apify_proxy.py,sha256=_1WO7NKHxIcPf8mSNjsqANTEsx7ygMTuRQW9fbwKMO8,5837
+apify/scrapy/middlewares/apify_proxy.py,sha256=9_-hJqTwQ4yVMjvN9zkJ_GXJADzrrYu8QoHZ6IX6fDs,5764
 apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
-apify/scrapy/pipelines/actor_dataset_push.py,sha256=QERmmExQOGIKQ70-p-lCj5qyE-c-fnYplEqd4mgaB1Q,953
+apify/scrapy/pipelines/actor_dataset_push.py,sha256=otggoULfUdCqOPJLb9wMROZ9WylnlL-209930tMS2Rg,971
 apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 apify/scrapy/requests.py,sha256=F4VNaX2fGqybJKbhcRcz0_m6dXse5LzKll4gtMuTRko,7480
-apify/scrapy/scheduler.py,sha256=AAIKY5i1QxkC1mtmix6n3M2eQaOw-d1T56Noue9xToc,6013
-apify/scrapy/utils.py,sha256=0XdFxi1qlUa6gHXG96e1FU9gW0N5Rsu0sVZklFYfC2U,2884
-apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
+apify/scrapy/scheduler.py,sha256=03kZxejWWb-TofJ-vpSZuQ28rT-qNjhhpC-QeO2OzoU,5977
+apify/scrapy/utils.py,sha256=758DcHCSAgCTProY0QX74uJ1XrzVsQwvCmFanj2f_3Q,2928
+apify/storages/__init__.py,sha256=AE4ZJ-iAoESmQQh_RUU78fe5CxBUnu9wlEZleQc5SwA,177
+apify/storages/_request_list.py,sha256=4nrvSdMUF-kiwGVIPEfIOygLKgjUpO37Jl8Om-jRbIU,5858
 apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-apify-2.1.0b1.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-apify-2.1.0b1.dist-info/METADATA,sha256=wP9WixgCbxLoefOJF57CcIMtBCtJb70JQJBAPNCMSNE,8657
-apify-2.1.0b1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-apify-2.1.0b1.dist-info/RECORD,,
+apify-2.1.0b2.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+apify-2.1.0b2.dist-info/METADATA,sha256=YSc5d6kH4W49HvGyvTSalEFC6TYGqFJIwyL2oF5TV0o,8665
+apify-2.1.0b2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+apify-2.1.0b2.dist-info/RECORD,,