apify 2.0.0__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apify might be problematic.

apify/__init__.py CHANGED
@@ -1,24 +1,11 @@
  from importlib import metadata

- from apify_shared.consts import WebhookEventType
- from crawlee import Request
  from crawlee.events._types import Event

  from apify._actor import Actor
  from apify._configuration import Configuration
- from apify._models import Webhook
  from apify._proxy_configuration import ProxyConfiguration, ProxyInfo

  __version__ = metadata.version('apify')

- __all__ = [
-     'Actor',
-     'Event',
-     'Configuration',
-     'ProxyConfiguration',
-     'ProxyInfo',
-     'Request',
-     'Webhook',
-     'WebhookEventType',
-     '__version__',
- ]
+ __all__ = ['Actor', 'Event', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__']
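In plain terms, 2.0.0b1 narrows the public API: `Request`, `Webhook`, and `WebhookEventType` are no longer re-exported from `apify`. A minimal sketch of the import change this implies for user code (assuming `crawlee` and `apify_shared` remain direct dependencies, as the metadata below indicates); `Webhook` has no replacement import, since webhooks become plain dicts in the `_actor.py` changes below:

```python
# In 2.0.0 these were re-exported:
#     from apify import Request, Webhook, WebhookEventType
# In 2.0.0b1 the same names come from the underlying packages:
from crawlee import Request
from apify_shared.consts import WebhookEventType

# Request is still crawlee's request model; only the re-export was dropped.
request = Request.from_url('https://apify.com')
```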
apify/_actor.py CHANGED
@@ -11,7 +11,7 @@ from pydantic import AliasChoices
  from typing_extensions import Self

  from apify_client import ApifyClientAsync
- from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
+ from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType
  from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
  from crawlee import service_container
  from crawlee.events._types import Event, EventPersistStateData
@@ -19,12 +19,11 @@ from crawlee.events._types import Event, EventPersistStateData
  from apify._configuration import Configuration
  from apify._consts import EVENT_LISTENERS_TIMEOUT
  from apify._crypto import decrypt_input_secrets, load_private_key
- from apify._models import ActorRun
+ from apify._log import logger
  from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
  from apify._proxy_configuration import ProxyConfiguration
  from apify._utils import get_system_info, is_running_in_ipython
  from apify.apify_storage_client import ApifyStorageClient
- from apify.log import _configure_logging, logger
  from apify.storages import Dataset, KeyValueStore, RequestQueue

  if TYPE_CHECKING:
@@ -33,8 +32,6 @@ if TYPE_CHECKING:

      from crawlee.proxy_configuration import _NewUrlFunction

- from apify._models import Webhook
-

  MainReturnType = TypeVar('MainReturnType')

@@ -46,24 +43,16 @@ class _ActorType:
      _configuration: Configuration
      _is_exiting = False

-     def __init__(
-         self,
-         configuration: Configuration | None = None,
-         *,
-         configure_logging: bool = True,
-     ) -> None:
+     def __init__(self, config: Configuration | None = None) -> None:
          """Create an Actor instance.

          Note that you don't have to do this, all the functionality is accessible using the default instance
          (e.g. `Actor.open_dataset()`).

          Args:
-             configuration: The Actor configuration to be used. If not passed, a new Configuration instance will
-                 be created.
-             configure_logging: Should the default logging configuration be configured?
+             config: The Actor configuration to be used. If not passed, a new Configuration instance will be created.
          """
-         self._configuration = configuration or Configuration.get_global_configuration()
-         self._configure_logging = configure_logging
+         self._configuration = config or Configuration.get_global_configuration()
          self._apify_client = self.new_client()

          self._event_manager: EventManager
@@ -89,9 +78,6 @@ class _ActorType:
          When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
          executing the block code, the `Actor.fail` method is called.
          """
-         if self._configure_logging:
-             _configure_logging(self._configuration)
-
          await self.init()
          return self

@@ -122,20 +108,15 @@ class _ActorType:

          return super().__repr__()

-     def __call__(self, configuration: Configuration | None = None, *, configure_logging: bool = True) -> Self:
+     def __call__(self, config: Configuration) -> Self:
          """Make a new Actor instance with a non-default configuration."""
-         return self.__class__(configuration=configuration, configure_logging=configure_logging)
+         return self.__class__(config=config)

      @property
      def apify_client(self) -> ApifyClientAsync:
          """The ApifyClientAsync instance the Actor instance uses."""
          return self._apify_client

-     @property
-     def configuration(self) -> Configuration:
-         """The Configuration instance the Actor instance uses."""
-         return self._configuration
-
      @property
      def config(self) -> Configuration:
          """The Configuration instance the Actor instance uses."""
@@ -552,8 +533,8 @@ class _ActorType:
          memory_mbytes: int | None = None,
          timeout: timedelta | None = None,
          wait_for_finish: int | None = None,
-         webhooks: list[Webhook] | None = None,
-     ) -> ActorRun:
+         webhooks: list[dict] | None = None,
+     ) -> dict:
          """Run an Actor on the Apify platform.

          Unlike `Actor.call`, this method just starts the run without waiting for finish.
@@ -574,6 +555,10 @@ class _ActorType:
              webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with
                  the Actor run which can be used to receive a notification, e.g. when the Actor finished or failed.
                  If you already have a webhook set up for the Actor or task, you do not have to add it again here.
+                 Each webhook is represented by a dictionary containing these items:
+                 * `event_types`: list of `WebhookEventType` values which trigger the webhook
+                 * `request_url`: URL to which to send the webhook HTTP request
+                 * `payload_template` (optional): Optional template for the request payload

          Returns:
              Info about the started Actor run
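A short sketch of a call matching this docstring, with webhooks passed as plain dicts (in 2.0.0 these were `Webhook` model instances); the Actor ID and URL are placeholders:

```python
from apify import Actor
from apify_shared.consts import WebhookEventType

async def main() -> None:
    async with Actor:
        run_info = await Actor.start(
            actor_id='my-actor-id',  # placeholder Actor ID
            run_input={'url': 'https://apify.com'},
            webhooks=[{
                'event_types': [WebhookEventType.ACTOR_RUN_SUCCEEDED],
                'request_url': 'https://example.com/webhook-endpoint',  # placeholder URL
            }],
        )
        # The run info is now a raw dict rather than an ActorRun model.
        Actor.log.info(f'Started run {run_info["id"]}')
```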
@@ -582,25 +567,16 @@ class _ActorType:

          client = self.new_client(token=token) if token else self._apify_client

-         if webhooks:
-             serialized_webhooks = [
-                 hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
-             ]
-         else:
-             serialized_webhooks = None
-
-         api_result = await client.actor(actor_id).start(
+         return await client.actor(actor_id).start(
              run_input=run_input,
              content_type=content_type,
              build=build,
              memory_mbytes=memory_mbytes,
              timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
              wait_for_finish=wait_for_finish,
-             webhooks=serialized_webhooks,
+             webhooks=webhooks,
          )

-         return ActorRun.model_validate(api_result)
-
      async def abort(
          self,
          run_id: str,
@@ -608,7 +584,7 @@ class _ActorType:
          token: str | None = None,
          status_message: str | None = None,
          gracefully: bool | None = None,
-     ) -> ActorRun:
+     ) -> dict:
          """Abort given Actor run on the Apify platform using the current user account.

          The user account is determined by the `APIFY_TOKEN` environment variable.
@@ -631,9 +607,7 @@ class _ActorType:
          if status_message:
              await client.run(run_id).update(status_message=status_message)

-         api_result = await client.run(run_id).abort(gracefully=gracefully)
-
-         return ActorRun.model_validate(api_result)
+         return await client.run(run_id).abort(gracefully=gracefully)

      async def call(
          self,
@@ -645,9 +619,9 @@ class _ActorType:
          build: str | None = None,
          memory_mbytes: int | None = None,
          timeout: timedelta | None = None,
-         webhooks: list[Webhook] | None = None,
+         webhooks: list[dict] | None = None,
          wait: timedelta | None = None,
-     ) -> ActorRun | None:
+     ) -> dict | None:
          """Start an Actor on the Apify Platform and wait for it to finish before returning.

          It waits indefinitely, unless the wait argument is provided.
@@ -676,25 +650,16 @@ class _ActorType:

          client = self.new_client(token=token) if token else self._apify_client

-         if webhooks:
-             serialized_webhooks = [
-                 hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
-             ]
-         else:
-             serialized_webhooks = None
-
-         api_result = await client.actor(actor_id).call(
+         return await client.actor(actor_id).call(
              run_input=run_input,
              content_type=content_type,
              build=build,
              memory_mbytes=memory_mbytes,
              timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
-             webhooks=serialized_webhooks,
+             webhooks=webhooks,
              wait_secs=int(wait.total_seconds()) if wait is not None else None,
          )

-         return ActorRun.model_validate(api_result)
-
      async def call_task(
          self,
          task_id: str,
@@ -703,10 +668,10 @@ class _ActorType:
          build: str | None = None,
          memory_mbytes: int | None = None,
          timeout: timedelta | None = None,
-         webhooks: list[Webhook] | None = None,
+         webhooks: list[dict] | None = None,
          wait: timedelta | None = None,
          token: str | None = None,
-     ) -> ActorRun | None:
+     ) -> dict | None:
          """Start an Actor task on the Apify Platform and wait for it to finish before returning.

          It waits indefinitely, unless the wait argument is provided.
@@ -738,24 +703,15 @@ class _ActorType:

          client = self.new_client(token=token) if token else self._apify_client

-         if webhooks:
-             serialized_webhooks = [
-                 hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
-             ]
-         else:
-             serialized_webhooks = None
-
-         api_result = await client.task(task_id).call(
+         return await client.task(task_id).call(
              task_input=task_input,
              build=build,
              memory_mbytes=memory_mbytes,
              timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
-             webhooks=serialized_webhooks,
+             webhooks=webhooks,
              wait_secs=int(wait.total_seconds()) if wait is not None else None,
          )

-         return ActorRun.model_validate(api_result)
-
      async def metamorph(
          self,
          target_actor_id: str,
@@ -840,12 +796,14 @@ class _ActorType:

      async def add_webhook(
          self,
-         webhook: Webhook,
          *,
+         event_types: list[WebhookEventType],
+         request_url: str,
+         payload_template: str | None = None,
          ignore_ssl_errors: bool | None = None,
          do_not_retry: bool | None = None,
          idempotency_key: str | None = None,
-     ) -> None:
+     ) -> dict | None:
          """Create an ad-hoc webhook for the current Actor run.

          This webhook lets you receive a notification when the Actor run finished or failed.
@@ -856,7 +814,9 @@ class _ActorType:
          For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks).

          Args:
-             webhook: The webhook to be added
+             event_types: List of event types that should trigger the webhook. At least one is required.
+             request_url: URL that will be invoked once the webhook is triggered.
+             payload_template: Specification of the payload that will be sent to request_url
              ignore_ssl_errors: Whether the webhook should ignore SSL errors returned by request_url
              do_not_retry: Whether the webhook should retry sending the payload to request_url upon failure.
              idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create
@@ -869,17 +829,17 @@ class _ActorType:

          if not self.is_at_home():
              self.log.error('Actor.add_webhook() is only supported when running on the Apify platform.')
-             return
+             return None

          # If is_at_home() is True, config.actor_run_id is always set
          if not self._configuration.actor_run_id:
              raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')

-         await self._apify_client.webhooks().create(
+         return await self._apify_client.webhooks().create(
              actor_run_id=self._configuration.actor_run_id,
-             event_types=webhook.event_types,
-             request_url=webhook.request_url,
-             payload_template=webhook.payload_template,
+             event_types=event_types,
+             request_url=request_url,
+             payload_template=payload_template,
              ignore_ssl_errors=ignore_ssl_errors,
              do_not_retry=do_not_retry,
              idempotency_key=idempotency_key,
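Based on the new keyword-only signature above, a usage sketch; the URL is a placeholder, and per the implementation the call only has an effect when running on the Apify platform:

```python
from apify import Actor
from apify_shared.consts import WebhookEventType

async def main() -> None:
    async with Actor:
        # 2.0.0 took a single Webhook model; 2.0.0b1 takes the fields directly.
        await Actor.add_webhook(
            event_types=[WebhookEventType.ACTOR_RUN_SUCCEEDED, WebhookEventType.ACTOR_RUN_FAILED],
            request_url='https://example.com/run-finished',  # placeholder URL
        )
```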
@@ -890,7 +850,7 @@ class _ActorType:
          status_message: str,
          *,
          is_terminal: bool | None = None,
-     ) -> ActorRun | None:
+     ) -> dict | None:
          """Set the status message for the current Actor run.

          Args:
@@ -911,12 +871,10 @@ class _ActorType:
          if not self._configuration.actor_run_id:
              raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')

-         api_result = await self._apify_client.run(self._configuration.actor_run_id).update(
+         return await self._apify_client.run(self._configuration.actor_run_id).update(
              status_message=status_message, is_status_message_terminal=is_terminal
          )

-         return ActorRun.model_validate(api_result)
-
      async def create_proxy_configuration(
          self,
          *,
apify/_log.py ADDED
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ import logging
+
+ from crawlee._log_config import CrawleeLogFormatter
+
+ # Name of the logger used throughout the library (resolves to 'apify')
+ logger_name = __name__.split('.')[0]
+
+ # Logger used throughout the library
+ logger = logging.getLogger(logger_name)
+
+
+ class ActorLogFormatter(CrawleeLogFormatter):  # Inherited from parent class
+     pass
apify/_platform_event_manager.py CHANGED
@@ -20,7 +20,7 @@ from crawlee.events._types import (
      EventSystemInfoData,
  )

- from apify.log import logger
+ from apify._log import logger

  if TYPE_CHECKING:
      from types import TracebackType
@@ -94,11 +94,6 @@ class EventWithoutData(BaseModel):
      data: Any = None


- class DeprecatedEvent(BaseModel):
-     name: Literal['cpuInfo']
-     data: Annotated[dict[str, Any], Field(default_factory=dict)]
-
-
  class UnknownEvent(BaseModel):
      name: str
      data: Annotated[dict[str, Any], Field(default_factory=dict)]
@@ -114,13 +109,12 @@ EventMessage = Union[
  ]


- event_data_adapter: TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent] = TypeAdapter(
+ event_data_adapter: TypeAdapter[EventMessage | UnknownEvent] = TypeAdapter(
      Union[
          Annotated[
              EventMessage,
              Discriminator('name'),
          ],
-         DeprecatedEvent,
          UnknownEvent,
      ]
  )
@@ -195,9 +189,6 @@ class PlatformEventManager(EventManager):
              try:
                  parsed_message = event_data_adapter.validate_json(message)

-                 if isinstance(parsed_message, DeprecatedEvent):
-                     continue
-
                  if isinstance(parsed_message, UnknownEvent):
                      logger.info(
                          f'Unknown message received: event_name={parsed_message.name}, '
apify/_proxy_configuration.py CHANGED
@@ -16,7 +16,7 @@ from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
  from crawlee.proxy_configuration import _NewUrlFunction

  from apify._configuration import Configuration
- from apify.log import logger
+ from apify._log import logger

  if TYPE_CHECKING:
      from apify_client import ApifyClientAsync
apify/apify_storage_client/_dataset_client.py CHANGED
@@ -60,20 +60,18 @@ class DatasetClient(BaseDatasetClient):
          view: str | None = None,
      ) -> DatasetItemsListPage:
          return DatasetItemsListPage.model_validate(
-             vars(
-                 await self._client.list_items(
-                     offset=offset,
-                     limit=limit,
-                     clean=clean,
-                     desc=desc,
-                     fields=fields,
-                     omit=omit,
-                     unwind=unwind,
-                     skip_empty=skip_empty,
-                     skip_hidden=skip_hidden,
-                     flatten=flatten,
-                     view=view,
-                 )
+             await self._client.list_items(
+                 offset=offset,
+                 limit=limit,
+                 clean=clean,
+                 desc=desc,
+                 fields=fields,
+                 omit=omit,
+                 unwind=unwind,
+                 skip_empty=skip_empty,
+                 skip_hidden=skip_hidden,
+                 flatten=flatten,
+                 view=view,
              )
          )

apify/apify_storage_client/_request_queue_client.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations

  from typing import TYPE_CHECKING

- from more_itertools import chunked
  from typing_extensions import override

  from crawlee import Request
@@ -158,11 +157,8 @@ class RequestQueueClient(BaseRequestQueueClient):
          *,
          forefront: bool = False,
      ) -> BatchRequestsOperationResponse:
-         processed = []
-         unprocessed = []
-
-         for chunk in chunked(requests, 25):  # The API endpoint won't accept more than 25 requests at once
-             response = await self._client.batch_add_requests(
+         return BatchRequestsOperationResponse.model_validate(
+             await self._client.batch_add_requests(
                  requests=[
                      r.model_dump(
                          by_alias=True,
@@ -174,18 +170,10 @@ class RequestQueueClient(BaseRequestQueueClient):
                              'data',
                          },
                      )
-                     for r in chunk
+                     for r in requests
                  ],
                  forefront=forefront,
              )
-             processed.extend(response['processedRequests'])
-             unprocessed.extend(response['unprocessedRequests'])
-
-         return BatchRequestsOperationResponse.model_validate(
-             {
-                 'processedRequests': processed,
-                 'unprocessedRequests': unprocessed,
-             }
          )

      @override
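The removed loop means `batch_add_requests` now forwards everything in a single API call. If a caller needed the old behaviour back, a sketch of client-side batching is below; `batch_add_in_chunks` is a hypothetical helper mirroring the deleted loop, and the 25-request cap comes from the comment in the removed code:

```python
from collections.abc import Iterable

from more_itertools import chunked

from crawlee import Request

async def batch_add_in_chunks(queue_client, requests: Iterable[Request], *, forefront: bool = False) -> None:
    # Re-introduce the 2.0.0 chunking around any client exposing the same coroutine.
    for chunk in chunked(requests, 25):
        await queue_client.batch_add_requests(chunk, forefront=forefront)
```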
apify/scrapy/middlewares/apify_proxy.py CHANGED
@@ -93,7 +93,7 @@ class ApifyHttpProxyMiddleware:
          request: Request,
          exception: Exception,
          spider: Spider,
-     ) -> None:
+     ) -> None | Request:
          """Process an exception that occurs during request processing.

          Args:
@@ -102,9 +102,8 @@ class ApifyHttpProxyMiddleware:
              spider: Scrapy Spider object.

          Returns:
-             Returning None, meaning Scrapy will continue processing this exception, executing any other
-             process_exception() methods of installed middleware, until no middleware is left and the default
-             exception handling kicks in.
+             If a TunnelError occurs, return the request object to halt its processing in the middleware pipeline.
+             Return None otherwise to allow the continuation of request processing.
          """
          Actor.log.debug(
              f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
@@ -115,6 +114,9 @@ class ApifyHttpProxyMiddleware:
                  f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", '
                  'reason="{exception}", skipping...'
              )
+             return request
+
+         return None

      async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
          """Get a new proxy URL.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: apify
- Version: 2.0.0
+ Version: 2.0.0b1
  Summary: Apify SDK for Python
  License: Apache-2.0
  Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -18,15 +18,21 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Software Development :: Libraries
  Provides-Extra: scrapy
- Requires-Dist: apify-client (>=1.7.1)
- Requires-Dist: apify-shared (>=1.1.2)
- Requires-Dist: crawlee (>=0.3.5)
- Requires-Dist: cryptography (>=42.0.0)
- Requires-Dist: httpx (>=0.27.0)
- Requires-Dist: lazy-object-proxy (>=1.10.0)
- Requires-Dist: scrapy (>=2.11.0) ; extra == "scrapy"
- Requires-Dist: typing-extensions (>=4.1.0)
- Requires-Dist: websockets (>=10.0)
+ Requires-Dist: aiofiles (>=22.1.0,<23.0.0)
+ Requires-Dist: aioshutil (>=1.0,<2.0)
+ Requires-Dist: apify-client (>=1.7.1,<2.0.0)
+ Requires-Dist: apify-shared (>=1.1.2,<2.0.0)
+ Requires-Dist: colorama (>=0.4.6,<0.5.0)
+ Requires-Dist: crawlee (>=0.3.0,<0.4.0)
+ Requires-Dist: cryptography (>=39.0.0,<40.0.0)
+ Requires-Dist: httpx (>=0.27.0,<0.28.0)
+ Requires-Dist: lazy-object-proxy (>=1.10.0,<2.0.0)
+ Requires-Dist: psutil (>=6.0.0,<7.0.0)
+ Requires-Dist: pyee (>=11.0.0,<12.0.0)
+ Requires-Dist: scrapy (>=2.11.0,<3.0.0) ; extra == "scrapy"
+ Requires-Dist: sortedcollections (>=2.0.0,<3.0.0)
+ Requires-Dist: typing-extensions (>=4.1.0,<5.0.0)
+ Requires-Dist: websockets (>=10.1,<11.0)
  Project-URL: Apify Homepage, https://apify.com
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
  Project-URL: Documentation, https://docs.apify.com/sdk/python/
@@ -64,108 +70,27 @@ pip install apify[scrapy]

  For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).

- ## Examples
-
- Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
-
- ### Apify SDK with HTTPX and BeautifulSoup
-
- This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
+ ## Example

  ```python
  from apify import Actor
  from bs4 import BeautifulSoup
  from httpx import AsyncClient

-
- async def main() -> None:
-     async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
-
-         # Open the default request queue for handling URLs to be processed.
-         request_queue = await Actor.open_request_queue()
-
-         # Enqueue the start URLs.
-         for start_url in start_urls:
-             url = start_url.get('url')
-             await request_queue.add_request(url)
-
-         # Process the URLs from the request queue.
-         while request := await request_queue.fetch_next_request():
-             Actor.log.info(f'Scraping {request.url} ...')
-
-             # Fetch the HTTP response from the specified URL using HTTPX.
-             async with AsyncClient() as client:
-                 response = await client.get(request.url)
-
-             # Parse the HTML content using Beautiful Soup.
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Extract the desired data.
-             data = {
-                 'url': actor_input['url'],
-                 'title': soup.title.string,
-                 'h1s': [h1.text for h1 in soup.find_all('h1')],
-                 'h2s': [h2.text for h2 in soup.find_all('h2')],
-                 'h3s': [h3.text for h3 in soup.find_all('h3')],
-             }
-
-             # Store the extracted data to the default dataset.
-             await Actor.push_data(data)
- ```
-
- ### Apify SDK with PlaywrightCrawler from Crawlee
-
- This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
-
- ```python
- from apify import Actor, Request
- from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
-
-
  async def main() -> None:
      async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
-
-         # Exit if no start URLs are provided.
-         if not start_urls:
-             Actor.log.info('No start URLs specified in Actor input, exiting...')
-             await Actor.exit()
-
-         # Create a crawler.
-         crawler = PlaywrightCrawler(
-             # Limit the crawl to max requests. Remove or increase it for crawling all links.
-             max_requests_per_crawl=50,
-             headless=True,
-         )
-
-         # Define a request handler, which will be called for every request.
-         @crawler.router.default_handler
-         async def request_handler(context: PlaywrightCrawlingContext) -> None:
-             url = context.request.url
-             Actor.log.info(f'Scraping {url}...')
-
-             # Extract the desired data.
-             data = {
-                 'url': context.request.url,
-                 'title': await context.page.title(),
-                 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
-             }
-
-             # Store the extracted data to the default dataset.
-             await context.push_data(data)
-
-             # Enqueue additional links found on the current page.
-             await context.enqueue_links()
-
-         # Run the crawler with the starting URLs.
-         await crawler.run(start_urls)
+         # Read the input parameters from the Actor input
+         actor_input = await Actor.get_input()
+         # Fetch the HTTP response from the specified URL
+         async with AsyncClient() as client:
+             response = await client.get(actor_input['url'])
+         # Process the HTML content
+         soup = BeautifulSoup(response.content, 'html.parser')
+         # Push the extracted data
+         await Actor.push_data({
+             'url': actor_input['url'],
+             'title': soup.title.string,
+         })
  ```

  ## What are Actors?
apify-2.0.0.dist-info/RECORD → apify-2.0.0b1.dist-info/RECORD RENAMED
@@ -1,37 +1,31 @@
- apify/__init__.py,sha256=ikoi2EpDYl6y-XSVtlU8UsdQdMEyOiIJCRRAaZFDOP8,550
- apify/_actor.py,sha256=oPgQ3rxxIEzVcZ9XtI3lf1a_6gwIMgxihNuYGjJpGww,41816
+ apify/__init__.py,sha256=sZis2RN6B5wlkpF-fS4ludmwAl9KOiFbKGLsRQYx2AQ,358
+ apify/_actor.py,sha256=9i-n-JidtpnD6PS5w-clhbSl-OdfAoDKDM9f2Z9Nv4M,40749
  apify/_configuration.py,sha256=gf7YOun32Whc9DamhoWDLmcUeNwtWVmmBPrl4oq6s4I,8997
  apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
  apify/_crypto.py,sha256=b4Czs1NLPkaNzkPjovObjSIbsKnRrgtBkM9JvOysUMA,5612
- apify/_models.py,sha256=oYlTEr-DyQAE-V2rrYD5PhUxTXVPdAig7QV-u6CJw3E,5571
- apify/_platform_event_manager.py,sha256=h5fBmXtKD4t-yCdOSiLM1-DnCrIbGEmYmz2mOU3A8bA,7627
- apify/_proxy_configuration.py,sha256=VdKh_AyCwaCUlpCyaCe30L2S9OZ-vL1SN1g8oLwSeYA,13074
+ apify/_log.py,sha256=xZI-OambbbdUh8TZAG7f6vgHONIb5S9d2eLD9ECF8YQ,379
+ apify/_platform_event_manager.py,sha256=r15FyPj1Mi9IqC81fjp0fmoRks5Zz5i2-E4X5fWmeIo,7345
+ apify/_proxy_configuration.py,sha256=5iCzdyd2juOS3cRAPrso65Ds_mcQidpAjQCkzb9Rr6Q,13075
  apify/_utils.py,sha256=x4lnR9RNulySiEQTft-GeQqUcJsRr0k8p0Sv9NTeWFg,638
  apify/apify_storage_client/__init__.py,sha256=-UbR68bFsDR6ln8OFs4t50eqcnY36hujO-SeOt-KmcA,114
  apify/apify_storage_client/_apify_storage_client.py,sha256=xi4OFchxhe-1-sykanH6Zcya4OcBhn2uf7OQ1pV4Ins,2338
- apify/apify_storage_client/_dataset_client.py,sha256=j9seF2OKvbSMD9R9XF9fpa1vtr_1w4JcRV--WCmvU4E,5501
+ apify/apify_storage_client/_dataset_client.py,sha256=VsTJ2bzuvC23tpuyb5ijPVxEjDKWD_zbqslfdqcKRLc,5417
  apify/apify_storage_client/_dataset_collection_client.py,sha256=fkYvYGQCigHD2CDzpWk0swNAkfvAinAhMGpYqllle3E,1445
  apify/apify_storage_client/_key_value_store_client.py,sha256=uyeQgb75sGFsqIS4sq4hEZ3QP81COLfS3tmTqHc0tso,3340
  apify/apify_storage_client/_key_value_store_collection_client.py,sha256=vCtMTI-jx89Qp5WHILDNkCthwLuv0MAwm1J_5E4aypU,1519
- apify/apify_storage_client/_request_queue_client.py,sha256=P8ws8jEzi2PWpp-cvYfV7kwuKbgH813BpNQ_wMSVtTA,6278
+ apify/apify_storage_client/_request_queue_client.py,sha256=jAiFkaJ38_myHFGTw-Rk21wmpbN0UCR2w2SFoimFGFc,5826
  apify/apify_storage_client/_request_queue_collection_client.py,sha256=NnO73UJ9ZrjV8xoudo30wfaM-SojRkG0guhxDyB-K1g,1527
- apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify/log.py,sha256=pX6ppIvds8OKqjFpIcshqG4zp_5DiOUU31ksyfSExto,1392
  apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  apify/scrapy/__init__.py,sha256=qDPV_zTRFaUqoFOyS5g4uBfz-UCkmWYJ82VXQ_3Cw6k,348
  apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
- apify/scrapy/middlewares/apify_proxy.py,sha256=_1WO7NKHxIcPf8mSNjsqANTEsx7ygMTuRQW9fbwKMO8,5837
- apify/scrapy/middlewares/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ apify/scrapy/middlewares/apify_proxy.py,sha256=6iUmbg40vX5bGoBr88hi3VdaAUUeJT9qbPHCfJjC5Kw,5838
  apify/scrapy/pipelines/__init__.py,sha256=GWPeLN_Zwj8vRBWtXW6DaxdB7mvyQ7Jw5Tz1ccgWlZI,119
  apify/scrapy/pipelines/actor_dataset_push.py,sha256=QERmmExQOGIKQ70-p-lCj5qyE-c-fnYplEqd4mgaB1Q,953
- apify/scrapy/pipelines/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify/scrapy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  apify/scrapy/requests.py,sha256=pmm2M-cwrTXyI3t1nRBo9pS6nHfc4zkzS25-NXxzd9I,7637
  apify/scrapy/scheduler.py,sha256=AAIKY5i1QxkC1mtmix6n3M2eQaOw-d1T56Noue9xToc,6013
  apify/scrapy/utils.py,sha256=tz_Y8CTqe6KbyMMhLF3m7qqR46jtNH5U7Ty7e19roPU,2814
  apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
- apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify-2.0.0.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
- apify-2.0.0.dist-info/METADATA,sha256=DhojQDiiwKEwS7VcAufA7ERVHYHKk5mqHFtddWXL4Qk,8604
- apify-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- apify-2.0.0.dist-info/RECORD,,
+ apify-2.0.0b1.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+ apify-2.0.0b1.dist-info/METADATA,sha256=ik9dSs0US8_MxCXDUZEA0Dn85fhQQ0uDPuOqvTCCdAA,5545
+ apify-2.0.0b1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ apify-2.0.0b1.dist-info/RECORD,,
apify/_models.py DELETED
@@ -1,110 +0,0 @@
- # ruff: noqa: TCH001 TCH002 TCH003 (Pydantic)
- from __future__ import annotations
-
- from datetime import datetime, timedelta
- from typing import Annotated
-
- from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
-
- from apify_shared.consts import ActorJobStatus, MetaOrigin, WebhookEventType
- from crawlee._utils.models import timedelta_ms
- from crawlee._utils.urls import validate_http_url
-
-
- class Webhook(BaseModel):
-     __model_config__ = ConfigDict(populate_by_name=True)
-
-     event_types: Annotated[
-         list[WebhookEventType],
-         Field(description='Event types that should trigger the webhook'),
-     ]
-     request_url: Annotated[
-         str,
-         Field(description='URL that the webhook should call'),
-         BeforeValidator(validate_http_url),
-     ]
-     payload_template: Annotated[
-         str | None,
-         Field(description='Template for the payload sent by the webook'),
-     ] = None
-
-
- class ActorRunMeta(BaseModel):
-     __model_config__ = ConfigDict(populate_by_name=True)
-
-     origin: Annotated[MetaOrigin, Field()]
-
-
- class ActorRunStats(BaseModel):
-     __model_config__ = ConfigDict(populate_by_name=True)
-
-     input_body_len: Annotated[int, Field(alias='inputBodyLen')]
-     restart_count: Annotated[int, Field(alias='restartCount')]
-     resurrect_count: Annotated[int, Field(alias='resurrectCount')]
-     mem_avg_bytes: Annotated[float | None, Field(alias='memAvgBytes')] = None
-     mem_max_bytes: Annotated[int | None, Field(alias='memMaxBytes')] = None
-     mem_current_bytes: Annotated[int | None, Field(alias='memCurrentBytes')] = None
-     cpu_avg_usage: Annotated[float | None, Field(alias='cpuAvgUsage')] = None
-     cpu_max_usage: Annotated[float | None, Field(alias='cpuMaxUsage')] = None
-     cpu_current_usage: Annotated[float | None, Field(alias='cpuCurrentUsage')] = None
-     net_rx_bytes: Annotated[int | None, Field(alias='netRxBytes')] = None
-     net_tx_bytes: Annotated[int | None, Field(alias='netTxBytes')] = None
-     duration: Annotated[timedelta_ms | None, Field(alias='durationMillis')] = None
-     run_time: Annotated[timedelta | None, Field(alias='runTimeSecs')] = None
-     metamorph: Annotated[int | None, Field(alias='metamorph')] = None
-     compute_units: Annotated[float, Field(alias='computeUnits')]
-
-
- class ActorRunOptions(BaseModel):
-     __model_config__ = ConfigDict(populate_by_name=True)
-
-     build: str
-     timeout: Annotated[timedelta, Field(alias='timeoutSecs')]
-     memory_mbytes: Annotated[int, Field(alias='memoryMbytes')]
-     disk_mbytes: Annotated[int, Field(alias='diskMbytes')]
-
-
- class ActorRunUsage(BaseModel):
-     __model_config__ = ConfigDict(populate_by_name=True)
-
-     actor_compute_units: Annotated[float | None, Field(alias='ACTOR_COMPUTE_UNITS')] = None
-     dataset_reads: Annotated[float | None, Field(alias='DATASET_READS')] = None
-     dataset_writes: Annotated[float | None, Field(alias='DATASET_WRITES')] = None
-     key_value_store_reads: Annotated[float | None, Field(alias='KEY_VALUE_STORE_READS')] = None
-     key_value_store_writes: Annotated[float | None, Field(alias='KEY_VALUE_STORE_WRITES')] = None
-     key_value_store_lists: Annotated[float | None, Field(alias='KEY_VALUE_STORE_LISTS')] = None
-     request_queue_reads: Annotated[float | None, Field(alias='REQUEST_QUEUE_READS')] = None
-     request_queue_writes: Annotated[float | None, Field(alias='REQUEST_QUEUE_WRITES')] = None
-     data_transfer_internal_gbytes: Annotated[float | None, Field(alias='DATA_TRANSFER_INTERNAL_GBYTES')] = None
-     data_transfer_external_gbytes: Annotated[float | None, Field(alias='DATA_TRANSFER_EXTERNAL_GBYTES')] = None
-     proxy_residential_transfer_gbytes: Annotated[float | None, Field(alias='PROXY_RESIDENTIAL_TRANSFER_GBYTES')] = None
-     proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
-
-
- class ActorRun(BaseModel):
-     __model_config__ = ConfigDict(populate_by_name=True)
-
-     id: Annotated[str, Field(alias='id')]
-     act_id: Annotated[str, Field(alias='actId')]
-     user_id: Annotated[str, Field(alias='userId')]
-     actor_task_id: Annotated[str | None, Field(alias='actorTaskId')] = None
-     started_at: Annotated[datetime, Field(alias='startedAt')]
-     finished_at: Annotated[datetime | None, Field(alias='finishedAt')] = None
-     status: Annotated[ActorJobStatus, Field(alias='status')]
-     status_message: Annotated[str | None, Field(alias='statusMessage')] = None
-     is_status_message_terminal: Annotated[bool | None, Field(alias='isStatusMessageTerminal')] = None
-     meta: Annotated[ActorRunMeta, Field(alias='meta')]
-     stats: Annotated[ActorRunStats, Field(alias='stats')]
-     options: Annotated[ActorRunOptions, Field(alias='options')]
-     build_id: Annotated[str, Field(alias='buildId')]
-     exit_code: Annotated[int | None, Field(alias='exitCode')] = None
-     default_key_value_store_id: Annotated[str, Field(alias='defaultKeyValueStoreId')]
-     default_dataset_id: Annotated[str, Field(alias='defaultDatasetId')]
-     default_request_queue_id: Annotated[str, Field(alias='defaultRequestQueueId')]
-     build_number: Annotated[str | None, Field(alias='buildNumber')] = None
-     container_url: Annotated[str, Field(alias='containerUrl')]
-     is_container_server_ready: Annotated[bool | None, Field(alias='isContainerServerReady')] = None
-     git_branch_name: Annotated[str | None, Field(alias='gitBranchName')] = None
-     usage: Annotated[ActorRunUsage | None, Field(alias='usage')] = None
-     usage_total_usd: Annotated[float | None, Field(alias='usageTotalUsd')] = None
-     usage_usd: Annotated[ActorRunUsage | None, Field(alias='usageUsd')] = None
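For reference, removing these models means the 2.0.0-style construction below no longer exists in 2.0.0b1; this sketch only restates what the deleted file defined (the URL is a placeholder):

```python
from apify._models import Webhook  # removed in 2.0.0b1
from apify_shared.consts import WebhookEventType

# 2.0.0 style: a validated pydantic model, serialized by Actor.start()/call().
webhook = Webhook(
    event_types=[WebhookEventType.ACTOR_RUN_FAILED],
    request_url='https://example.com/webhook-endpoint',  # placeholder URL
)
```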
apify/apify_storage_client/py.typed DELETED
File without changes
apify/log.py DELETED
@@ -1,43 +0,0 @@
- from __future__ import annotations
-
- import logging
- from typing import TYPE_CHECKING
-
- from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
-
- if TYPE_CHECKING:
-     from apify import Configuration
-
- # Name of the logger used throughout the library (resolves to 'apify')
- logger_name = __name__.split('.')[0]
-
- # Logger used throughout the library
- logger = logging.getLogger(logger_name)
-
-
- class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
-     pass
-
-
- def _configure_logging(configuration: Configuration) -> None:
-     apify_client_logger = logging.getLogger('apify_client')
-     configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
-
-     level = get_configured_log_level(configuration)
-
-     # Keep apify_client logger quiet unless debug logging is requested
-     if level > logging.DEBUG:
-         apify_client_logger.setLevel(logging.INFO)
-     else:
-         apify_client_logger.setLevel(level)
-
-     # Silence HTTPX logger unless debug logging is requested
-     httpx_logger = logging.getLogger('httpx')
-     if level > logging.DEBUG:
-         httpx_logger.setLevel(logging.WARNING)
-     else:
-         httpx_logger.setLevel(level)
-
-     # Use configured log level for apify logger
-     apify_logger = logging.getLogger('apify')
-     configure_logger(apify_logger, configuration, remove_old_handlers=True)
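Note that alongside this deletion, 2.0.0b1 also drops the automatic `_configure_logging()` call from `Actor`'s `async with` entry (see the `_actor.py` hunks above). A hedged sketch of wiring up comparable logging by hand, using only names visible in this diff plus the standard library:

```python
import logging

from apify._log import ActorLogFormatter  # added in 2.0.0b1, see apify/_log.py above

# Attach a formatted handler to the 'apify' logger that apify/_log.py defines.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.INFO)
apify_logger.addHandler(handler)
```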
apify/scrapy/middlewares/py.typed DELETED
File without changes
apify/scrapy/pipelines/py.typed DELETED
File without changes
apify/scrapy/py.typed DELETED
File without changes
apify/storages/py.typed DELETED
File without changes