apify 2.0.0-py3-none-any.whl → 2.0.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apify might be problematic.

apify/__init__.py CHANGED
@@ -1,7 +1,6 @@
  from importlib import metadata

  from apify_shared.consts import WebhookEventType
- from crawlee import Request
  from crawlee.events._types import Event

  from apify._actor import Actor
@@ -17,7 +16,6 @@ __all__ = [
      'Configuration',
      'ProxyConfiguration',
      'ProxyInfo',
-     'Request',
      'Webhook',
      'WebhookEventType',
      '__version__',
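
Note: with the `Request` re-export removed from `apify/__init__.py` and from `__all__`, code that previously did `from apify import Request` has to import it from `crawlee` directly (the unchanged `from crawlee import Request` context line in `_request_queue_client.py` below shows crawlee still exposes it at its top level). A minimal sketch of the adjusted import:

```python
# Before (apify 2.0.0): Request was re-exported by the SDK.
# from apify import Actor, Request

# After (apify 2.0.0a1): import Request from crawlee directly.
from apify import Actor
from crawlee import Request

# Hypothetical usage; assumes crawlee's Request.from_url constructor.
request = Request.from_url('https://apify.com')
```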
apify/_actor.py CHANGED
@@ -24,7 +24,7 @@ from apify._platform_event_manager import EventManager, LocalEventManager, Platf
  from apify._proxy_configuration import ProxyConfiguration
  from apify._utils import get_system_info, is_running_in_ipython
  from apify.apify_storage_client import ApifyStorageClient
- from apify.log import _configure_logging, logger
+ from apify.log import logger
  from apify.storages import Dataset, KeyValueStore, RequestQueue

  if TYPE_CHECKING:
@@ -46,24 +46,16 @@ class _ActorType:
      _configuration: Configuration
      _is_exiting = False

-     def __init__(
-         self,
-         configuration: Configuration | None = None,
-         *,
-         configure_logging: bool = True,
-     ) -> None:
+     def __init__(self, config: Configuration | None = None) -> None:
          """Create an Actor instance.

          Note that you don't have to do this, all the functionality is accessible using the default instance
          (e.g. `Actor.open_dataset()`).

          Args:
-             configuration: The Actor configuration to be used. If not passed, a new Configuration instance will
-                 be created.
-             configure_logging: Should the default logging configuration be configured?
+             config: The Actor configuration to be used. If not passed, a new Configuration instance will be created.
          """
-         self._configuration = configuration or Configuration.get_global_configuration()
-         self._configure_logging = configure_logging
+         self._configuration = config or Configuration.get_global_configuration()
          self._apify_client = self.new_client()

          self._event_manager: EventManager
@@ -89,9 +81,6 @@ class _ActorType:
          When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
          executing the block code, the `Actor.fail` method is called.
          """
-         if self._configure_logging:
-             _configure_logging(self._configuration)
-
          await self.init()
          return self

@@ -122,20 +111,15 @@ class _ActorType:

          return super().__repr__()

-     def __call__(self, configuration: Configuration | None = None, *, configure_logging: bool = True) -> Self:
+     def __call__(self, config: Configuration) -> Self:
          """Make a new Actor instance with a non-default configuration."""
-         return self.__class__(configuration=configuration, configure_logging=configure_logging)
+         return self.__class__(config=config)

      @property
      def apify_client(self) -> ApifyClientAsync:
          """The ApifyClientAsync instance the Actor instance uses."""
          return self._apify_client

-     @property
-     def configuration(self) -> Configuration:
-         """The Configuration instance the Actor instance uses."""
-         return self._configuration
-
      @property
      def config(self) -> Configuration:
          """The Configuration instance the Actor instance uses."""
apify/apify_storage_client/_request_queue_client.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations

  from typing import TYPE_CHECKING

- from more_itertools import chunked
  from typing_extensions import override

  from crawlee import Request
@@ -158,11 +157,8 @@ class RequestQueueClient(BaseRequestQueueClient):
          *,
          forefront: bool = False,
      ) -> BatchRequestsOperationResponse:
-         processed = []
-         unprocessed = []
-
-         for chunk in chunked(requests, 25):  # The API endpoint won't accept more than 25 requests at once
-             response = await self._client.batch_add_requests(
+         return BatchRequestsOperationResponse.model_validate(
+             await self._client.batch_add_requests(
                  requests=[
                      r.model_dump(
                          by_alias=True,
@@ -174,18 +170,10 @@ class RequestQueueClient(BaseRequestQueueClient):
                          'data',
                      },
                  )
-                 for r in chunk
+                 for r in requests
              ],
              forefront=forefront,
          )
-             processed.extend(response['processedRequests'])
-             unprocessed.extend(response['unprocessedRequests'])
-
-         return BatchRequestsOperationResponse.model_validate(
-             {
-                 'processedRequests': processed,
-                 'unprocessedRequests': unprocessed,
-             }
-         )
          )

      @override
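
The effect of this hunk is that 2.0.0a1 sends all requests to the `batch_add_requests` endpoint in one call, whereas 2.0.0 splits them into chunks of 25 (the per-call limit noted in the removed comment) and merges the per-chunk results. For reference, the removed chunking pattern boils down to something like this simplified sketch, where `client.batch_add_requests` stands in for the real Apify client call and the request serialization details are omitted:

```python
from more_itertools import chunked


async def add_in_batches(client, requests: list[dict], *, forefront: bool = False) -> dict:
    """Sketch of the 2.0.0 behavior: split into 25-request chunks and merge results."""
    processed: list[dict] = []
    unprocessed: list[dict] = []

    # The API accepts at most 25 requests per call, so submit in chunks.
    for chunk in chunked(requests, 25):
        response = await client.batch_add_requests(requests=list(chunk), forefront=forefront)
        processed.extend(response['processedRequests'])
        unprocessed.extend(response['unprocessedRequests'])

    return {'processedRequests': processed, 'unprocessedRequests': unprocessed}
```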
apify/log.py CHANGED
@@ -1,12 +1,8 @@
  from __future__ import annotations

  import logging
- from typing import TYPE_CHECKING

- from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
-
- if TYPE_CHECKING:
-     from apify import Configuration
+ from crawlee._log_config import CrawleeLogFormatter

  # Name of the logger used throughout the library (resolves to 'apify')
  logger_name = __name__.split('.')[0]
@@ -17,27 +13,3 @@ logger = logging.getLogger(logger_name)


  class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
      pass
-
-
- def _configure_logging(configuration: Configuration) -> None:
-     apify_client_logger = logging.getLogger('apify_client')
-     configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
-
-     level = get_configured_log_level(configuration)
-
-     # Keep apify_client logger quiet unless debug logging is requested
-     if level > logging.DEBUG:
-         apify_client_logger.setLevel(logging.INFO)
-     else:
-         apify_client_logger.setLevel(level)
-
-     # Silence HTTPX logger unless debug logging is requested
-     httpx_logger = logging.getLogger('httpx')
-     if level > logging.DEBUG:
-         httpx_logger.setLevel(logging.WARNING)
-     else:
-         httpx_logger.setLevel(level)
-
-     # Use configured log level for apify logger
-     apify_logger = logging.getLogger('apify')
-     configure_logger(apify_logger, configuration, remove_old_handlers=True)
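
Since 2.0.0a1 drops `_configure_logging` (and the `configure_logging` flag above), entering the Actor context no longer touches the `apify`, `apify_client`, or `httpx` loggers. The removed helper's level logic can be approximated with stdlib `logging` alone; a rough sketch, with the handler/formatter setup from crawlee's `configure_logger` omitted and `level` supplied by the caller instead of `get_configured_log_level`:

```python
import logging


def quiet_noisy_loggers(level: int) -> None:
    # Mirror the removed behavior: keep apify_client and httpx quiet
    # unless debug logging was requested.
    logging.getLogger('apify_client').setLevel(
        logging.INFO if level > logging.DEBUG else level
    )
    logging.getLogger('httpx').setLevel(
        logging.WARNING if level > logging.DEBUG else level
    )
    # Use the configured log level for the apify logger itself.
    logging.getLogger('apify').setLevel(level)


quiet_noisy_loggers(logging.INFO)
```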
{apify-2.0.0.dist-info → apify-2.0.0a1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: apify
- Version: 2.0.0
+ Version: 2.0.0a1
  Summary: Apify SDK for Python
  License: Apache-2.0
  Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -20,7 +20,7 @@ Classifier: Topic :: Software Development :: Libraries
  Provides-Extra: scrapy
  Requires-Dist: apify-client (>=1.7.1)
  Requires-Dist: apify-shared (>=1.1.2)
- Requires-Dist: crawlee (>=0.3.5)
+ Requires-Dist: crawlee (>=0.3.0)
  Requires-Dist: cryptography (>=42.0.0)
  Requires-Dist: httpx (>=0.27.0)
  Requires-Dist: lazy-object-proxy (>=1.10.0)
@@ -64,108 +64,27 @@ pip install apify[scrapy]

  For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).

- ## Examples
-
- Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
-
- ### Apify SDK with HTTPX and BeautifulSoup
-
- This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
+ ## Example

  ```python
  from apify import Actor
  from bs4 import BeautifulSoup
  from httpx import AsyncClient

-
- async def main() -> None:
-     async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
-
-         # Open the default request queue for handling URLs to be processed.
-         request_queue = await Actor.open_request_queue()
-
-         # Enqueue the start URLs.
-         for start_url in start_urls:
-             url = start_url.get('url')
-             await request_queue.add_request(url)
-
-         # Process the URLs from the request queue.
-         while request := await request_queue.fetch_next_request():
-             Actor.log.info(f'Scraping {request.url} ...')
-
-             # Fetch the HTTP response from the specified URL using HTTPX.
-             async with AsyncClient() as client:
-                 response = await client.get(request.url)
-
-             # Parse the HTML content using Beautiful Soup.
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Extract the desired data.
-             data = {
-                 'url': actor_input['url'],
-                 'title': soup.title.string,
-                 'h1s': [h1.text for h1 in soup.find_all('h1')],
-                 'h2s': [h2.text for h2 in soup.find_all('h2')],
-                 'h3s': [h3.text for h3 in soup.find_all('h3')],
-             }
-
-             # Store the extracted data to the default dataset.
-             await Actor.push_data(data)
- ```
-
- ### Apify SDK with PlaywrightCrawler from Crawlee
-
- This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
-
- ```python
- from apify import Actor, Request
- from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
-
-
  async def main() -> None:
      async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
-
-         # Exit if no start URLs are provided.
-         if not start_urls:
-             Actor.log.info('No start URLs specified in Actor input, exiting...')
-             await Actor.exit()
-
-         # Create a crawler.
-         crawler = PlaywrightCrawler(
-             # Limit the crawl to max requests. Remove or increase it for crawling all links.
-             max_requests_per_crawl=50,
-             headless=True,
-         )
-
-         # Define a request handler, which will be called for every request.
-         @crawler.router.default_handler
-         async def request_handler(context: PlaywrightCrawlingContext) -> None:
-             url = context.request.url
-             Actor.log.info(f'Scraping {url}...')
-
-             # Extract the desired data.
-             data = {
-                 'url': context.request.url,
-                 'title': await context.page.title(),
-                 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
-             }
-
-             # Store the extracted data to the default dataset.
-             await context.push_data(data)
-
-             # Enqueue additional links found on the current page.
-             await context.enqueue_links()
-
-         # Run the crawler with the starting URLs.
-         await crawler.run(start_urls)
+         # Read the input parameters from the Actor input
+         actor_input = await Actor.get_input()
+         # Fetch the HTTP response from the specified URL
+         async with AsyncClient() as client:
+             response = await client.get(actor_input['url'])
+         # Process the HTML content
+         soup = BeautifulSoup(response.content, 'html.parser')
+         # Push the extracted data
+         await Actor.push_data({
+             'url': actor_input['url'],
+             'title': soup.title.string,
+         })
  ```

  ## What are Actors?
{apify-2.0.0.dist-info → apify-2.0.0a1.dist-info}/RECORD RENAMED
@@ -1,5 +1,5 @@
- apify/__init__.py,sha256=ikoi2EpDYl6y-XSVtlU8UsdQdMEyOiIJCRRAaZFDOP8,550
- apify/_actor.py,sha256=oPgQ3rxxIEzVcZ9XtI3lf1a_6gwIMgxihNuYGjJpGww,41816
+ apify/__init__.py,sha256=6D62MrlyEsGWRLG5BQXPG5ODGXhBQVjrkJxogVxCT5Y,507
+ apify/_actor.py,sha256=0HqsIIdyMGjrUWHoTxuvHb95UMFzvJsMGKqgrqfZVoA,41188
  apify/_configuration.py,sha256=gf7YOun32Whc9DamhoWDLmcUeNwtWVmmBPrl4oq6s4I,8997
  apify/_consts.py,sha256=_Xq4hOfOA1iZ3n1P967YWdyncKivpbX6RTlp_qanUoE,330
  apify/_crypto.py,sha256=b4Czs1NLPkaNzkPjovObjSIbsKnRrgtBkM9JvOysUMA,5612
@@ -13,10 +13,10 @@ apify/apify_storage_client/_dataset_client.py,sha256=j9seF2OKvbSMD9R9XF9fpa1vtr_
  apify/apify_storage_client/_dataset_collection_client.py,sha256=fkYvYGQCigHD2CDzpWk0swNAkfvAinAhMGpYqllle3E,1445
  apify/apify_storage_client/_key_value_store_client.py,sha256=uyeQgb75sGFsqIS4sq4hEZ3QP81COLfS3tmTqHc0tso,3340
  apify/apify_storage_client/_key_value_store_collection_client.py,sha256=vCtMTI-jx89Qp5WHILDNkCthwLuv0MAwm1J_5E4aypU,1519
- apify/apify_storage_client/_request_queue_client.py,sha256=P8ws8jEzi2PWpp-cvYfV7kwuKbgH813BpNQ_wMSVtTA,6278
+ apify/apify_storage_client/_request_queue_client.py,sha256=jAiFkaJ38_myHFGTw-Rk21wmpbN0UCR2w2SFoimFGFc,5826
  apify/apify_storage_client/_request_queue_collection_client.py,sha256=NnO73UJ9ZrjV8xoudo30wfaM-SojRkG0guhxDyB-K1g,1527
  apify/apify_storage_client/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify/log.py,sha256=pX6ppIvds8OKqjFpIcshqG4zp_5DiOUU31ksyfSExto,1392
+ apify/log.py,sha256=Shns441HqiMC9FDdtmftgQmnJWQL3DAKHBRA0E7lbdQ,390
  apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  apify/scrapy/__init__.py,sha256=qDPV_zTRFaUqoFOyS5g4uBfz-UCkmWYJ82VXQ_3Cw6k,348
  apify/scrapy/middlewares/__init__.py,sha256=tfW-d3WFWLeNEjL8fTmon6NwgD-OXx1Bw2fBdU-wPy4,114
@@ -31,7 +31,7 @@ apify/scrapy/scheduler.py,sha256=AAIKY5i1QxkC1mtmix6n3M2eQaOw-d1T56Noue9xToc,601
  apify/scrapy/utils.py,sha256=tz_Y8CTqe6KbyMMhLF3m7qqR46jtNH5U7Ty7e19roPU,2814
  apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
  apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- apify-2.0.0.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
- apify-2.0.0.dist-info/METADATA,sha256=DhojQDiiwKEwS7VcAufA7ERVHYHKk5mqHFtddWXL4Qk,8604
- apify-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- apify-2.0.0.dist-info/RECORD,,
+ apify-2.0.0a1.dist-info/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+ apify-2.0.0a1.dist-info/METADATA,sha256=qmyr_aWFO-X6OPEqikAOlAnbbF1loZMbRAne7kHeDfQ,5231
+ apify-2.0.0a1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ apify-2.0.0a1.dist-info/RECORD,,