apify 2.0.0__tar.gz → 2.0.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of apify might be problematic.

Files changed (39)
  1. {apify-2.0.0 → apify-2.0.0a1}/PKG-INFO +15 -96
  2. apify-2.0.0a1/README.md +90 -0
  3. {apify-2.0.0 → apify-2.0.0a1}/pyproject.toml +3 -3
  4. {apify-2.0.0 → apify-2.0.0a1}/src/apify/__init__.py +0 -2
  5. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_actor.py +6 -22
  6. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_request_queue_client.py +3 -15
  7. apify-2.0.0a1/src/apify/log.py +15 -0
  8. apify-2.0.0/README.md +0 -171
  9. apify-2.0.0/src/apify/log.py +0 -43
  10. {apify-2.0.0 → apify-2.0.0a1}/LICENSE +0 -0
  11. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_configuration.py +0 -0
  12. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_consts.py +0 -0
  13. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_crypto.py +0 -0
  14. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_models.py +0 -0
  15. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_platform_event_manager.py +0 -0
  16. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_proxy_configuration.py +0 -0
  17. {apify-2.0.0 → apify-2.0.0a1}/src/apify/_utils.py +0 -0
  18. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/__init__.py +0 -0
  19. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_apify_storage_client.py +0 -0
  20. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_dataset_client.py +0 -0
  21. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_dataset_collection_client.py +0 -0
  22. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_key_value_store_client.py +0 -0
  23. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_key_value_store_collection_client.py +0 -0
  24. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_request_queue_collection_client.py +0 -0
  25. {apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/py.typed +0 -0
  26. {apify-2.0.0 → apify-2.0.0a1}/src/apify/py.typed +0 -0
  27. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/__init__.py +0 -0
  28. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/middlewares/__init__.py +0 -0
  29. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
  30. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/middlewares/py.typed +0 -0
  31. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/pipelines/__init__.py +0 -0
  32. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
  33. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/pipelines/py.typed +0 -0
  34. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/py.typed +0 -0
  35. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/requests.py +0 -0
  36. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/scheduler.py +0 -0
  37. {apify-2.0.0 → apify-2.0.0a1}/src/apify/scrapy/utils.py +0 -0
  38. {apify-2.0.0 → apify-2.0.0a1}/src/apify/storages/__init__.py +0 -0
  39. {apify-2.0.0 → apify-2.0.0a1}/src/apify/storages/py.typed +0 -0
{apify-2.0.0 → apify-2.0.0a1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: apify
- Version: 2.0.0
+ Version: 2.0.0a1
  Summary: Apify SDK for Python
  License: Apache-2.0
  Keywords: apify,sdk,automation,chrome,crawlee,crawler,headless,scraper,scraping
@@ -20,7 +20,7 @@ Classifier: Topic :: Software Development :: Libraries
  Provides-Extra: scrapy
  Requires-Dist: apify-client (>=1.7.1)
  Requires-Dist: apify-shared (>=1.1.2)
- Requires-Dist: crawlee (>=0.3.5)
+ Requires-Dist: crawlee (>=0.3.0)
  Requires-Dist: cryptography (>=42.0.0)
  Requires-Dist: httpx (>=0.27.0)
  Requires-Dist: lazy-object-proxy (>=1.10.0)
@@ -64,108 +64,27 @@ pip install apify[scrapy]

  For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).

- ## Examples
-
- Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
-
- ### Apify SDK with HTTPX and BeautifulSoup
-
- This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
+ ## Example

  ```python
  from apify import Actor
  from bs4 import BeautifulSoup
  from httpx import AsyncClient

-
- async def main() -> None:
-     async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
-
-         # Open the default request queue for handling URLs to be processed.
-         request_queue = await Actor.open_request_queue()
-
-         # Enqueue the start URLs.
-         for start_url in start_urls:
-             url = start_url.get('url')
-             await request_queue.add_request(url)
-
-         # Process the URLs from the request queue.
-         while request := await request_queue.fetch_next_request():
-             Actor.log.info(f'Scraping {request.url} ...')
-
-             # Fetch the HTTP response from the specified URL using HTTPX.
-             async with AsyncClient() as client:
-                 response = await client.get(request.url)
-
-             # Parse the HTML content using Beautiful Soup.
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Extract the desired data.
-             data = {
-                 'url': actor_input['url'],
-                 'title': soup.title.string,
-                 'h1s': [h1.text for h1 in soup.find_all('h1')],
-                 'h2s': [h2.text for h2 in soup.find_all('h2')],
-                 'h3s': [h3.text for h3 in soup.find_all('h3')],
-             }
-
-             # Store the extracted data to the default dataset.
-             await Actor.push_data(data)
- ```
-
- ### Apify SDK with PlaywrightCrawler from Crawlee
-
- This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
-
- ```python
- from apify import Actor, Request
- from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
-
-
  async def main() -> None:
      async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
-
-         # Exit if no start URLs are provided.
-         if not start_urls:
-             Actor.log.info('No start URLs specified in Actor input, exiting...')
-             await Actor.exit()
-
-         # Create a crawler.
-         crawler = PlaywrightCrawler(
-             # Limit the crawl to max requests. Remove or increase it for crawling all links.
-             max_requests_per_crawl=50,
-             headless=True,
-         )
-
-         # Define a request handler, which will be called for every request.
-         @crawler.router.default_handler
-         async def request_handler(context: PlaywrightCrawlingContext) -> None:
-             url = context.request.url
-             Actor.log.info(f'Scraping {url}...')
-
-             # Extract the desired data.
-             data = {
-                 'url': context.request.url,
-                 'title': await context.page.title(),
-                 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
-             }
-
-             # Store the extracted data to the default dataset.
-             await context.push_data(data)
-
-             # Enqueue additional links found on the current page.
-             await context.enqueue_links()
-
-         # Run the crawler with the starting URLs.
-         await crawler.run(start_urls)
+         # Read the input parameters from the Actor input
+         actor_input = await Actor.get_input()
+         # Fetch the HTTP response from the specified URL
+         async with AsyncClient() as client:
+             response = await client.get(actor_input['url'])
+         # Process the HTML content
+         soup = BeautifulSoup(response.content, 'html.parser')
+         # Push the extracted data
+         await Actor.push_data({
+             'url': actor_input['url'],
+             'title': soup.title.string,
+         })
  ```

  ## What are Actors?
apify-2.0.0a1/README.md
@@ -0,0 +1,90 @@
+ # Apify SDK for Python
+
+ The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
+ in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
+ event handling.
+
+ If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
+ check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
+
+ ## Installation
+
+ The Apify SDK for Python is available on PyPI as the `apify` package.
+ For default installation, using Pip, run the following:
+
+ ```bash
+ pip install apify
+ ```
+
+ For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
+ To install Apify with the `scrapy` extra, use the following command:
+
+ ```bash
+ pip install apify[scrapy]
+ ```
+
+ ## Documentation
+
+ For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
+
+ ## Example
+
+ ```python
+ from apify import Actor
+ from bs4 import BeautifulSoup
+ from httpx import AsyncClient
+
+ async def main() -> None:
+     async with Actor:
+         # Read the input parameters from the Actor input
+         actor_input = await Actor.get_input()
+         # Fetch the HTTP response from the specified URL
+         async with AsyncClient() as client:
+             response = await client.get(actor_input['url'])
+         # Process the HTML content
+         soup = BeautifulSoup(response.content, 'html.parser')
+         # Push the extracted data
+         await Actor.push_data({
+             'url': actor_input['url'],
+             'title': soup.title.string,
+         })
+ ```
+
+ ## What are Actors?
+
+ Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
+ They can do anything from small tasks such as filling in forms or unsubscribing from online services,
+ all the way up to scraping and processing vast numbers of web pages.
+
+ They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
+ where you can run them at scale, monitor them, schedule them, or publish and monetize them.
+
+ If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about)
+ in the Apify platform documentation.
+
+ ## Creating Actors
+
+ To create and run Actors through Apify Console,
+ see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).
+
+ To create and run Python Actors locally, check the documentation for
+ [how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).
+
+ ## Guides
+
+ To see how you can use the Apify SDK with other popular libraries used for web scraping,
+ check out our guides for using
+ [Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
+ [Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
+ [Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
+ [Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
+ or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).
+
+ ## Usage concepts
+
+ To learn more about the features of the Apify SDK and how to use them,
+ check out the Usage Concepts section in the sidebar,
+ particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
+ [working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
+ [handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
+ or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
{apify-2.0.0 → apify-2.0.0a1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "apify"
- version = "2.0.0"
+ version = "2.0.0a1"
  description = "Apify SDK for Python"
  authors = ["Apify Technologies s.r.o. <support@apify.com>"]
  license = "Apache-2.0"
@@ -48,7 +48,7 @@ keywords = [
  python = "^3.9"
  apify-client = ">=1.7.1"
  apify-shared = ">=1.1.2"
- crawlee = ">=0.3.5"
+ crawlee = ">=0.3.0"
  cryptography = ">=42.0.0"
  httpx = ">=0.27.0"
  lazy-object-proxy = ">=1.10.0"
@@ -58,7 +58,7 @@ websockets = ">=10.0"

  [tool.poetry.group.dev.dependencies]
  build = "~1.2.0"
- filelock = "~3.16.0"
+ filelock = "~3.15.0"
  griffe = "~1.2.0"
  mypy = "~1.11.0"
  pre-commit = "~3.8.0"
{apify-2.0.0 → apify-2.0.0a1}/src/apify/__init__.py
@@ -1,7 +1,6 @@
  from importlib import metadata

  from apify_shared.consts import WebhookEventType
- from crawlee import Request
  from crawlee.events._types import Event

  from apify._actor import Actor
@@ -17,7 +16,6 @@ __all__ = [
      'Configuration',
      'ProxyConfiguration',
      'ProxyInfo',
-     'Request',
      'Webhook',
      'WebhookEventType',
      '__version__',
{apify-2.0.0 → apify-2.0.0a1}/src/apify/_actor.py
@@ -24,7 +24,7 @@ from apify._platform_event_manager import EventManager, LocalEventManager, Platf
  from apify._proxy_configuration import ProxyConfiguration
  from apify._utils import get_system_info, is_running_in_ipython
  from apify.apify_storage_client import ApifyStorageClient
- from apify.log import _configure_logging, logger
+ from apify.log import logger
  from apify.storages import Dataset, KeyValueStore, RequestQueue

  if TYPE_CHECKING:
@@ -46,24 +46,16 @@ class _ActorType:
      _configuration: Configuration
      _is_exiting = False

-     def __init__(
-         self,
-         configuration: Configuration | None = None,
-         *,
-         configure_logging: bool = True,
-     ) -> None:
+     def __init__(self, config: Configuration | None = None) -> None:
          """Create an Actor instance.

          Note that you don't have to do this, all the functionality is accessible using the default instance
          (e.g. `Actor.open_dataset()`).

          Args:
-             configuration: The Actor configuration to be used. If not passed, a new Configuration instance will
-                 be created.
-             configure_logging: Should the default logging configuration be configured?
+             config: The Actor configuration to be used. If not passed, a new Configuration instance will be created.
          """
-         self._configuration = configuration or Configuration.get_global_configuration()
-         self._configure_logging = configure_logging
+         self._configuration = config or Configuration.get_global_configuration()
          self._apify_client = self.new_client()

          self._event_manager: EventManager
@@ -89,9 +81,6 @@ class _ActorType:
          When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
          executing the block code, the `Actor.fail` method is called.
          """
-         if self._configure_logging:
-             _configure_logging(self._configuration)
-
          await self.init()
          return self

@@ -122,20 +111,15 @@ class _ActorType:

          return super().__repr__()

-     def __call__(self, configuration: Configuration | None = None, *, configure_logging: bool = True) -> Self:
+     def __call__(self, config: Configuration) -> Self:
          """Make a new Actor instance with a non-default configuration."""
-         return self.__class__(configuration=configuration, configure_logging=configure_logging)
+         return self.__class__(config=config)

      @property
      def apify_client(self) -> ApifyClientAsync:
          """The ApifyClientAsync instance the Actor instance uses."""
          return self._apify_client

-     @property
-     def configuration(self) -> Configuration:
-         """The Configuration instance the Actor instance uses."""
-         return self._configuration
-
      @property
      def config(self) -> Configuration:
          """The Configuration instance the Actor instance uses."""
{apify-2.0.0 → apify-2.0.0a1}/src/apify/apify_storage_client/_request_queue_client.py
@@ -2,7 +2,6 @@ from __future__ import annotations

  from typing import TYPE_CHECKING

- from more_itertools import chunked
  from typing_extensions import override

  from crawlee import Request
@@ -158,11 +157,8 @@ class RequestQueueClient(BaseRequestQueueClient):
          *,
          forefront: bool = False,
      ) -> BatchRequestsOperationResponse:
-         processed = []
-         unprocessed = []
-
-         for chunk in chunked(requests, 25):  # The API endpoint won't accept more than 25 requests at once
-             response = await self._client.batch_add_requests(
+         return BatchRequestsOperationResponse.model_validate(
+             await self._client.batch_add_requests(
                  requests=[
                      r.model_dump(
                          by_alias=True,
@@ -174,18 +170,10 @@ class RequestQueueClient(BaseRequestQueueClient):
                          'data',
                      },
                  )
-                 for r in chunk
+                 for r in requests
                  ],
                  forefront=forefront,
              )
-             processed.extend(response['processedRequests'])
-             unprocessed.extend(response['unprocessedRequests'])
-
-         return BatchRequestsOperationResponse.model_validate(
-             {
-                 'processedRequests': processed,
-                 'unprocessedRequests': unprocessed,
-             }
          )

      @override
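The hunk above changes behavior, not just style: 2.0.0 splits the requests into chunks of 25 before calling the API and merges the partial results, while 2.0.0a1 forwards the whole sequence in a single call. A standalone sketch of the chunking pattern being removed, reduced to its core; the helper name and the plain-dict handling are illustrative, not part of the package:

```python
from more_itertools import chunked


async def add_requests_in_chunks(client, requests, *, forefront=False):
    """Illustrative helper mirroring the batching logic removed in 2.0.0a1."""
    processed, unprocessed = [], []
    # The API endpoint won't accept more than 25 requests at once, so the 2.0.0
    # code submits them in chunks and concatenates the per-chunk results.
    for chunk in chunked(requests, 25):
        response = await client.batch_add_requests(requests=list(chunk), forefront=forefront)
        processed.extend(response['processedRequests'])
        unprocessed.extend(response['unprocessedRequests'])
    return {'processedRequests': processed, 'unprocessedRequests': unprocessed}
```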
apify-2.0.0a1/src/apify/log.py
@@ -0,0 +1,15 @@
+ from __future__ import annotations
+
+ import logging
+
+ from crawlee._log_config import CrawleeLogFormatter
+
+ # Name of the logger used throughout the library (resolves to 'apify')
+ logger_name = __name__.split('.')[0]
+
+ # Logger used throughout the library
+ logger = logging.getLogger(logger_name)
+
+
+ class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
+     pass
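The 2.0.0a1 `log.py` shown above only exposes the `apify` logger and the `ActorLogFormatter` class; the `_configure_logging` helper that 2.0.0 calls on startup exists only in the module deleted later in this diff. A minimal sketch of wiring the formatter up manually, assuming the standard-library `logging` setup is acceptable:

```python
import logging

from apify.log import ActorLogFormatter

# Attach a stream handler using the SDK's formatter to the 'apify' logger.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.INFO)
apify_logger.addHandler(handler)
```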
apify-2.0.0/README.md DELETED
@@ -1,171 +0,0 @@
- # Apify SDK for Python
-
- The Apify SDK for Python is the official library to create [Apify Actors](https://docs.apify.com/platform/actors)
- in Python. It provides useful features like Actor lifecycle management, local storage emulation, and Actor
- event handling.
-
- If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
- check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
-
- ## Installation
-
- The Apify SDK for Python is available on PyPI as the `apify` package.
- For default installation, using Pip, run the following:
-
- ```bash
- pip install apify
- ```
-
- For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
- To install Apify with the `scrapy` extra, use the following command:
-
- ```bash
- pip install apify[scrapy]
- ```
-
- ## Documentation
-
- For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
-
- ## Examples
-
- Below are few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
-
- ### Apify SDK with HTTPX and BeautifulSoup
-
- This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
-
- ```python
- from apify import Actor
- from bs4 import BeautifulSoup
- from httpx import AsyncClient
-
-
- async def main() -> None:
-     async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
-
-         # Open the default request queue for handling URLs to be processed.
-         request_queue = await Actor.open_request_queue()
-
-         # Enqueue the start URLs.
-         for start_url in start_urls:
-             url = start_url.get('url')
-             await request_queue.add_request(url)
-
-         # Process the URLs from the request queue.
-         while request := await request_queue.fetch_next_request():
-             Actor.log.info(f'Scraping {request.url} ...')
-
-             # Fetch the HTTP response from the specified URL using HTTPX.
-             async with AsyncClient() as client:
-                 response = await client.get(request.url)
-
-             # Parse the HTML content using Beautiful Soup.
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Extract the desired data.
-             data = {
-                 'url': actor_input['url'],
-                 'title': soup.title.string,
-                 'h1s': [h1.text for h1 in soup.find_all('h1')],
-                 'h2s': [h2.text for h2 in soup.find_all('h2')],
-                 'h3s': [h3.text for h3 in soup.find_all('h3')],
-             }
-
-             # Store the extracted data to the default dataset.
-             await Actor.push_data(data)
- ```
-
- ### Apify SDK with PlaywrightCrawler from Crawlee
-
- This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
-
- ```python
- from apify import Actor, Request
- from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
-
-
- async def main() -> None:
-     async with Actor:
-         # Retrieve the Actor input, and use default values if not provided.
-         actor_input = await Actor.get_input() or {}
-         start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
-
-         # Exit if no start URLs are provided.
-         if not start_urls:
-             Actor.log.info('No start URLs specified in Actor input, exiting...')
-             await Actor.exit()
-
-         # Create a crawler.
-         crawler = PlaywrightCrawler(
-             # Limit the crawl to max requests. Remove or increase it for crawling all links.
-             max_requests_per_crawl=50,
-             headless=True,
-         )
-
-         # Define a request handler, which will be called for every request.
-         @crawler.router.default_handler
-         async def request_handler(context: PlaywrightCrawlingContext) -> None:
-             url = context.request.url
-             Actor.log.info(f'Scraping {url}...')
-
-             # Extract the desired data.
-             data = {
-                 'url': context.request.url,
-                 'title': await context.page.title(),
-                 'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                 'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
-             }
-
-             # Store the extracted data to the default dataset.
-             await context.push_data(data)
-
-             # Enqueue additional links found on the current page.
-             await context.enqueue_links()
-
-         # Run the crawler with the starting URLs.
-         await crawler.run(start_urls)
- ```
-
- ## What are Actors?
-
- Actors are serverless cloud programs that can do almost anything a human can do in a web browser.
- They can do anything from small tasks such as filling in forms or unsubscribing from online services,
- all the way up to scraping and processing vast numbers of web pages.
-
- They can be run either locally, or on the [Apify platform](https://docs.apify.com/platform/),
- where you can run them at scale, monitor them, schedule them, or publish and monetize them.
-
- If you're new to Apify, learn [what is Apify](https://docs.apify.com/platform/about)
- in the Apify platform documentation.
-
- ## Creating Actors
-
- To create and run Actors through Apify Console,
- see the [Console documentation](https://docs.apify.com/academy/getting-started/creating-actors#choose-your-template).
-
- To create and run Python Actors locally, check the documentation for
- [how to create and run Python Actors locally](https://docs.apify.com/sdk/python/docs/overview/running-locally).
-
- ## Guides
-
- To see how you can use the Apify SDK with other popular libraries used for web scraping,
- check out our guides for using
- [Requests and HTTPX](https://docs.apify.com/sdk/python/docs/guides/requests-and-httpx),
- [Beautiful Soup](https://docs.apify.com/sdk/python/docs/guides/beautiful-soup),
- [Playwright](https://docs.apify.com/sdk/python/docs/guides/playwright),
- [Selenium](https://docs.apify.com/sdk/python/docs/guides/selenium),
- or [Scrapy](https://docs.apify.com/sdk/python/docs/guides/scrapy).
-
- ## Usage concepts
-
- To learn more about the features of the Apify SDK and how to use them,
- check out the Usage Concepts section in the sidebar,
- particularly the guides for the [Actor lifecycle](https://docs.apify.com/sdk/python/docs/concepts/actor-lifecycle),
- [working with storages](https://docs.apify.com/sdk/python/docs/concepts/storages),
- [handling Actor events](https://docs.apify.com/sdk/python/docs/concepts/actor-events)
- or [how to use proxies](https://docs.apify.com/sdk/python/docs/concepts/proxy-management).
apify-2.0.0/src/apify/log.py DELETED
@@ -1,43 +0,0 @@
- from __future__ import annotations
-
- import logging
- from typing import TYPE_CHECKING
-
- from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
-
- if TYPE_CHECKING:
-     from apify import Configuration
-
- # Name of the logger used throughout the library (resolves to 'apify')
- logger_name = __name__.split('.')[0]
-
- # Logger used throughout the library
- logger = logging.getLogger(logger_name)
-
-
- class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 Inherited from parent class
-     pass
-
-
- def _configure_logging(configuration: Configuration) -> None:
-     apify_client_logger = logging.getLogger('apify_client')
-     configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
-
-     level = get_configured_log_level(configuration)
-
-     # Keep apify_client logger quiet unless debug logging is requested
-     if level > logging.DEBUG:
-         apify_client_logger.setLevel(logging.INFO)
-     else:
-         apify_client_logger.setLevel(level)
-
-     # Silence HTTPX logger unless debug logging is requested
-     httpx_logger = logging.getLogger('httpx')
-     if level > logging.DEBUG:
-         httpx_logger.setLevel(logging.WARNING)
-     else:
-         httpx_logger.setLevel(level)
-
-     # Use configured log level for apify logger
-     apify_logger = logging.getLogger('apify')
-     configure_logger(apify_logger, configuration, remove_old_handlers=True)