pydoll-python 2.21.3__tar.gz → 2.22.0__tar.gz
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/PKG-INFO +135 -96
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/README.md +133 -95
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/tab.py +67 -15
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/__init__.py +2 -0
- pydoll_python-2.22.0/pydoll/commands/accessibility_commands.py +223 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/exceptions.py +11 -0
- pydoll_python-2.22.0/pydoll/extractor/__init__.py +16 -0
- pydoll_python-2.22.0/pydoll/extractor/engine.py +346 -0
- pydoll_python-2.22.0/pydoll/extractor/exceptions.py +23 -0
- pydoll_python-2.22.0/pydoll/extractor/field.py +112 -0
- pydoll_python-2.22.0/pydoll/extractor/model.py +98 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/iframe.py +2 -2
- pydoll_python-2.22.0/pydoll/protocol/accessibility/__init__.py +1 -0
- pydoll_python-2.22.0/pydoll/protocol/accessibility/events.py +47 -0
- pydoll_python-2.22.0/pydoll/protocol/accessibility/methods.py +122 -0
- pydoll_python-2.22.0/pydoll/protocol/accessibility/types.py +192 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pyproject.toml +2 -1
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/LICENSE +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/base.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/chrome.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/edge.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/interfaces.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/browser_options_manager.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/browser_process_manager.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/proxy_manager.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/temp_dir_manager.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/options.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/har_recorder.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/request.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/response.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/browser_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/dom_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/emulation_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/fetch_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/input_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/network_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/page_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/runtime_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/storage_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/target_commands.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/connection_handler.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/managers/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/managers/commands_manager.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/managers/events_manager.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/constants.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/decorators.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/mixins/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/mixins/find_elements_mixin.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/shadow_root.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/utils/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/utils/selector_parser.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/web_element.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/keyboard.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/mouse.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/scroll.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/utils.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/base.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/debugger/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/emulation/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/emulation/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/emulation/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/io/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/har_types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/security/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/events.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/methods.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/types.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/py.typed +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/__init__.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/bundle.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/general.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/socks5_proxy_forwarder.py +0 -0
- {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/user_agent_parser.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pydoll-python
|
|
3
|
-
Version: 2.21.3
|
|
3
|
+
Version: 2.22.0
|
|
4
4
|
Summary: Pydoll is a library for automating chromium-based browsers without a WebDriver, offering realistic interactions.
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Author: Thalison Fernandes
|
|
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.14
|
|
15
15
|
Requires-Dist: aiofiles (>=25.1.0,<26.0.0)
|
|
16
16
|
Requires-Dist: aiohttp (>=3.9.5,<4.0.0)
|
|
17
|
+
Requires-Dist: pydantic (>=2.0,<3.0)
|
|
17
18
|
Requires-Dist: typing_extensions (>=4.14.0,<5.0.0)
|
|
18
19
|
Requires-Dist: websockets (>=14,<15)
|
|
19
20
|
Description-Content-Type: text/markdown
|
|
@@ -42,39 +43,37 @@ Description-Content-Type: text/markdown
|
|
|
42
43
|
<a href="#support">Support</a>
|
|
43
44
|
</p>
|
|
44
45
|
|
|
45
|
-
Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues
|
|
46
|
+
Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. **No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues.**
|
|
46
47
|
|
|
47
|
-
It combines a high-level API for
|
|
48
|
+
It combines a high-level API for stealthy automation with low-level CDP access for fine-grained control over network, fingerprinting, and browser behavior. And with its new **Pydantic-powered extraction engine**, it maps the DOM directly to structured Python objects, delivering an unmatched Developer Experience (DX).
|
|
48
49
|
|
|
49
|
-
### Sponsors
|
|
50
|
+
### Top Sponsors
|
|
50
51
|
|
|
51
|
-
<a href="https://
|
|
52
|
-
<img
|
|
52
|
+
<a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">
|
|
53
|
+
<img src="public/images/banner-the-webscraping-club.png" alt="The Web Scraping Club" />
|
|
53
54
|
</a>
|
|
54
55
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
**[Sign up through our link](https://www.thordata.com/?ls=github&lk=pydoll)** to support the project and get **1GB free** to get started.
|
|
58
|
-
|
|
59
|
-
---
|
|
60
|
-
|
|
61
|
-
<a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc">
|
|
62
|
-
<img alt="CapSolver" src="public/images/capsolver.jpeg" />
|
|
63
|
-
</a>
|
|
56
|
+
<sub>Read a full review of Pydoll on <b><a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">The Web Scraping Club</a></b>, the #1 newsletter dedicated to web scraping.</sub>
|
|
64
57
|
|
|
65
|
-
|
|
58
|
+
### Sponsors
|
|
66
59
|
|
|
67
|
-
|
|
60
|
+
<table>
|
|
61
|
+
<tr>
|
|
62
|
+
<td><a href="https://www.thordata.com/?ls=github&lk=pydoll"><img src="public/images/Thordata-logo.png" height="30" alt="Thordata" /></a></td>
|
|
63
|
+
<td><a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc"><img src="public/images/capsolver-logo.png" height="40" alt="CapSolver" /></a></td>
|
|
64
|
+
<td><a href="https://www.testmuai.com/?utm_medium=sponsor&utm_source=pydoll"><img src="public/images/logo-lamda-test.svg" height="30" width="130" alt="LambdaTest" /></a></td>
|
|
65
|
+
</tr>
|
|
66
|
+
</table>
|
|
68
67
|
|
|
69
|
-
|
|
68
|
+
<sub>[Learn more about our sponsors](SPONSORS.md) · [Become a sponsor](https://github.com/sponsors/thalissonvs)</sub>
|
|
70
69
|
|
|
71
70
|
### Why Pydoll
|
|
72
71
|
|
|
73
|
-
- **
|
|
72
|
+
- **Structured extraction**: Define a [Pydantic](https://docs.pydantic.dev/) model, call `tab.extract()`, get typed and validated data back. No manual element-by-element querying.
|
|
74
73
|
- **Async and typed**: Built on `asyncio` from the ground up, 100% type-checked with `mypy`. Full IDE autocompletion and static error checking.
|
|
74
|
+
- **Stealth built in**: Human-like mouse movement, realistic typing, and granular [browser preference](https://pydoll.tech/docs/features/configuration/browser-preferences/) control for fingerprint management.
|
|
75
75
|
- **Network control**: [Intercept](https://pydoll.tech/docs/features/network/interception/) requests to block ads/trackers, [monitor](https://pydoll.tech/docs/features/network/monitoring/) traffic for API discovery, and make [authenticated HTTP requests](https://pydoll.tech/docs/features/network/http-requests/) that inherit the browser session.
|
|
76
76
|
- **Shadow DOM and iframes**: Full support for [shadow roots](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/) (including closed) and cross-origin iframes. Discover, query, and interact with elements inside them using the same API.
|
|
77
|
-
- **Ergonomic API**: `tab.find()` for most cases, `tab.query()` for complex [CSS/XPath selectors](https://pydoll.tech/docs/deep-dive/guides/selectors-guide/).
|
|
78
77
|
|
|
79
78
|
## Installation
|
|
80
79
|
|
|
@@ -84,55 +83,124 @@ pip install pydoll-python
|
|
|
84
83
|
|
|
85
84
|
No WebDriver binaries or external dependencies required.
|
|
86
85
|
|
|
87
|
-
##
|
|
86
|
+
## Getting Started
|
|
88
87
|
|
|
89
|
-
|
|
90
|
-
<summary><b>HAR Network Recording</b></summary>
|
|
91
|
-
<br>
|
|
88
|
+
### 1. Stateful Automation & Evasion
|
|
92
89
|
|
|
93
|
-
|
|
90
|
+
When you need to navigate, bypass challenges, or interact with dynamic UI, Pydoll's imperative API handles it with humanized timing by default.
|
|
94
91
|
|
|
95
92
|
```python
|
|
96
|
-
|
|
93
|
+
import asyncio
|
|
94
|
+
from pydoll.browser import Chrome
|
|
95
|
+
from pydoll.constants import Key
|
|
97
96
|
|
|
98
|
-
async
|
|
99
|
-
|
|
97
|
+
async def google_search(query: str):
|
|
98
|
+
async with Chrome() as browser:
|
|
99
|
+
tab = await browser.start()
|
|
100
|
+
await tab.go_to('https://www.google.com')
|
|
100
101
|
|
|
101
|
-
|
|
102
|
-
await tab.
|
|
102
|
+
# Find elements and interact with human-like timing
|
|
103
|
+
search_box = await tab.find(tag_name='textarea', name='q')
|
|
104
|
+
await search_box.insert_text(query)
|
|
105
|
+
await tab.keyboard.press(Key.ENTER)
|
|
103
106
|
|
|
104
|
-
|
|
105
|
-
|
|
107
|
+
first_result = await tab.find(
|
|
108
|
+
tag_name='h3',
|
|
109
|
+
text='autoscrape-labs/pydoll',
|
|
110
|
+
timeout=10,
|
|
111
|
+
)
|
|
112
|
+
await first_result.click()
|
|
113
|
+
print(f"Page loaded: {await tab.title}")
|
|
106
114
|
|
|
107
|
-
|
|
115
|
+
asyncio.run(google_search('pydoll site:github.com'))
|
|
108
116
|
```
|
|
109
117
|
|
|
110
|
-
|
|
118
|
+
### 2. Structured Data Extraction
|
|
119
|
+
|
|
120
|
+
Once you reach the target page, switch to the declarative engine. Define what you want with a model, and Pydoll extracts it — typed, validated, and ready to use.
|
|
111
121
|
|
|
112
122
|
```python
|
|
113
|
-
from pydoll.
|
|
123
|
+
from pydoll.browser.chromium import Chrome
|
|
124
|
+
from pydoll.extractor import ExtractionModel, Field
|
|
125
|
+
|
|
126
|
+
class Quote(ExtractionModel):
|
|
127
|
+
text: str = Field(selector='.text', description='The quote text')
|
|
128
|
+
author: str = Field(selector='.author', description='Who said it')
|
|
129
|
+
tags: list[str] = Field(selector='.tag', description='Tags')
|
|
130
|
+
year: int | None = Field(selector='.year', description='Year', default=None)
|
|
131
|
+
|
|
132
|
+
async def extract_quotes():
|
|
133
|
+
async with Chrome() as browser:
|
|
134
|
+
tab = await browser.start()
|
|
135
|
+
await tab.go_to('https://quotes.toscrape.com')
|
|
136
|
+
|
|
137
|
+
quotes = await tab.extract_all(Quote, scope='.quote', timeout=5)
|
|
114
138
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
)
|
|
118
|
-
|
|
139
|
+
for q in quotes:
|
|
140
|
+
print(f'{q.author}: {q.text}') # fully typed, IDE autocomplete works
|
|
141
|
+
print(q.tags) # list[str], not a raw element
|
|
142
|
+
print(q.model_dump_json()) # pydantic serialization built-in
|
|
143
|
+
|
|
144
|
+
asyncio.run(extract_quotes())
|
|
119
145
|
```
|
|
120
146
|
|
|
121
|
-
|
|
147
|
+
Models support CSS/XPath auto-detection, HTML attribute targeting, custom transforms, and nested models.
|
|
148
|
+
|
|
149
|
+
<details>
|
|
150
|
+
<summary><b>Nested models, transforms, and attribute extraction</b></summary>
|
|
151
|
+
<br>
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from datetime import datetime
|
|
155
|
+
from pydoll.extractor import ExtractionModel, Field
|
|
156
|
+
|
|
157
|
+
def parse_date(raw: str) -> datetime:
|
|
158
|
+
return datetime.strptime(raw.strip(), '%B %d, %Y')
|
|
159
|
+
|
|
160
|
+
class Author(ExtractionModel):
|
|
161
|
+
name: str = Field(selector='.author-title')
|
|
162
|
+
born: datetime = Field(
|
|
163
|
+
selector='.author-born-date',
|
|
164
|
+
transform=parse_date,
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
class Article(ExtractionModel):
|
|
168
|
+
title: str = Field(selector='h1')
|
|
169
|
+
url: str = Field(selector='.source-link', attribute='href')
|
|
170
|
+
author: Author = Field(selector='.author-card', description='Nested model')
|
|
171
|
+
|
|
172
|
+
article = await tab.extract(Article, timeout=5)
|
|
173
|
+
article.author.born.year # int — types are preserved all the way down
|
|
174
|
+
```
|
|
122
175
|
</details>
|
|
123
176
|
|
|
177
|
+
## Features
|
|
178
|
+
|
|
124
179
|
<details>
|
|
125
|
-
<summary><b>
|
|
180
|
+
<summary><b>Humanized Mouse Movement</b></summary>
|
|
126
181
|
<br>
|
|
127
182
|
|
|
128
|
-
|
|
183
|
+
Mouse operations produce human-like cursor movement by default:
|
|
184
|
+
|
|
185
|
+
- **Bezier curve paths** with asymmetric control points
|
|
186
|
+
- **Fitts's Law timing**: duration scales with distance
|
|
187
|
+
- **Minimum-jerk velocity**: bell-shaped speed profile
|
|
188
|
+
- **Physiological tremor**: Gaussian noise scaled with velocity
|
|
189
|
+
- **Overshoot correction**: ~70% chance on fast movements, then corrects back
|
|
129
190
|
|
|
130
191
|
```python
|
|
131
|
-
await tab.
|
|
132
|
-
await tab.
|
|
192
|
+
await tab.mouse.move(500, 300)
|
|
193
|
+
await tab.mouse.click(500, 300)
|
|
194
|
+
await tab.mouse.drag(100, 200, 500, 400)
|
|
195
|
+
|
|
196
|
+
button = await tab.find(id='submit')
|
|
197
|
+
await button.click()
|
|
198
|
+
|
|
199
|
+
# Opt out when speed matters
|
|
200
|
+
await tab.mouse.click(500, 300, humanize=False)
|
|
133
201
|
```
|
|
134
202
|
|
|
135
|
-
[
|
|
203
|
+
[Mouse Control Docs](https://pydoll.tech/docs/features/automation/mouse-control/)
|
|
136
204
|
</details>
|
|
137
205
|
|
|
138
206
|
<details>
|
|
@@ -161,75 +229,46 @@ Highlights:
|
|
|
161
229
|
- `deep=True` traverses cross-origin iframes (OOPIFs)
|
|
162
230
|
- Standard `find()`, `query()`, `click()` API inside shadow roots
|
|
163
231
|
|
|
164
|
-
```python
|
|
165
|
-
# Cloudflare Turnstile inside a cross-origin iframe
|
|
166
|
-
shadow_roots = await tab.find_shadow_roots(deep=True, timeout=10)
|
|
167
|
-
for sr in shadow_roots:
|
|
168
|
-
checkbox = await sr.query('input[type="checkbox"]', raise_exc=False)
|
|
169
|
-
if checkbox:
|
|
170
|
-
await checkbox.click()
|
|
171
|
-
```
|
|
172
|
-
|
|
173
232
|
[Shadow DOM Docs](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/)
|
|
174
233
|
</details>
|
|
175
234
|
|
|
176
235
|
<details>
|
|
177
|
-
<summary><b>
|
|
236
|
+
<summary><b>HAR Network Recording</b></summary>
|
|
178
237
|
<br>
|
|
179
238
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
- **Bezier curve paths** with asymmetric control points
|
|
183
|
-
- **Fitts's Law timing**: duration scales with distance
|
|
184
|
-
- **Minimum-jerk velocity**: bell-shaped speed profile
|
|
185
|
-
- **Physiological tremor**: Gaussian noise scaled with velocity
|
|
186
|
-
- **Overshoot correction**: ~70% chance on fast movements, then corrects back
|
|
239
|
+
Record network activity during a browser session and export as HAR 1.2. Replay recorded requests to reproduce exact API sequences.
|
|
187
240
|
|
|
188
241
|
```python
|
|
189
|
-
|
|
190
|
-
await tab.mouse.click(500, 300)
|
|
191
|
-
await tab.mouse.drag(100, 200, 500, 400)
|
|
192
|
-
|
|
193
|
-
button = await tab.find(id='submit')
|
|
194
|
-
await button.click()
|
|
195
|
-
|
|
196
|
-
# Opt out when speed matters
|
|
197
|
-
await tab.mouse.click(500, 300, humanize=False)
|
|
198
|
-
```
|
|
242
|
+
from pydoll.browser.chromium import Chrome
|
|
199
243
|
|
|
200
|
-
|
|
201
|
-
|
|
244
|
+
async with Chrome() as browser:
|
|
245
|
+
tab = await browser.start()
|
|
202
246
|
|
|
203
|
-
|
|
247
|
+
async with tab.request.record() as capture:
|
|
248
|
+
await tab.go_to('https://example.com')
|
|
204
249
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
from pydoll.browser import Chrome
|
|
208
|
-
from pydoll.constants import Key
|
|
250
|
+
capture.save('flow.har')
|
|
251
|
+
print(f'Captured {len(capture.entries)} requests')
|
|
209
252
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
tab = await browser.start()
|
|
213
|
-
await tab.go_to('https://www.google.com')
|
|
253
|
+
responses = await tab.request.replay('flow.har')
|
|
254
|
+
```
|
|
214
255
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
await tab.keyboard.press(Key.ENTER)
|
|
256
|
+
[HAR Recording Docs](https://pydoll.tech/docs/features/network/network-recording/)
|
|
257
|
+
</details>
|
|
218
258
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
timeout=10,
|
|
223
|
-
)
|
|
224
|
-
await first_result.click()
|
|
259
|
+
<details>
|
|
260
|
+
<summary><b>Page Bundles</b></summary>
|
|
261
|
+
<br>
|
|
225
262
|
|
|
226
|
-
|
|
227
|
-
print(f"Page loaded: {await tab.title}")
|
|
263
|
+
Save the current page and all its assets (CSS, JS, images, fonts) as a `.zip` bundle for offline viewing. Optionally inline everything into a single HTML file.
|
|
228
264
|
|
|
229
|
-
|
|
265
|
+
```python
|
|
266
|
+
await tab.save_bundle('page.zip')
|
|
267
|
+
await tab.save_bundle('page-inline.zip', inline_assets=True)
|
|
230
268
|
```
|
|
231
269
|
|
|
232
|
-
|
|
270
|
+
[Screenshots, PDFs & Bundles Docs](https://pydoll.tech/docs/features/automation/screenshots-and-pdfs/)
|
|
271
|
+
</details>
|
|
233
272
|
|
|
234
273
|
<details>
|
|
235
274
|
<summary><b>Hybrid Automation (UI + API)</b></summary>
|
|
@@ -22,39 +22,37 @@
|
|
|
22
22
|
<a href="#support">Support</a>
|
|
23
23
|
</p>
|
|
24
24
|
|
|
25
|
-
Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues
|
|
25
|
+
Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. **No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues.**
|
|
26
26
|
|
|
27
|
-
It combines a high-level API for
|
|
27
|
+
It combines a high-level API for stealthy automation with low-level CDP access for fine-grained control over network, fingerprinting, and browser behavior. And with its new **Pydantic-powered extraction engine**, it maps the DOM directly to structured Python objects, delivering an unmatched Developer Experience (DX).
|
|
28
28
|
|
|
29
|
-
### Sponsors
|
|
29
|
+
### Top Sponsors
|
|
30
30
|
|
|
31
|
-
<a href="https://
|
|
32
|
-
<img
|
|
31
|
+
<a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">
|
|
32
|
+
<img src="public/images/banner-the-webscraping-club.png" alt="The Web Scraping Club" />
|
|
33
33
|
</a>
|
|
34
34
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
**[Sign up through our link](https://www.thordata.com/?ls=github&lk=pydoll)** to support the project and get **1GB free** to get started.
|
|
38
|
-
|
|
39
|
-
---
|
|
40
|
-
|
|
41
|
-
<a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc">
|
|
42
|
-
<img alt="CapSolver" src="public/images/capsolver.jpeg" />
|
|
43
|
-
</a>
|
|
35
|
+
<sub>Read a full review of Pydoll on <b><a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">The Web Scraping Club</a></b>, the #1 newsletter dedicated to web scraping.</sub>
|
|
44
36
|
|
|
45
|
-
|
|
37
|
+
### Sponsors
|
|
46
38
|
|
|
47
|
-
|
|
39
|
+
<table>
|
|
40
|
+
<tr>
|
|
41
|
+
<td><a href="https://www.thordata.com/?ls=github&lk=pydoll"><img src="public/images/Thordata-logo.png" height="30" alt="Thordata" /></a></td>
|
|
42
|
+
<td><a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc"><img src="public/images/capsolver-logo.png" height="40" alt="CapSolver" /></a></td>
|
|
43
|
+
<td><a href="https://www.testmuai.com/?utm_medium=sponsor&utm_source=pydoll"><img src="public/images/logo-lamda-test.svg" height="30" width="130" alt="LambdaTest" /></a></td>
|
|
44
|
+
</tr>
|
|
45
|
+
</table>
|
|
48
46
|
|
|
49
|
-
|
|
47
|
+
<sub>[Learn more about our sponsors](SPONSORS.md) · [Become a sponsor](https://github.com/sponsors/thalissonvs)</sub>
|
|
50
48
|
|
|
51
49
|
### Why Pydoll
|
|
52
50
|
|
|
53
|
-
- **
|
|
51
|
+
- **Structured extraction**: Define a [Pydantic](https://docs.pydantic.dev/) model, call `tab.extract()`, get typed and validated data back. No manual element-by-element querying.
|
|
54
52
|
- **Async and typed**: Built on `asyncio` from the ground up, 100% type-checked with `mypy`. Full IDE autocompletion and static error checking.
|
|
53
|
+
- **Stealth built in**: Human-like mouse movement, realistic typing, and granular [browser preference](https://pydoll.tech/docs/features/configuration/browser-preferences/) control for fingerprint management.
|
|
55
54
|
- **Network control**: [Intercept](https://pydoll.tech/docs/features/network/interception/) requests to block ads/trackers, [monitor](https://pydoll.tech/docs/features/network/monitoring/) traffic for API discovery, and make [authenticated HTTP requests](https://pydoll.tech/docs/features/network/http-requests/) that inherit the browser session.
|
|
56
55
|
- **Shadow DOM and iframes**: Full support for [shadow roots](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/) (including closed) and cross-origin iframes. Discover, query, and interact with elements inside them using the same API.
|
|
57
|
-
- **Ergonomic API**: `tab.find()` for most cases, `tab.query()` for complex [CSS/XPath selectors](https://pydoll.tech/docs/deep-dive/guides/selectors-guide/).
|
|
58
56
|
|
|
59
57
|
## Installation
|
|
60
58
|
|
|
@@ -64,55 +62,124 @@ pip install pydoll-python
|
|
|
64
62
|
|
|
65
63
|
No WebDriver binaries or external dependencies required.
|
|
66
64
|
|
|
67
|
-
##
|
|
65
|
+
## Getting Started
|
|
68
66
|
|
|
69
|
-
|
|
70
|
-
<summary><b>HAR Network Recording</b></summary>
|
|
71
|
-
<br>
|
|
67
|
+
### 1. Stateful Automation & Evasion
|
|
72
68
|
|
|
73
|
-
|
|
69
|
+
When you need to navigate, bypass challenges, or interact with dynamic UI, Pydoll's imperative API handles it with humanized timing by default.
|
|
74
70
|
|
|
75
71
|
```python
|
|
76
|
-
|
|
72
|
+
import asyncio
|
|
73
|
+
from pydoll.browser import Chrome
|
|
74
|
+
from pydoll.constants import Key
|
|
77
75
|
|
|
78
|
-
async
|
|
79
|
-
|
|
76
|
+
async def google_search(query: str):
|
|
77
|
+
async with Chrome() as browser:
|
|
78
|
+
tab = await browser.start()
|
|
79
|
+
await tab.go_to('https://www.google.com')
|
|
80
80
|
|
|
81
|
-
|
|
82
|
-
await tab.
|
|
81
|
+
# Find elements and interact with human-like timing
|
|
82
|
+
search_box = await tab.find(tag_name='textarea', name='q')
|
|
83
|
+
await search_box.insert_text(query)
|
|
84
|
+
await tab.keyboard.press(Key.ENTER)
|
|
83
85
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
+
first_result = await tab.find(
|
|
87
|
+
tag_name='h3',
|
|
88
|
+
text='autoscrape-labs/pydoll',
|
|
89
|
+
timeout=10,
|
|
90
|
+
)
|
|
91
|
+
await first_result.click()
|
|
92
|
+
print(f"Page loaded: {await tab.title}")
|
|
86
93
|
|
|
87
|
-
|
|
94
|
+
asyncio.run(google_search('pydoll site:github.com'))
|
|
88
95
|
```
|
|
89
96
|
|
|
90
|
-
|
|
97
|
+
### 2. Structured Data Extraction
|
|
98
|
+
|
|
99
|
+
Once you reach the target page, switch to the declarative engine. Define what you want with a model, and Pydoll extracts it — typed, validated, and ready to use.
|
|
91
100
|
|
|
92
101
|
```python
|
|
93
|
-
from pydoll.
|
|
102
|
+
from pydoll.browser.chromium import Chrome
|
|
103
|
+
from pydoll.extractor import ExtractionModel, Field
|
|
104
|
+
|
|
105
|
+
class Quote(ExtractionModel):
|
|
106
|
+
text: str = Field(selector='.text', description='The quote text')
|
|
107
|
+
author: str = Field(selector='.author', description='Who said it')
|
|
108
|
+
tags: list[str] = Field(selector='.tag', description='Tags')
|
|
109
|
+
year: int | None = Field(selector='.year', description='Year', default=None)
|
|
110
|
+
|
|
111
|
+
async def extract_quotes():
|
|
112
|
+
async with Chrome() as browser:
|
|
113
|
+
tab = await browser.start()
|
|
114
|
+
await tab.go_to('https://quotes.toscrape.com')
|
|
115
|
+
|
|
116
|
+
quotes = await tab.extract_all(Quote, scope='.quote', timeout=5)
|
|
94
117
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
)
|
|
98
|
-
|
|
118
|
+
for q in quotes:
|
|
119
|
+
print(f'{q.author}: {q.text}') # fully typed, IDE autocomplete works
|
|
120
|
+
print(q.tags) # list[str], not a raw element
|
|
121
|
+
print(q.model_dump_json()) # pydantic serialization built-in
|
|
122
|
+
|
|
123
|
+
asyncio.run(extract_quotes())
|
|
99
124
|
```
|
|
100
125
|
|
|
101
|
-
|
|
126
|
+
Models support CSS/XPath auto-detection, HTML attribute targeting, custom transforms, and nested models.
|
|
127
|
+
|
|
128
|
+
<details>
|
|
129
|
+
<summary><b>Nested models, transforms, and attribute extraction</b></summary>
|
|
130
|
+
<br>
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from datetime import datetime
|
|
134
|
+
from pydoll.extractor import ExtractionModel, Field
|
|
135
|
+
|
|
136
|
+
def parse_date(raw: str) -> datetime:
|
|
137
|
+
return datetime.strptime(raw.strip(), '%B %d, %Y')
|
|
138
|
+
|
|
139
|
+
class Author(ExtractionModel):
|
|
140
|
+
name: str = Field(selector='.author-title')
|
|
141
|
+
born: datetime = Field(
|
|
142
|
+
selector='.author-born-date',
|
|
143
|
+
transform=parse_date,
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
class Article(ExtractionModel):
|
|
147
|
+
title: str = Field(selector='h1')
|
|
148
|
+
url: str = Field(selector='.source-link', attribute='href')
|
|
149
|
+
author: Author = Field(selector='.author-card', description='Nested model')
|
|
150
|
+
|
|
151
|
+
article = await tab.extract(Article, timeout=5)
|
|
152
|
+
article.author.born.year # int — types are preserved all the way down
|
|
153
|
+
```
|
|
102
154
|
</details>
|
|
103
155
|
|
|
156
|
+
## Features
|
|
157
|
+
|
|
104
158
|
<details>
|
|
105
|
-
<summary><b>
|
|
159
|
+
<summary><b>Humanized Mouse Movement</b></summary>
|
|
106
160
|
<br>
|
|
107
161
|
|
|
108
|
-
|
|
162
|
+
Mouse operations produce human-like cursor movement by default:
|
|
163
|
+
|
|
164
|
+
- **Bezier curve paths** with asymmetric control points
|
|
165
|
+
- **Fitts's Law timing**: duration scales with distance
|
|
166
|
+
- **Minimum-jerk velocity**: bell-shaped speed profile
|
|
167
|
+
- **Physiological tremor**: Gaussian noise scaled with velocity
|
|
168
|
+
- **Overshoot correction**: ~70% chance of overshooting on fast movements, followed by a correction back to the target
|
|
109
169
|
|
|
110
170
|
```python
|
|
111
|
-
await tab.
|
|
112
|
-
await tab.
|
|
171
|
+
await tab.mouse.move(500, 300)
|
|
172
|
+
await tab.mouse.click(500, 300)
|
|
173
|
+
await tab.mouse.drag(100, 200, 500, 400)
|
|
174
|
+
|
|
175
|
+
button = await tab.find(id='submit')
|
|
176
|
+
await button.click()
|
|
177
|
+
|
|
178
|
+
# Opt out when speed matters
|
|
179
|
+
await tab.mouse.click(500, 300, humanize=False)
|
|
113
180
|
```
|
|
114
181
|
|
|
115
|
-
[
|
|
182
|
+
[Mouse Control Docs](https://pydoll.tech/docs/features/automation/mouse-control/)
|
|
116
183
|
</details>
|
|
117
184
|
|
|
118
185
|
<details>
|
|
@@ -141,75 +208,46 @@ Highlights:
|
|
|
141
208
|
- `deep=True` traverses cross-origin iframes (OOPIFs)
|
|
142
209
|
- Standard `find()`, `query()`, `click()` API inside shadow roots
|
|
143
210
|
|
|
144
|
-
```python
|
|
145
|
-
# Cloudflare Turnstile inside a cross-origin iframe
|
|
146
|
-
shadow_roots = await tab.find_shadow_roots(deep=True, timeout=10)
|
|
147
|
-
for sr in shadow_roots:
|
|
148
|
-
checkbox = await sr.query('input[type="checkbox"]', raise_exc=False)
|
|
149
|
-
if checkbox:
|
|
150
|
-
await checkbox.click()
|
|
151
|
-
```
|
|
152
|
-
|
|
153
211
|
[Shadow DOM Docs](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/)
|
|
154
212
|
</details>
|
|
155
213
|
|
|
156
214
|
<details>
|
|
157
|
-
<summary><b>
|
|
215
|
+
<summary><b>HAR Network Recording</b></summary>
|
|
158
216
|
<br>
|
|
159
217
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
- **Bezier curve paths** with asymmetric control points
|
|
163
|
-
- **Fitts's Law timing**: duration scales with distance
|
|
164
|
-
- **Minimum-jerk velocity**: bell-shaped speed profile
|
|
165
|
-
- **Physiological tremor**: Gaussian noise scaled with velocity
|
|
166
|
-
- **Overshoot correction**: ~70% chance on fast movements, then corrects back
|
|
218
|
+
Record network activity during a browser session and export it as a HAR 1.2 file. Replay the recorded requests to reproduce exact API sequences.
|
|
167
219
|
|
|
168
220
|
```python
|
|
169
|
-
|
|
170
|
-
await tab.mouse.click(500, 300)
|
|
171
|
-
await tab.mouse.drag(100, 200, 500, 400)
|
|
172
|
-
|
|
173
|
-
button = await tab.find(id='submit')
|
|
174
|
-
await button.click()
|
|
175
|
-
|
|
176
|
-
# Opt out when speed matters
|
|
177
|
-
await tab.mouse.click(500, 300, humanize=False)
|
|
178
|
-
```
|
|
221
|
+
from pydoll.browser.chromium import Chrome
|
|
179
222
|
|
|
180
|
-
|
|
181
|
-
|
|
223
|
+
async with Chrome() as browser:
|
|
224
|
+
tab = await browser.start()
|
|
182
225
|
|
|
183
|
-
|
|
226
|
+
async with tab.request.record() as capture:
|
|
227
|
+
await tab.go_to('https://example.com')
|
|
184
228
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
from pydoll.browser import Chrome
|
|
188
|
-
from pydoll.constants import Key
|
|
229
|
+
capture.save('flow.har')
|
|
230
|
+
print(f'Captured {len(capture.entries)} requests')
|
|
189
231
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
tab = await browser.start()
|
|
193
|
-
await tab.go_to('https://www.google.com')
|
|
232
|
+
responses = await tab.request.replay('flow.har')
|
|
233
|
+
```
|
|
194
234
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
await tab.keyboard.press(Key.ENTER)
|
|
235
|
+
[HAR Recording Docs](https://pydoll.tech/docs/features/network/network-recording/)
|
|
236
|
+
</details>
|
|
198
237
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
timeout=10,
|
|
203
|
-
)
|
|
204
|
-
await first_result.click()
|
|
238
|
+
<details>
|
|
239
|
+
<summary><b>Page Bundles</b></summary>
|
|
240
|
+
<br>
|
|
205
241
|
|
|
206
|
-
|
|
207
|
-
print(f"Page loaded: {await tab.title}")
|
|
242
|
+
Save the current page and all its assets (CSS, JS, images, fonts) as a `.zip` bundle for offline viewing. Optionally inline everything into a single HTML file.
|
|
208
243
|
|
|
209
|
-
|
|
244
|
+
```python
|
|
245
|
+
await tab.save_bundle('page.zip')
|
|
246
|
+
await tab.save_bundle('page-inline.zip', inline_assets=True)
|
|
210
247
|
```
|
|
211
248
|
|
|
212
|
-
|
|
249
|
+
[Screenshots, PDFs & Bundles Docs](https://pydoll.tech/docs/features/automation/screenshots-and-pdfs/)
|
|
250
|
+
</details>
|
|
213
251
|
|
|
214
252
|
<details>
|
|
215
253
|
<summary><b>Hybrid Automation (UI + API)</b></summary>
|