pydoll-python 2.12.0__tar.gz → 2.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydoll_python-2.12.1/PKG-INFO +297 -0
- pydoll_python-2.12.1/README.md +276 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/chromium/base.py +2 -1
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/managers/temp_dir_manager.py +45 -3
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/tab.py +10 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/constants.py +36 -4
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/elements/mixins/find_elements_mixin.py +124 -22
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/elements/web_element.py +578 -14
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/base.py +5 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/page/methods.py +7 -7
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/utils.py +28 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pyproject.toml +1 -1
- pydoll_python-2.12.0/PKG-INFO +0 -635
- pydoll_python-2.12.0/README.md +0 -614
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/LICENSE +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/chromium/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/chromium/chrome.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/chromium/edge.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/interfaces.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/managers/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/managers/browser_options_manager.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/managers/browser_process_manager.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/managers/proxy_manager.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/options.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/requests/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/requests/request.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/browser/requests/response.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/browser_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/dom_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/fetch_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/input_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/network_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/page_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/runtime_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/storage_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/commands/target_commands.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/connection/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/connection/connection_handler.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/connection/managers/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/connection/managers/commands_manager.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/connection/managers/events_manager.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/decorators.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/elements/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/elements/mixins/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/exceptions.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/interactions/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/interactions/keyboard.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/interactions/scroll.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/browser/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/browser/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/browser/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/browser/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/debugger/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/dom/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/dom/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/dom/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/dom/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/emulation/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/fetch/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/fetch/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/fetch/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/fetch/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/input/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/input/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/input/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/input/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/io/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/network/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/network/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/network/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/network/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/page/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/page/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/page/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/runtime/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/runtime/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/runtime/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/runtime/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/security/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/storage/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/storage/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/storage/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/storage/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/target/__init__.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/target/events.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/target/methods.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/protocol/target/types.py +0 -0
- {pydoll_python-2.12.0 → pydoll_python-2.12.1}/pydoll/py.typed +0 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pydoll-python
|
|
3
|
+
Version: 2.12.1
|
|
4
|
+
Summary: Pydoll is a library for automating chromium-based browsers without a WebDriver, offering realistic interactions.
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Author: Thalison Fernandes
|
|
7
|
+
Author-email: thalissfernandes99@gmail.com
|
|
8
|
+
Requires-Python: >=3.10,<4.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Requires-Dist: aiofiles (>=25.1.0,<26.0.0)
|
|
16
|
+
Requires-Dist: aiohttp (>=3.9.5,<4.0.0)
|
|
17
|
+
Requires-Dist: typing_extensions (>=4.14.0,<5.0.0)
|
|
18
|
+
Requires-Dist: websockets (>=14,<15)
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<img src="https://github.com/user-attachments/assets/2c380638-b04a-4b04-b1c8-2958e4237a94" alt="Pydoll Logo" /> <br>
|
|
23
|
+
</p>
|
|
24
|
+
<h1 align="center">Pydoll: The Evasion-First Web Automation Framework</h1> <p align="center"> A 100% Typed, <b><code>async</code>-native</b> automation library built for modern bot evasion and high-performance scraping. </p>
|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<a href="https://github.com/autoscrape-labs/pydoll/stargazers"><img src="https://img.shields.io/github/stars/autoscrape-labs/pydoll?style=social"></a>
|
|
28
|
+
<a href="https://codecov.io/gh/autoscrape-labs/pydoll" >
|
|
29
|
+
<img src="https://codecov.io/gh/autoscrape-labs/pydoll/graph/badge.svg?token=40I938OGM9"/>
|
|
30
|
+
</a>
|
|
31
|
+
<img src="https://github.com/autoscrape-labs/pydoll/actions/workflows/tests.yml/badge.svg" alt="Tests">
|
|
32
|
+
<img src="https://github.com/autoscrape-labs/pydoll/actions/workflows/ruff-ci.yml/badge.svg" alt="Ruff CI">
|
|
33
|
+
<img src="https://github.com/autoscrape-labs/pydoll/actions/workflows/mypy.yml/badge.svg" alt="MyPy CI">
|
|
34
|
+
<img src="https://img.shields.io/badge/python-%3E%3D3.10-blue" alt="Python >= 3.10">
|
|
35
|
+
<a href="https://deepwiki.com/autoscrape-labs/pydoll"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
<p align="center"> 📖 <a href="https://pydoll.tech/">Full Documentation</a> • 🚀 <a href="#-getting-started-in-60-seconds">Getting Started</a> • ⚡ <a href="#-the-pydoll-feature-ecosystem">Advanced Features</a> • 🧠 <a href="#-more-than-an-api-a-knowledge-base">Deep Dives</a> • 💖 <a href="#-support-this-project">Support This Project</a> </p>
|
|
40
|
+
|
|
41
|
+
Pydoll is built on a simple philosophy: powerful automation shouldn't require you to fight the browser.
|
|
42
|
+
|
|
43
|
+
Forget broken `webdrivers`, compatibility issues, or being blocked by `navigator.webdriver=true`. Pydoll connects directly to the Chrome DevTools Protocol (CDP), providing a natively asynchronous, robust, and **fully typed** architecture.
|
|
44
|
+
|
|
45
|
+
It's designed for modern scraping, combining an **intuitive high-level API** (for productivity) with **deep-level control** over the network and browser behavior (for evasion), allowing you to bypass complex anti-bot defenses.
|
|
46
|
+
|
|
47
|
+
### The Pydoll Philosophy
|
|
48
|
+
|
|
49
|
+
* **Stealth-by-Design:** Pydoll is built for evasion. Our [human-like interactions](https://pydoll.tech/docs/features/automation/human-interactions/) simulate real user clicks, typing, and scrolling to pass behavioral analysis, while granular [Browser Preferences](https://pydoll.tech/docs/features/configuration/browser-preferences/) control lets you patch your browser fingerprint.
|
|
50
|
+
* **Async & Typed Architecture:** Built from the ground up on `asyncio` and **100% type-checked** with `mypy`. This means top-tier I/O performance for concurrent tasks and a fantastic Developer Experience (DX) with autocompletion and error-checking in your IDE.
|
|
51
|
+
* **Total Network Control:** Go beyond basic HTTP proxies. Pydoll gives you tools to [intercept](https://pydoll.tech/docs/features/network/interception/) (to block ads/trackers) and [monitor](https://pydoll.tech/docs/features/network/monitoring/) traffic, plus [deep documentation](https://pydoll.tech/docs/deep-dive/network/socks-proxies/) on why SOCKS5 is essential to prevent DNS leaks.
|
|
52
|
+
* **Hybrid Automation (The Game-Changer):** Use the UI automation to log in, then use `tab.request` to make blazing-fast API calls that [inherit the entire browser session](https://pydoll.tech/docs/features/network/http-requests/).
|
|
53
|
+
* **Ergonomics Meets Power:** Easy for the simple, powerful for the complex. Use `tab.find()` for 90% of cases and `tab.query()` for complex [CSS/XPath selectors](https://pydoll.tech/docs/deep-dive/guides/selectors-guide/).
|
|
54
|
+
|
|
55
|
+
## 📦 Installation
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install pydoll-python
|
|
59
|
+
```
|
|
60
|
+
That's it. No `webdrivers`. No external dependencies.
|
|
61
|
+
|
|
62
|
+
## 🚀 Getting Started in 60 Seconds
|
|
63
|
+
|
|
64
|
+
Thanks to its `async` architecture and context managers, Pydoll is clean and efficient.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import asyncio
|
|
68
|
+
from pydoll.browser import Chrome
|
|
69
|
+
from pydoll.constants import Key
|
|
70
|
+
|
|
71
|
+
async def google_search(query: str):
|
|
72
|
+
# Context manager handles browser cleanup (stop()); start() is still called explicitly
|
|
73
|
+
async with Chrome() as browser:
|
|
74
|
+
tab = await browser.start()
|
|
75
|
+
await tab.go_to('https://www.google.com')
|
|
76
|
+
|
|
77
|
+
# Intuitive finding API: find by HTML attributes
|
|
78
|
+
search_box = await tab.find(tag_name='textarea', name='q')
|
|
79
|
+
|
|
80
|
+
# "Human-like" interactions simulate typing
|
|
81
|
+
await search_box.insert_text(query)
|
|
82
|
+
await search_box.press_keyboard_key(Key.ENTER)
|
|
83
|
+
|
|
84
|
+
# Find by text and click (simulates mouse movement)
|
|
85
|
+
first_result = await tab.find(
|
|
86
|
+
tag_name='h3',
|
|
87
|
+
text='autoscrape-labs/pydoll', # Supports partial text matching
|
|
88
|
+
timeout=10,
|
|
89
|
+
)
|
|
90
|
+
await first_result.click()
|
|
91
|
+
|
|
92
|
+
# Wait for an element to confirm navigation
|
|
93
|
+
await tab.find(id='repository-container-header', timeout=10)
|
|
94
|
+
print(f"Page loaded: {await tab.title}")
|
|
95
|
+
|
|
96
|
+
asyncio.run(google_search('pydoll python'))
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## ⚡ The Pydoll Feature Ecosystem
|
|
100
|
+
|
|
101
|
+
Pydoll is a complete toolkit for professional automation.
|
|
102
|
+
|
|
103
|
+
<details>
|
|
104
|
+
<summary><b>1. Hybrid Automation (UI + API): The Game-Changer</b></summary>
|
|
105
|
+
<br>
|
|
106
|
+
|
|
107
|
+
Tired of manually extracting and managing cookies to use `requests` or `httpx`? Pydoll solves this.
|
|
108
|
+
|
|
109
|
+
Use the UI automation to pass a complex login (with CAPTCHAs, JS challenges, etc.) and then use `tab.request` to make **authenticated** API calls that automatically inherit all cookies, headers, and session state from the browser. It's the best of both worlds: the robustness of UI automation for auth, and the speed of direct API calls for data extraction.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
# 1. Log in via the UI (handles CAPTCHAs, JS, etc.)
|
|
113
|
+
await tab.go_to('https://my-site.com/login')
|
|
114
|
+
await (await tab.find(id='username')).type_text('user')
|
|
115
|
+
await (await tab.find(id='password')).type_text('pass123')
|
|
116
|
+
await (await tab.find(id='login-btn')).click()
|
|
117
|
+
|
|
118
|
+
# 2. Now, use the browser's session to hit the API!
|
|
119
|
+
# This request automatically INHERITS the login cookies
|
|
120
|
+
response = await tab.request.get('https://my-site.com/api/user/profile')
|
|
121
|
+
user_data = response.json()
|
|
122
|
+
print(f"Welcome, {user_data['name']}!")
|
|
123
|
+
```
|
|
124
|
+
[**📖 Read more about Hybrid Automation**](https://pydoll.tech/docs/features/network/http-requests/)
|
|
125
|
+
</details>
|
|
126
|
+
|
|
127
|
+
<details>
|
|
128
|
+
<summary><b>2. Total Network Control: Monitor & Intercept Traffic</b></summary>
|
|
129
|
+
<br>
|
|
130
|
+
|
|
131
|
+
Take full control of the network stack. Pydoll allows you to not only **monitor** traffic for reverse-engineering APIs but also to **intercept** requests in real-time.
|
|
132
|
+
|
|
133
|
+
Use this to block ads, trackers, images, or CSS to dramatically speed up your scraping and save bandwidth, or even to modify headers and mock API responses for testing.
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import asyncio
|
|
137
|
+
from pydoll.browser.chromium import Chrome
|
|
138
|
+
from pydoll.protocol.fetch.events import FetchEvent, RequestPausedEvent
|
|
139
|
+
from pydoll.protocol.network.types import ErrorReason
|
|
140
|
+
|
|
141
|
+
async def block_images():
|
|
142
|
+
async with Chrome() as browser:
|
|
143
|
+
tab = await browser.start()
|
|
144
|
+
|
|
145
|
+
async def block_resource(event: RequestPausedEvent):
|
|
146
|
+
request_id = event['params']['requestId']
|
|
147
|
+
resource_type = event['params']['resourceType']
|
|
148
|
+
url = event['params']['request']['url']
|
|
149
|
+
|
|
150
|
+
# Block images and stylesheets
|
|
151
|
+
if resource_type in ['Image', 'Stylesheet']:
|
|
152
|
+
await tab.fail_request(request_id, ErrorReason.BLOCKED_BY_CLIENT)
|
|
153
|
+
else:
|
|
154
|
+
# Continue other requests
|
|
155
|
+
await tab.continue_request(request_id)
|
|
156
|
+
|
|
157
|
+
await tab.enable_fetch_events()
|
|
158
|
+
await tab.on(FetchEvent.REQUEST_PAUSED, block_resource)
|
|
159
|
+
|
|
160
|
+
await tab.go_to('https://example.com')
|
|
161
|
+
await asyncio.sleep(3)
|
|
162
|
+
await tab.disable_fetch_events()
|
|
163
|
+
|
|
164
|
+
asyncio.run(block_images())
|
|
165
|
+
```
|
|
166
|
+
[**📖 Network Monitoring Docs**](https://pydoll.tech/docs/features/network/monitoring/) | [**📖 Request Interception Docs**](https://pydoll.tech/docs/features/network/interception/)
|
|
167
|
+
</details>
|
|
168
|
+
|
|
169
|
+
<details>
|
|
170
|
+
<summary><b>3. Deep Browser Control: The Fingerprint Evasion Manual</b></summary>
|
|
171
|
+
<br>
|
|
172
|
+
|
|
173
|
+
A `User-Agent` isn't enough. Pydoll gives you granular control over [Browser Preferences](https://pydoll.tech/docs/features/configuration/browser-preferences/), allowing you to modify hundreds of internal Chrome settings to build a robust and consistent fingerprint.
|
|
174
|
+
|
|
175
|
+
Our documentation doesn't just give you the tool; it [explains in detail](https://pydoll.tech/docs/deep-dive/fingerprinting/browser-fingerprinting/) how `canvas`, WebGL, font, and timezone fingerprinting works, and how to use these preferences to defend your automation.
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
options = ChromiumOptions()
|
|
179
|
+
|
|
180
|
+
# Create a realistic and clean browser profile
|
|
181
|
+
options.browser_preferences = {
|
|
182
|
+
'profile': {
|
|
183
|
+
'default_content_setting_values': {
|
|
184
|
+
'notifications': 2, # Block notification popups
|
|
185
|
+
'geolocation': 2, # Block location requests
|
|
186
|
+
},
|
|
187
|
+
'password_manager_enabled': False # Disable "save password" prompt
|
|
188
|
+
},
|
|
189
|
+
'intl': {
|
|
190
|
+
'accept_languages': 'en-US,en', # Make consistent with your proxy IP
|
|
191
|
+
},
|
|
192
|
+
'browser': {
|
|
193
|
+
'check_default_browser': False, # Don't ask to be default browser
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
```
|
|
197
|
+
[**📖 Full Guide to Browser Preferences**](https://pydoll.tech/docs/features/configuration/browser-preferences/)
|
|
198
|
+
</details>
|
|
199
|
+
|
|
200
|
+
<details>
|
|
201
|
+
<summary><b>4. Built for Scale: Concurrency, Contexts & Remote Connections</b></summary>
|
|
202
|
+
<br>
|
|
203
|
+
|
|
204
|
+
Pydoll is built for scale. Its `async` architecture allows you to manage [multiple tabs](https://pydoll.tech/docs/features/browser-management/tabs/) and [browser contexts](https://pydoll.tech/docs/features/browser-management/contexts/) (isolated sessions) in parallel.
|
|
205
|
+
|
|
206
|
+
Furthermore, Pydoll excels in production architectures. You can run your browser in a Docker container and **connect to it remotely** from your Python script, decoupling the controller from the worker. Our documentation includes guides on [how to set up your own remote server](https://pydoll.tech/docs/features/advanced/remote-connections/).
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
# Example: Scrape 2 sites in parallel
|
|
210
|
+
|
|
211
|
+
async def scrape_page(url, tab):
|
|
212
|
+
await tab.go_to(url)
|
|
213
|
+
return await tab.title
|
|
214
|
+
|
|
215
|
+
async def concurrent_scraping():
|
|
216
|
+
async with Chrome() as browser:
|
|
217
|
+
tab_google = await browser.start()
|
|
218
|
+
tab_ddg = await browser.new_tab() # Create a new tab
|
|
219
|
+
|
|
220
|
+
# Execute both scraping tasks concurrently
|
|
221
|
+
tasks = [
|
|
222
|
+
scrape_page('https://google.com/', tab_google),
|
|
223
|
+
scrape_page('https://duckduckgo.com/', tab_ddg)
|
|
224
|
+
]
|
|
225
|
+
results = await asyncio.gather(*tasks)
|
|
226
|
+
print(results)
|
|
227
|
+
```
|
|
228
|
+
[**📖 Multi-Tab Management Docs**](https://pydoll.tech/docs/features/browser-management/tabs/) | [**📖 Remote Connection Docs**](https://pydoll.tech/docs/features/advanced/remote-connections/)
|
|
229
|
+
</details>
|
|
230
|
+
|
|
231
|
+
<details>
|
|
232
|
+
<summary><b>5. Robust Engineering: `@retry` Decorator & 100% Typed</b></summary>
|
|
233
|
+
<br>
|
|
234
|
+
|
|
235
|
+
**Reliable Engineering:** Pydoll is **fully typed**, providing a fantastic Developer Experience (DX) with full autocompletion in your IDE and error-checking before you even run your code. [Read about our Type System](https://pydoll.tech/docs/deep-dive/fundamentals/typing-system/).
|
|
236
|
+
|
|
237
|
+
**Robust-by-Design:** The `@retry` decorator turns fragile scripts into production-ready automations. It doesn't just "try again"; it lets you execute custom **recovery logic** (`on_retry`), like refreshing the page or rotating a proxy, before the next attempt.
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
from pydoll.decorators import retry
|
|
241
|
+
from pydoll.exceptions import ElementNotFound, NetworkError
|
|
242
|
+
|
|
243
|
+
@retry(
|
|
244
|
+
max_retries=3,
|
|
245
|
+
exceptions=[ElementNotFound, NetworkError], # Only retry on these specific errors
|
|
246
|
+
on_retry=my_recovery_function, # Run your custom recovery logic
|
|
247
|
+
exponential_backoff=True # Wait 2s, 4s, 8s...
|
|
248
|
+
)
|
|
249
|
+
async def scrape_product(self, url: str):
|
|
250
|
+
# ... your scraping logic ...
|
|
251
|
+
```
|
|
252
|
+
[**📖 `@retry` Decorator Docs**](https://pydoll.tech/docs/features/advanced/decorators/)
|
|
253
|
+
</details>
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## 🧠 More Than an API: A Knowledge Base
|
|
258
|
+
|
|
259
|
+
Pydoll is not a black box. We believe that to defeat anti-bot systems, you must understand them. Our documentation is one of the most comprehensive public resources on the subject, teaching you not just the "how," but the "why."
|
|
260
|
+
|
|
261
|
+
### 1. The Battle Against Fingerprinting (Strategic Guide)
|
|
262
|
+
Understand how bots are detected and how Pydoll is designed to win.
|
|
263
|
+
* **[Evasion Techniques Guide](https://pydoll.tech/docs/deep-dive/fingerprinting/evasion-techniques/)**: Our unified 3-layer evasion strategy.
|
|
264
|
+
* **[Network Fingerprinting](https://pydoll.tech/docs/deep-dive/fingerprinting/network-fingerprinting/)**: How your IP address, TTL, and TLS (JA3) fingerprint give you away.
|
|
265
|
+
* **[Browser Fingerprinting](https://pydoll.tech/docs/deep-dive/fingerprinting/browser-fingerprinting/)**: How `canvas`, WebGL, and fonts create your unique ID.
|
|
266
|
+
* **[Behavioral Fingerprinting](https://pydoll.tech/docs/deep-dive/fingerprinting/behavioral-fingerprinting/)**: Why mouse/keyboard telemetry is the new front line of detection.
|
|
267
|
+
|
|
268
|
+
### 2. The Advanced Networking Manual (The Foundation)
|
|
269
|
+
Proxies are more than just `--proxy-server`.
|
|
270
|
+
* **[HTTP vs. SOCKS5](https://pydoll.tech/docs/deep-dive/network/socks-proxies/)**: Why SOCKS5 is superior (it solves DNS leaks).
|
|
271
|
+
* **[Proxy Detection](https://pydoll.tech/docs/deep-dive/network/proxy-detection/)**: How sites know you're using a proxy (WebRTC Leaks).
|
|
272
|
+
* **[Build Your Own Proxy](https://pydoll.tech/docs/deep-dive/network/build-proxy/)**: Yes, we even teach you how to build your own SOCKS5 proxy server in Python.
|
|
273
|
+
|
|
274
|
+
### 3. Transparent Architecture (Software Engineering)
|
|
275
|
+
Software engineering you can trust.
|
|
276
|
+
* **[Domain-Driven Design (OOP)](https://pydoll.tech/docs/deep-dive/architecture/browser-domain/)**: The clean architecture behind `Browser`, `Tab`, and `WebElement`.
|
|
277
|
+
* **[The FindElements Mixin](https://pydoll.tech/docs/deep-dive/architecture/find-elements-mixin/)**: The magic behind the intuitive `find()` API.
|
|
278
|
+
* **[The Connection Layer](https://pydoll.tech/docs/deep-dive/fundamentals/connection-layer/)**: How Pydoll manages `asyncio` and the CDP.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## 🤝 Contributing
|
|
283
|
+
|
|
284
|
+
We would love your help to make Pydoll even better! Check out our [contribution guidelines](CONTRIBUTING.md) to get started.
|
|
285
|
+
|
|
286
|
+
## 💖 Support This Project
|
|
287
|
+
|
|
288
|
+
If you find Pydoll useful, consider [sponsoring my work on GitHub](https://github.com/sponsors/thalissonvs). Every contribution helps keep the project alive and drives new features!
|
|
289
|
+
|
|
290
|
+
## 📄 License
|
|
291
|
+
|
|
292
|
+
Pydoll is licensed under the [MIT License](LICENSE).
|
|
293
|
+
|
|
294
|
+
<p align="center">
|
|
295
|
+
<b>Pydoll</b> — Web automation, taken seriously.
|
|
296
|
+
</p>
|
|
297
|
+
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://github.com/user-attachments/assets/2c380638-b04a-4b04-b1c8-2958e4237a94" alt="Pydoll Logo" /> <br>
|
|
3
|
+
</p>
|
|
4
|
+
<h1 align="center">Pydoll: The Evasion-First Web Automation Framework</h1> <p align="center"> A 100% Typed, <b><code>async</code>-native</b> automation library built for modern bot evasion and high-performance scraping. </p>
|
|
5
|
+
|
|
6
|
+
<p align="center">
|
|
7
|
+
<a href="https://github.com/autoscrape-labs/pydoll/stargazers"><img src="https://img.shields.io/github/stars/autoscrape-labs/pydoll?style=social"></a>
|
|
8
|
+
<a href="https://codecov.io/gh/autoscrape-labs/pydoll" >
|
|
9
|
+
<img src="https://codecov.io/gh/autoscrape-labs/pydoll/graph/badge.svg?token=40I938OGM9"/>
|
|
10
|
+
</a>
|
|
11
|
+
<img src="https://github.com/autoscrape-labs/pydoll/actions/workflows/tests.yml/badge.svg" alt="Tests">
|
|
12
|
+
<img src="https://github.com/autoscrape-labs/pydoll/actions/workflows/ruff-ci.yml/badge.svg" alt="Ruff CI">
|
|
13
|
+
<img src="https://github.com/autoscrape-labs/pydoll/actions/workflows/mypy.yml/badge.svg" alt="MyPy CI">
|
|
14
|
+
<img src="https://img.shields.io/badge/python-%3E%3D3.10-blue" alt="Python >= 3.10">
|
|
15
|
+
<a href="https://deepwiki.com/autoscrape-labs/pydoll"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
<p align="center"> 📖 <a href="https://pydoll.tech/">Full Documentation</a> • 🚀 <a href="#-getting-started-in-60-seconds">Getting Started</a> • ⚡ <a href="#-the-pydoll-feature-ecosystem">Advanced Features</a> • 🧠 <a href="#-more-than-an-api-a-knowledge-base">Deep Dives</a> • 💖 <a href="#-support-this-project">Support This Project</a> </p>
|
|
20
|
+
|
|
21
|
+
Pydoll is built on a simple philosophy: powerful automation shouldn't require you to fight the browser.
|
|
22
|
+
|
|
23
|
+
Forget broken `webdrivers`, compatibility issues, or being blocked by `navigator.webdriver=true`. Pydoll connects directly to the Chrome DevTools Protocol (CDP), providing a natively asynchronous, robust, and **fully typed** architecture.
|
|
24
|
+
|
|
25
|
+
It's designed for modern scraping, combining an **intuitive high-level API** (for productivity) with **deep-level control** over the network and browser behavior (for evasion), allowing you to bypass complex anti-bot defenses.
|
|
26
|
+
|
|
27
|
+
### The Pydoll Philosophy
|
|
28
|
+
|
|
29
|
+
* **Stealth-by-Design:** Pydoll is built for evasion. Our [human-like interactions](https://pydoll.tech/docs/features/automation/human-interactions/) simulate real user clicks, typing, and scrolling to pass behavioral analysis, while granular [Browser Preferences](https://pydoll.tech/docs/features/configuration/browser-preferences/) control lets you patch your browser fingerprint.
|
|
30
|
+
* **Async & Typed Architecture:** Built from the ground up on `asyncio` and **100% type-checked** with `mypy`. This means top-tier I/O performance for concurrent tasks and a fantastic Developer Experience (DX) with autocompletion and error-checking in your IDE.
|
|
31
|
+
* **Total Network Control:** Go beyond basic HTTP proxies. Pydoll gives you tools to [intercept](https://pydoll.tech/docs/features/network/interception/) (to block ads/trackers) and [monitor](https://pydoll.tech/docs/features/network/monitoring/) traffic, plus [deep documentation](https://pydoll.tech/docs/deep-dive/network/socks-proxies/) on why SOCKS5 is essential to prevent DNS leaks.
|
|
32
|
+
* **Hybrid Automation (The Game-Changer):** Use the UI automation to log in, then use `tab.request` to make blazing-fast API calls that [inherit the entire browser session](https://pydoll.tech/docs/features/network/http-requests/).
|
|
33
|
+
* **Ergonomics Meets Power:** Easy for the simple, powerful for the complex. Use `tab.find()` for 90% of cases and `tab.query()` for complex [CSS/XPath selectors](https://pydoll.tech/docs/deep-dive/guides/selectors-guide/).
|
|
34
|
+
|
|
35
|
+
## 📦 Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install pydoll-python
|
|
39
|
+
```
|
|
40
|
+
That's it. No `webdrivers`. No external dependencies.
|
|
41
|
+
|
|
42
|
+
## 🚀 Getting Started in 60 Seconds
|
|
43
|
+
|
|
44
|
+
Thanks to its `async` architecture and context managers, Pydoll is clean and efficient.
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import asyncio
|
|
48
|
+
from pydoll.browser import Chrome
|
|
49
|
+
from pydoll.constants import Key
|
|
50
|
+
|
|
51
|
+
async def google_search(query: str):
|
|
52
|
+
# Context manager handles browser cleanup (stop()); start() is still called explicitly
|
|
53
|
+
async with Chrome() as browser:
|
|
54
|
+
tab = await browser.start()
|
|
55
|
+
await tab.go_to('https://www.google.com')
|
|
56
|
+
|
|
57
|
+
# Intuitive finding API: find by HTML attributes
|
|
58
|
+
search_box = await tab.find(tag_name='textarea', name='q')
|
|
59
|
+
|
|
60
|
+
# "Human-like" interactions simulate typing
|
|
61
|
+
await search_box.insert_text(query)
|
|
62
|
+
await search_box.press_keyboard_key(Key.ENTER)
|
|
63
|
+
|
|
64
|
+
# Find by text and click (simulates mouse movement)
|
|
65
|
+
first_result = await tab.find(
|
|
66
|
+
tag_name='h3',
|
|
67
|
+
text='autoscrape-labs/pydoll', # Supports partial text matching
|
|
68
|
+
timeout=10,
|
|
69
|
+
)
|
|
70
|
+
await first_result.click()
|
|
71
|
+
|
|
72
|
+
# Wait for an element to confirm navigation
|
|
73
|
+
await tab.find(id='repository-container-header', timeout=10)
|
|
74
|
+
print(f"Page loaded: {await tab.title}")
|
|
75
|
+
|
|
76
|
+
asyncio.run(google_search('pydoll python'))
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## ⚡ The Pydoll Feature Ecosystem
|
|
80
|
+
|
|
81
|
+
Pydoll is a complete toolkit for professional automation.
|
|
82
|
+
|
|
83
|
+
<details>
|
|
84
|
+
<summary><b>1. Hybrid Automation (UI + API): The Game-Changer</b></summary>
|
|
85
|
+
<br>
|
|
86
|
+
|
|
87
|
+
Tired of manually extracting and managing cookies to use `requests` or `httpx`? Pydoll solves this.
|
|
88
|
+
|
|
89
|
+
Use the UI automation to pass a complex login (with CAPTCHAs, JS challenges, etc.) and then use `tab.request` to make **authenticated** API calls that automatically inherit all cookies, headers, and session state from the browser. It's the best of both worlds: the robustness of UI automation for auth, and the speed of direct API calls for data extraction.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
# 1. Log in via the UI (handles CAPTCHAs, JS, etc.)
|
|
93
|
+
await tab.go_to('https://my-site.com/login')
|
|
94
|
+
await (await tab.find(id='username')).type_text('user')
|
|
95
|
+
await (await tab.find(id='password')).type_text('pass123')
|
|
96
|
+
await (await tab.find(id='login-btn')).click()
|
|
97
|
+
|
|
98
|
+
# 2. Now, use the browser's session to hit the API!
|
|
99
|
+
# This request automatically INHERITS the login cookies
|
|
100
|
+
response = await tab.request.get('https://my-site.com/api/user/profile')
|
|
101
|
+
user_data = response.json()
|
|
102
|
+
print(f"Welcome, {user_data['name']}!")
|
|
103
|
+
```
|
|
104
|
+
[**📖 Read more about Hybrid Automation**](https://pydoll.tech/docs/features/network/http-requests/)
|
|
105
|
+
</details>
|
|
106
|
+
|
|
107
|
+
<details>
|
|
108
|
+
<summary><b>2. Total Network Control: Monitor & Intercept Traffic</b></summary>
|
|
109
|
+
<br>
|
|
110
|
+
|
|
111
|
+
Take full control of the network stack. Pydoll allows you to not only **monitor** traffic for reverse-engineering APIs but also to **intercept** requests in real-time.
|
|
112
|
+
|
|
113
|
+
Use this to block ads, trackers, images, or CSS to dramatically speed up your scraping and save bandwidth, or even to modify headers and mock API responses for testing.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
import asyncio
|
|
117
|
+
from pydoll.browser.chromium import Chrome
|
|
118
|
+
from pydoll.protocol.fetch.events import FetchEvent, RequestPausedEvent
|
|
119
|
+
from pydoll.protocol.network.types import ErrorReason
|
|
120
|
+
|
|
121
|
+
async def block_images():
|
|
122
|
+
async with Chrome() as browser:
|
|
123
|
+
tab = await browser.start()
|
|
124
|
+
|
|
125
|
+
async def block_resource(event: RequestPausedEvent):
|
|
126
|
+
request_id = event['params']['requestId']
|
|
127
|
+
resource_type = event['params']['resourceType']
|
|
128
|
+
url = event['params']['request']['url']
|
|
129
|
+
|
|
130
|
+
# Block images and stylesheets
|
|
131
|
+
if resource_type in ['Image', 'Stylesheet']:
|
|
132
|
+
await tab.fail_request(request_id, ErrorReason.BLOCKED_BY_CLIENT)
|
|
133
|
+
else:
|
|
134
|
+
# Continue other requests
|
|
135
|
+
await tab.continue_request(request_id)
|
|
136
|
+
|
|
137
|
+
await tab.enable_fetch_events()
|
|
138
|
+
await tab.on(FetchEvent.REQUEST_PAUSED, block_resource)
|
|
139
|
+
|
|
140
|
+
await tab.go_to('https://example.com')
|
|
141
|
+
await asyncio.sleep(3)
|
|
142
|
+
await tab.disable_fetch_events()
|
|
143
|
+
|
|
144
|
+
asyncio.run(block_images())
|
|
145
|
+
```
|
|
146
|
+
[**📖 Network Monitoring Docs**](https://pydoll.tech/docs/features/network/monitoring/) | [**📖 Request Interception Docs**](https://pydoll.tech/docs/features/network/interception/)
|
|
147
|
+
</details>
|
|
148
|
+
|
|
149
|
+
<details>
|
|
150
|
+
<summary><b>3. Deep Browser Control: The Fingerprint Evasion Manual</b></summary>
|
|
151
|
+
<br>
|
|
152
|
+
|
|
153
|
+
A `User-Agent` isn't enough. Pydoll gives you granular control over [Browser Preferences](https://pydoll.tech/docs/features/configuration/browser-preferences/), allowing you to modify hundreds of internal Chrome settings to build a robust and consistent fingerprint.
|
|
154
|
+
|
|
155
|
+
Our documentation doesn't just give you the tool; it [explains in detail](https://pydoll.tech/docs/deep-dive/fingerprinting/browser-fingerprinting/) how `canvas`, WebGL, font, and timezone fingerprinting work, and how to use these preferences to defend your automation.
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
options = ChromiumOptions()
|
|
159
|
+
|
|
160
|
+
# Create a realistic and clean browser profile
|
|
161
|
+
options.browser_preferences = {
|
|
162
|
+
'profile': {
|
|
163
|
+
'default_content_setting_values': {
|
|
164
|
+
'notifications': 2, # Block notification popups
|
|
165
|
+
'geolocation': 2, # Block location requests
|
|
166
|
+
},
|
|
167
|
+
'password_manager_enabled': False # Disable "save password" prompt
|
|
168
|
+
},
|
|
169
|
+
'intl': {
|
|
170
|
+
'accept_languages': 'en-US,en', # Make consistent with your proxy IP
|
|
171
|
+
},
|
|
172
|
+
'browser': {
|
|
173
|
+
'check_default_browser': False, # Don't ask to be default browser
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
[**📖 Full Guide to Browser Preferences**](https://pydoll.tech/docs/features/configuration/browser-preferences/)
|
|
178
|
+
</details>
|
|
179
|
+
|
|
180
|
+
<details>
|
|
181
|
+
<summary><b>4. Built for Scale: Concurrency, Contexts & Remote Connections</b></summary>
|
|
182
|
+
<br>
|
|
183
|
+
|
|
184
|
+
Pydoll is built for scale. Its `async` architecture allows you to manage [multiple tabs](https://pydoll.tech/docs/features/browser-management/tabs/) and [browser contexts](https://pydoll.tech/docs/features/browser-management/contexts/) (isolated sessions) in parallel.
|
|
185
|
+
|
|
186
|
+
Furthermore, Pydoll excels in production architectures. You can run your browser in a Docker container and **connect to it remotely** from your Python script, decoupling the controller from the worker. Our documentation includes guides on [how to set up your own remote server](https://pydoll.tech/docs/features/advanced/remote-connections/).
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
# Example: Scrape 2 sites in parallel
|
|
190
|
+
|
|
191
|
+
async def scrape_page(url, tab):
|
|
192
|
+
await tab.go_to(url)
|
|
193
|
+
return await tab.title
|
|
194
|
+
|
|
195
|
+
async def concurrent_scraping():
|
|
196
|
+
async with Chrome() as browser:
|
|
197
|
+
tab_google = await browser.start()
|
|
198
|
+
tab_ddg = await browser.new_tab() # Create a new tab
|
|
199
|
+
|
|
200
|
+
# Execute both scraping tasks concurrently
|
|
201
|
+
tasks = [
|
|
202
|
+
scrape_page('https://google.com/', tab_google),
|
|
203
|
+
scrape_page('https://duckduckgo.com/', tab_ddg)
|
|
204
|
+
]
|
|
205
|
+
results = await asyncio.gather(*tasks)
|
|
206
|
+
print(results)
|
|
207
|
+
```
|
|
208
|
+
[**📖 Multi-Tab Management Docs**](https://pydoll.tech/docs/features/browser-management/tabs/) | [**📖 Remote Connection Docs**](https://pydoll.tech/docs/features/advanced/remote-connections/)
|
|
209
|
+
</details>
|
|
210
|
+
|
|
211
|
+
<details>
|
|
212
|
+
<summary><b>5. Robust Engineering: `@retry` Decorator & 100% Typed</b></summary>
|
|
213
|
+
<br>
|
|
214
|
+
|
|
215
|
+
**Reliable Engineering:** Pydoll is **fully typed**, providing a fantastic Developer Experience (DX) with full autocompletion in your IDE and error-checking before you even run your code. [Read about our Type System](https://pydoll.tech/docs/deep-dive/fundamentals/typing-system/).
|
|
216
|
+
|
|
217
|
+
**Robust-by-Design:** The `@retry` decorator turns fragile scripts into production-ready automations. It doesn't just "try again"; it lets you execute custom **recovery logic** (`on_retry`), like refreshing the page or rotating a proxy, before the next attempt.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
from pydoll.decorators import retry
|
|
221
|
+
from pydoll.exceptions import ElementNotFound, NetworkError
|
|
222
|
+
|
|
223
|
+
@retry(
|
|
224
|
+
max_retries=3,
|
|
225
|
+
exceptions=[ElementNotFound, NetworkError], # Only retry on these specific errors
|
|
226
|
+
on_retry=my_recovery_function, # Run your custom recovery logic
|
|
227
|
+
exponential_backoff=True # Wait 2s, 4s, 8s...
|
|
228
|
+
)
|
|
229
|
+
async def scrape_product(self, url: str):
|
|
230
|
+
# ... your scraping logic ...
|
|
231
|
+
```
|
|
232
|
+
[**📖 `@retry` Decorator Docs**](https://pydoll.tech/docs/features/advanced/decorators/)
|
|
233
|
+
</details>
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## 🧠 More Than an API: A Knowledge Base
|
|
238
|
+
|
|
239
|
+
Pydoll is not a black box. We believe that to defeat anti-bot systems, you must understand them. Our documentation is one of the most comprehensive public resources on the subject, teaching you not just the "how," but the "why."
|
|
240
|
+
|
|
241
|
+
### 1. The Battle Against Fingerprinting (Strategic Guide)
|
|
242
|
+
Understand how bots are detected and how Pydoll is designed to win.
|
|
243
|
+
* **[Evasion Techniques Guide](https://pydoll.tech/docs/deep-dive/fingerprinting/evasion-techniques/)**: Our unified 3-layer evasion strategy.
|
|
244
|
+
* **[Network Fingerprinting](https://pydoll.tech/docs/deep-dive/fingerprinting/network-fingerprinting/)**: How your IP, TTL, and TLS (JA3) fingerprint give you away.
|
|
245
|
+
* **[Browser Fingerprinting](https://pydoll.tech/docs/deep-dive/fingerprinting/browser-fingerprinting/)**: How `canvas`, WebGL, and fonts create your unique ID.
|
|
246
|
+
* **[Behavioral Fingerprinting](https://pydoll.tech/docs/deep-dive/fingerprinting/behavioral-fingerprinting/)**: Why mouse/keyboard telemetry is the new front line of detection.
|
|
247
|
+
|
|
248
|
+
### 2. The Advanced Networking Manual (The Foundation)
|
|
249
|
+
Proxies are more than just `--proxy-server`.
|
|
250
|
+
* **[HTTP vs. SOCKS5](https://pydoll.tech/docs/deep-dive/network/socks-proxies/)**: Why SOCKS5 is superior (it solves DNS leaks).
|
|
251
|
+
* **[Proxy Detection](https://pydoll.tech/docs/deep-dive/network/proxy-detection/)**: How sites know you're using a proxy (WebRTC Leaks).
|
|
252
|
+
* **[Build Your Own Proxy](https://pydoll.tech/docs/deep-dive/network/build-proxy/)**: Yes, we even teach you how to build your own SOCKS5 proxy server in Python.
|
|
253
|
+
|
|
254
|
+
### 3. Transparent Architecture (Software Engineering)
|
|
255
|
+
Software engineering you can trust.
|
|
256
|
+
* **[Domain-Driven Design (OOP)](https://pydoll.tech/docs/deep-dive/architecture/browser-domain/)**: The clean architecture behind `Browser`, `Tab`, and `WebElement`.
|
|
257
|
+
* **[The FindElements Mixin](https://pydoll.tech/docs/deep-dive/architecture/find-elements-mixin/)**: The magic behind the intuitive `find()` API.
|
|
258
|
+
* **[The Connection Layer](https://pydoll.tech/docs/deep-dive/fundamentals/connection-layer/)**: How Pydoll manages `asyncio` and the CDP.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## 🤝 Contributing
|
|
263
|
+
|
|
264
|
+
We would love your help to make Pydoll even better! Check out our [contribution guidelines](CONTRIBUTING.md) to get started.
|
|
265
|
+
|
|
266
|
+
## 💖 Support This Project
|
|
267
|
+
|
|
268
|
+
If you find Pydoll useful, consider [sponsoring my work on GitHub](https://github.com/sponsors/thalissonvs). Every contribution helps keep the project alive and drives new features!
|
|
269
|
+
|
|
270
|
+
## 📄 License
|
|
271
|
+
|
|
272
|
+
Pydoll is licensed under the [MIT License](LICENSE).
|
|
273
|
+
|
|
274
|
+
<p align="center">
|
|
275
|
+
<b>Pydoll</b> — Web automation, taken seriously.
|
|
276
|
+
</p>
|
|
@@ -213,8 +213,9 @@ class Browser(ABC): # noqa: PLR0904
|
|
|
213
213
|
logger.info('Stopping browser process')
|
|
214
214
|
await self._execute_command(BrowserCommands.close())
|
|
215
215
|
self._browser_process_manager.stop_process()
|
|
216
|
-
self._temp_directory_manager.cleanup()
|
|
217
216
|
await self._connection_handler.close()
|
|
217
|
+
await asyncio.sleep(0.5 if os.name == 'nt' else 0.1)
|
|
218
|
+
self._temp_directory_manager.cleanup()
|
|
218
219
|
logger.info('Browser process stopped and resources cleaned up')
|
|
219
220
|
|
|
220
221
|
async def close(self):
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import os
|
|
2
3
|
import shutil
|
|
3
4
|
import time
|
|
4
5
|
from pathlib import Path
|
|
@@ -80,16 +81,37 @@ class TempDirectoryManager:
|
|
|
80
81
|
Handles Chromium-specific locked files like CrashpadMetrics.
|
|
81
82
|
"""
|
|
82
83
|
matches = ['CrashpadMetrics-active.pma']
|
|
84
|
+
match_substrings = ['Safe Browsing', 'Safe Browsing Cookies']
|
|
85
|
+
# Extra patterns commonly locked on Windows; compare case-insensitively
|
|
86
|
+
windows_locked_substrings = [
|
|
87
|
+
'\\cache\\',
|
|
88
|
+
'/cache/',
|
|
89
|
+
'no_vary_search',
|
|
90
|
+
'journal.baj',
|
|
91
|
+
'\\network\\cookies',
|
|
92
|
+
'/network/cookies',
|
|
93
|
+
'cookies-journal',
|
|
94
|
+
]
|
|
83
95
|
exc_type, exc_value, _ = exc_info
|
|
84
96
|
|
|
85
97
|
if exc_type is PermissionError:
|
|
86
|
-
|
|
98
|
+
filename = Path(path).name
|
|
99
|
+
# Known Chromium files that may remain locked briefly on Windows
|
|
100
|
+
path_lc = path.lower()
|
|
101
|
+
windows_match = os.name == 'nt' and any(
|
|
102
|
+
substr in path_lc for substr in windows_locked_substrings
|
|
103
|
+
)
|
|
104
|
+
if (
|
|
105
|
+
filename in matches
|
|
106
|
+
or any(substr in path for substr in match_substrings)
|
|
107
|
+
or windows_match
|
|
108
|
+
):
|
|
87
109
|
try:
|
|
88
110
|
self.retry_process_file(func, path)
|
|
89
111
|
return
|
|
90
112
|
except PermissionError:
|
|
91
|
-
logger.warning(f'
|
|
92
|
-
|
|
113
|
+
logger.warning(f'Ignoring locked Chrome file during cleanup: {path}')
|
|
114
|
+
return
|
|
93
115
|
elif exc_type is OSError:
|
|
94
116
|
return
|
|
95
117
|
raise exc_value
|
|
@@ -104,3 +126,23 @@ class TempDirectoryManager:
|
|
|
104
126
|
for temp_dir in self._temp_dirs:
|
|
105
127
|
logger.info(f'Cleaning up temp directory: {temp_dir.name}')
|
|
106
128
|
shutil.rmtree(temp_dir.name, onerror=self.handle_cleanup_error)
|
|
129
|
+
remaining = Path(temp_dir.name)
|
|
130
|
+
if not remaining.exists():
|
|
131
|
+
continue
|
|
132
|
+
|
|
133
|
+
for attempt in range(10):
|
|
134
|
+
time.sleep(0.2)
|
|
135
|
+
try:
|
|
136
|
+
shutil.rmtree(temp_dir.name, onerror=self.handle_cleanup_error)
|
|
137
|
+
except Exception: # noqa: BLE001 - best-effort cleanup
|
|
138
|
+
pass
|
|
139
|
+
if not remaining.exists():
|
|
140
|
+
logger.debug(
|
|
141
|
+
f'Temp directory removed after retry #{attempt + 1}: {temp_dir.name}'
|
|
142
|
+
)
|
|
143
|
+
break
|
|
144
|
+
if remaining.exists():
|
|
145
|
+
logger.warning(
|
|
146
|
+
f'Temp directory still present after retries (leftover files may remain): '
|
|
147
|
+
f'{temp_dir.name}'
|
|
148
|
+
)
|