pydoll-python 2.21.3__tar.gz → 2.22.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/PKG-INFO +135 -96
  2. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/README.md +133 -95
  3. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/tab.py +67 -15
  4. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/__init__.py +2 -0
  5. pydoll_python-2.22.0/pydoll/commands/accessibility_commands.py +223 -0
  6. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/exceptions.py +11 -0
  7. pydoll_python-2.22.0/pydoll/extractor/__init__.py +16 -0
  8. pydoll_python-2.22.0/pydoll/extractor/engine.py +346 -0
  9. pydoll_python-2.22.0/pydoll/extractor/exceptions.py +23 -0
  10. pydoll_python-2.22.0/pydoll/extractor/field.py +112 -0
  11. pydoll_python-2.22.0/pydoll/extractor/model.py +98 -0
  12. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/iframe.py +2 -2
  13. pydoll_python-2.22.0/pydoll/protocol/accessibility/__init__.py +1 -0
  14. pydoll_python-2.22.0/pydoll/protocol/accessibility/events.py +47 -0
  15. pydoll_python-2.22.0/pydoll/protocol/accessibility/methods.py +122 -0
  16. pydoll_python-2.22.0/pydoll/protocol/accessibility/types.py +192 -0
  17. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pyproject.toml +2 -1
  18. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/LICENSE +0 -0
  19. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/__init__.py +0 -0
  20. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/__init__.py +0 -0
  21. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/__init__.py +0 -0
  22. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/base.py +0 -0
  23. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/chrome.py +0 -0
  24. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/chromium/edge.py +0 -0
  25. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/interfaces.py +0 -0
  26. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/__init__.py +0 -0
  27. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/browser_options_manager.py +0 -0
  28. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/browser_process_manager.py +0 -0
  29. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/proxy_manager.py +0 -0
  30. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/managers/temp_dir_manager.py +0 -0
  31. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/options.py +0 -0
  32. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/__init__.py +0 -0
  33. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/har_recorder.py +0 -0
  34. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/request.py +0 -0
  35. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/browser/requests/response.py +0 -0
  36. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/browser_commands.py +0 -0
  37. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/dom_commands.py +0 -0
  38. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/emulation_commands.py +0 -0
  39. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/fetch_commands.py +0 -0
  40. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/input_commands.py +0 -0
  41. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/network_commands.py +0 -0
  42. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/page_commands.py +0 -0
  43. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/runtime_commands.py +0 -0
  44. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/storage_commands.py +0 -0
  45. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/commands/target_commands.py +0 -0
  46. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/__init__.py +0 -0
  47. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/connection_handler.py +0 -0
  48. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/managers/__init__.py +0 -0
  49. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/managers/commands_manager.py +0 -0
  50. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/connection/managers/events_manager.py +0 -0
  51. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/constants.py +0 -0
  52. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/decorators.py +0 -0
  53. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/__init__.py +0 -0
  54. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/mixins/__init__.py +0 -0
  55. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/mixins/find_elements_mixin.py +0 -0
  56. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/shadow_root.py +0 -0
  57. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/utils/__init__.py +0 -0
  58. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/utils/selector_parser.py +0 -0
  59. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/elements/web_element.py +0 -0
  60. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/__init__.py +0 -0
  61. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/keyboard.py +0 -0
  62. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/mouse.py +0 -0
  63. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/scroll.py +0 -0
  64. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/interactions/utils.py +0 -0
  65. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/__init__.py +0 -0
  66. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/base.py +0 -0
  67. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/__init__.py +0 -0
  68. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/events.py +0 -0
  69. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/methods.py +0 -0
  70. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/browser/types.py +0 -0
  71. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/debugger/types.py +0 -0
  72. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/__init__.py +0 -0
  73. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/events.py +0 -0
  74. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/methods.py +0 -0
  75. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/dom/types.py +0 -0
  76. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/emulation/__init__.py +0 -0
  77. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/emulation/methods.py +0 -0
  78. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/emulation/types.py +0 -0
  79. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/__init__.py +0 -0
  80. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/events.py +0 -0
  81. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/methods.py +0 -0
  82. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/fetch/types.py +0 -0
  83. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/__init__.py +0 -0
  84. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/events.py +0 -0
  85. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/methods.py +0 -0
  86. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/input/types.py +0 -0
  87. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/io/types.py +0 -0
  88. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/__init__.py +0 -0
  89. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/events.py +0 -0
  90. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/har_types.py +0 -0
  91. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/methods.py +0 -0
  92. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/network/types.py +0 -0
  93. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/__init__.py +0 -0
  94. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/events.py +0 -0
  95. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/methods.py +0 -0
  96. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/page/types.py +0 -0
  97. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/__init__.py +0 -0
  98. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/events.py +0 -0
  99. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/methods.py +0 -0
  100. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/runtime/types.py +0 -0
  101. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/security/types.py +0 -0
  102. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/__init__.py +0 -0
  103. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/events.py +0 -0
  104. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/methods.py +0 -0
  105. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/storage/types.py +0 -0
  106. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/__init__.py +0 -0
  107. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/events.py +0 -0
  108. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/methods.py +0 -0
  109. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/protocol/target/types.py +0 -0
  110. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/py.typed +0 -0
  111. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/__init__.py +0 -0
  112. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/bundle.py +0 -0
  113. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/general.py +0 -0
  114. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/socks5_proxy_forwarder.py +0 -0
  115. {pydoll_python-2.21.3 → pydoll_python-2.22.0}/pydoll/utils/user_agent_parser.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydoll-python
3
- Version: 2.21.3
3
+ Version: 2.22.0
4
4
  Summary: Pydoll is a library for automating chromium-based browsers without a WebDriver, offering realistic interactions.
5
5
  License-File: LICENSE
6
6
  Author: Thalison Fernandes
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.13
14
14
  Classifier: Programming Language :: Python :: 3.14
15
15
  Requires-Dist: aiofiles (>=25.1.0,<26.0.0)
16
16
  Requires-Dist: aiohttp (>=3.9.5,<4.0.0)
17
+ Requires-Dist: pydantic (>=2.0,<3.0)
17
18
  Requires-Dist: typing_extensions (>=4.14.0,<5.0.0)
18
19
  Requires-Dist: websockets (>=14,<15)
19
20
  Description-Content-Type: text/markdown
@@ -42,39 +43,37 @@ Description-Content-Type: text/markdown
42
43
  <a href="#support">Support</a>
43
44
  </p>
44
45
 
45
- Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues.
46
+ Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. **No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues.**
46
47
 
47
- It combines a high-level API for common tasks with low-level CDP access for fine-grained control over network, fingerprinting, and browser behavior. The entire codebase is async-native and fully type-checked with mypy.
48
+ It combines a high-level API for stealthy automation with low-level CDP access for fine-grained control over network, fingerprinting, and browser behavior. And with its new **Pydantic-powered extraction engine**, it maps the DOM directly to structured Python objects, delivering an unmatched Developer Experience (DX).
48
49
 
49
- ### Sponsors
50
+ ### Top Sponsors
50
51
 
51
- <a href="https://www.thordata.com/?ls=github&lk=pydoll">
52
- <img alt="Thordata" src="public/images/thordata.png" />
52
+ <a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">
53
+ <img src="public/images/banner-the-webscraping-club.png" alt="The Web Scraping Club" />
53
54
  </a>
54
55
 
55
- Pydoll is proudly sponsored by **[Thordata](https://www.thordata.com/?ls=github&lk=pydoll)**: a residential proxy network built for serious web scraping and automation. With **190+ real residential and ISP locations**, fully encrypted connections, and infrastructure optimized for high-performance workflows, Thordata is an excellent choice for scaling your Pydoll automations.
56
-
57
- **[Sign up through our link](https://www.thordata.com/?ls=github&lk=pydoll)** to support the project and get **1GB free** to get started.
58
-
59
- ---
60
-
61
- <a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc">
62
- <img alt="CapSolver" src="public/images/capsolver.jpeg" />
63
- </a>
56
+ <sub>Read a full review of Pydoll on <b><a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">The Web Scraping Club</a></b>, the #1 newsletter dedicated to web scraping.</sub>
64
57
 
65
- Pydoll excels at behavioral evasion, but it doesn't solve captchas. That's where **[CapSolver](https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc)** comes in. An AI-powered service that handles reCAPTCHA, Cloudflare challenges, and more, seamlessly integrating with your automation workflows.
58
+ ### Sponsors
66
59
 
67
- **[Register with our invite code](https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc)** and use code **PYDOLL** to get an extra **6% balance bonus**.
60
+ <table>
61
+ <tr>
62
+ <td><a href="https://www.thordata.com/?ls=github&lk=pydoll"><img src="public/images/Thordata-logo.png" height="30" alt="Thordata" /></a></td>
63
+ <td><a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc"><img src="public/images/capsolver-logo.png" height="40" alt="CapSolver" /></a></td>
64
+ <td><a href="https://www.testmuai.com/?utm_medium=sponsor&utm_source=pydoll"><img src="public/images/logo-lamda-test.svg" height="30" width="130" alt="LambdaTest" /></a></td>
65
+ </tr>
66
+ </table>
68
67
 
69
- ---
68
+ <sub>[Learn more about our sponsors](SPONSORS.md) &middot; [Become a sponsor](https://github.com/sponsors/thalissonvs)</sub>
70
69
 
71
70
  ### Why Pydoll
72
71
 
73
- - **Stealth-first**: Human-like mouse movement, realistic typing, and granular [browser preference](https://pydoll.tech/docs/features/configuration/browser-preferences/) control for fingerprint management.
72
+ - **Structured extraction**: Define a [Pydantic](https://docs.pydantic.dev/) model, call `tab.extract()`, get typed and validated data back. No manual element-by-element querying.
74
73
  - **Async and typed**: Built on `asyncio` from the ground up, 100% type-checked with `mypy`. Full IDE autocompletion and static error checking.
74
+ - **Stealth built in**: Human-like mouse movement, realistic typing, and granular [browser preference](https://pydoll.tech/docs/features/configuration/browser-preferences/) control for fingerprint management.
75
75
  - **Network control**: [Intercept](https://pydoll.tech/docs/features/network/interception/) requests to block ads/trackers, [monitor](https://pydoll.tech/docs/features/network/monitoring/) traffic for API discovery, and make [authenticated HTTP requests](https://pydoll.tech/docs/features/network/http-requests/) that inherit the browser session.
76
76
  - **Shadow DOM and iframes**: Full support for [shadow roots](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/) (including closed) and cross-origin iframes. Discover, query, and interact with elements inside them using the same API.
77
- - **Ergonomic API**: `tab.find()` for most cases, `tab.query()` for complex [CSS/XPath selectors](https://pydoll.tech/docs/deep-dive/guides/selectors-guide/).
78
77
 
79
78
  ## Installation
80
79
 
@@ -84,55 +83,124 @@ pip install pydoll-python
84
83
 
85
84
  No WebDriver binaries or external dependencies required.
86
85
 
87
- ## What's New
86
+ ## Getting Started
88
87
 
89
- <details>
90
- <summary><b>HAR Network Recording</b></summary>
91
- <br>
88
+ ### 1. Stateful Automation & Evasion
92
89
 
93
- Record network activity during a browser session and export as HAR 1.2. Replay recorded requests to reproduce exact API sequences.
90
+ When you need to navigate, bypass challenges, or interact with dynamic UI, Pydoll's imperative API handles it with humanized timing by default.
94
91
 
95
92
  ```python
96
- from pydoll.browser.chromium import Chrome
93
+ import asyncio
94
+ from pydoll.browser import Chrome
95
+ from pydoll.constants import Key
97
96
 
98
- async with Chrome() as browser:
99
- tab = await browser.start()
97
+ async def google_search(query: str):
98
+ async with Chrome() as browser:
99
+ tab = await browser.start()
100
+ await tab.go_to('https://www.google.com')
100
101
 
101
- async with tab.request.record() as capture:
102
- await tab.go_to('https://example.com')
102
+ # Find elements and interact with human-like timing
103
+ search_box = await tab.find(tag_name='textarea', name='q')
104
+ await search_box.insert_text(query)
105
+ await tab.keyboard.press(Key.ENTER)
103
106
 
104
- capture.save('flow.har')
105
- print(f'Captured {len(capture.entries)} requests')
107
+ first_result = await tab.find(
108
+ tag_name='h3',
109
+ text='autoscrape-labs/pydoll',
110
+ timeout=10,
111
+ )
112
+ await first_result.click()
113
+ print(f"Page loaded: {await tab.title}")
106
114
 
107
- responses = await tab.request.replay('flow.har')
115
+ asyncio.run(google_search('pydoll site:github.com'))
108
116
  ```
109
117
 
110
- Filter by resource type:
118
+ ### 2. Structured Data Extraction
119
+
120
+ Once you reach the target page, switch to the declarative engine. Define what you want with a model, and Pydoll extracts it — typed, validated, and ready to use.
111
121
 
112
122
  ```python
113
- from pydoll.protocol.network.types import ResourceType
123
+ from pydoll.browser.chromium import Chrome
124
+ from pydoll.extractor import ExtractionModel, Field
125
+
126
+ class Quote(ExtractionModel):
127
+ text: str = Field(selector='.text', description='The quote text')
128
+ author: str = Field(selector='.author', description='Who said it')
129
+ tags: list[str] = Field(selector='.tag', description='Tags')
130
+ year: int | None = Field(selector='.year', description='Year', default=None)
131
+
132
+ async def extract_quotes():
133
+ async with Chrome() as browser:
134
+ tab = await browser.start()
135
+ await tab.go_to('https://quotes.toscrape.com')
136
+
137
+ quotes = await tab.extract_all(Quote, scope='.quote', timeout=5)
114
138
 
115
- async with tab.request.record(
116
- resource_types=[ResourceType.FETCH, ResourceType.XHR]
117
- ) as capture:
118
- await tab.go_to('https://example.com')
139
+ for q in quotes:
140
+ print(f'{q.author}: {q.text}') # fully typed, IDE autocomplete works
141
+ print(q.tags) # list[str], not a raw element
142
+ print(q.model_dump_json()) # pydantic serialization built-in
143
+
144
+ asyncio.run(extract_quotes())
119
145
  ```
120
146
 
121
- [HAR Recording Docs](https://pydoll.tech/docs/features/network/network-recording/)
147
+ Models support CSS/XPath auto-detection, HTML attribute targeting, custom transforms, and nested models.
148
+
149
+ <details>
150
+ <summary><b>Nested models, transforms, and attribute extraction</b></summary>
151
+ <br>
152
+
153
+ ```python
154
+ from datetime import datetime
155
+ from pydoll.extractor import ExtractionModel, Field
156
+
157
+ def parse_date(raw: str) -> datetime:
158
+ return datetime.strptime(raw.strip(), '%B %d, %Y')
159
+
160
+ class Author(ExtractionModel):
161
+ name: str = Field(selector='.author-title')
162
+ born: datetime = Field(
163
+ selector='.author-born-date',
164
+ transform=parse_date,
165
+ )
166
+
167
+ class Article(ExtractionModel):
168
+ title: str = Field(selector='h1')
169
+ url: str = Field(selector='.source-link', attribute='href')
170
+ author: Author = Field(selector='.author-card', description='Nested model')
171
+
172
+ article = await tab.extract(Article, timeout=5)
173
+ article.author.born.year # int — types are preserved all the way down
174
+ ```
122
175
  </details>
123
176
 
177
+ ## Features
178
+
124
179
  <details>
125
- <summary><b>Page Bundles</b></summary>
180
+ <summary><b>Humanized Mouse Movement</b></summary>
126
181
  <br>
127
182
 
128
- Save the current page and all its assets (CSS, JS, images, fonts) as a `.zip` bundle for offline viewing. Optionally inline everything into a single HTML file.
183
+ Mouse operations produce human-like cursor movement by default:
184
+
185
+ - **Bezier curve paths** with asymmetric control points
186
+ - **Fitts's Law timing**: duration scales with distance
187
+ - **Minimum-jerk velocity**: bell-shaped speed profile
188
+ - **Physiological tremor**: Gaussian noise scaled with velocity
189
+ - **Overshoot correction**: ~70% chance on fast movements, then corrects back
129
190
 
130
191
  ```python
131
- await tab.save_bundle('page.zip')
132
- await tab.save_bundle('page-inline.zip', inline_assets=True)
192
+ await tab.mouse.move(500, 300)
193
+ await tab.mouse.click(500, 300)
194
+ await tab.mouse.drag(100, 200, 500, 400)
195
+
196
+ button = await tab.find(id='submit')
197
+ await button.click()
198
+
199
+ # Opt out when speed matters
200
+ await tab.mouse.click(500, 300, humanize=False)
133
201
  ```
134
202
 
135
- [Screenshots, PDFs & Bundles Docs](https://pydoll.tech/docs/features/automation/screenshots-and-pdfs/)
203
+ [Mouse Control Docs](https://pydoll.tech/docs/features/automation/mouse-control/)
136
204
  </details>
137
205
 
138
206
  <details>
@@ -161,75 +229,46 @@ Highlights:
161
229
  - `deep=True` traverses cross-origin iframes (OOPIFs)
162
230
  - Standard `find()`, `query()`, `click()` API inside shadow roots
163
231
 
164
- ```python
165
- # Cloudflare Turnstile inside a cross-origin iframe
166
- shadow_roots = await tab.find_shadow_roots(deep=True, timeout=10)
167
- for sr in shadow_roots:
168
- checkbox = await sr.query('input[type="checkbox"]', raise_exc=False)
169
- if checkbox:
170
- await checkbox.click()
171
- ```
172
-
173
232
  [Shadow DOM Docs](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/)
174
233
  </details>
175
234
 
176
235
  <details>
177
- <summary><b>Humanized Mouse Movement</b></summary>
236
+ <summary><b>HAR Network Recording</b></summary>
178
237
  <br>
179
238
 
180
- Mouse operations produce human-like cursor movement by default:
181
-
182
- - **Bezier curve paths** with asymmetric control points
183
- - **Fitts's Law timing**: duration scales with distance
184
- - **Minimum-jerk velocity**: bell-shaped speed profile
185
- - **Physiological tremor**: Gaussian noise scaled with velocity
186
- - **Overshoot correction**: ~70% chance on fast movements, then corrects back
239
+ Record network activity during a browser session and export as HAR 1.2. Replay recorded requests to reproduce exact API sequences.
187
240
 
188
241
  ```python
189
- await tab.mouse.move(500, 300)
190
- await tab.mouse.click(500, 300)
191
- await tab.mouse.drag(100, 200, 500, 400)
192
-
193
- button = await tab.find(id='submit')
194
- await button.click()
195
-
196
- # Opt out when speed matters
197
- await tab.mouse.click(500, 300, humanize=False)
198
- ```
242
+ from pydoll.browser.chromium import Chrome
199
243
 
200
- [Mouse Control Docs](https://pydoll.tech/docs/features/automation/mouse-control/)
201
- </details>
244
+ async with Chrome() as browser:
245
+ tab = await browser.start()
202
246
 
203
- ## Getting Started
247
+ async with tab.request.record() as capture:
248
+ await tab.go_to('https://example.com')
204
249
 
205
- ```python
206
- import asyncio
207
- from pydoll.browser import Chrome
208
- from pydoll.constants import Key
250
+ capture.save('flow.har')
251
+ print(f'Captured {len(capture.entries)} requests')
209
252
 
210
- async def google_search(query: str):
211
- async with Chrome() as browser:
212
- tab = await browser.start()
213
- await tab.go_to('https://www.google.com')
253
+ responses = await tab.request.replay('flow.har')
254
+ ```
214
255
 
215
- search_box = await tab.find(tag_name='textarea', name='q')
216
- await search_box.insert_text(query)
217
- await tab.keyboard.press(Key.ENTER)
256
+ [HAR Recording Docs](https://pydoll.tech/docs/features/network/network-recording/)
257
+ </details>
218
258
 
219
- first_result = await tab.find(
220
- tag_name='h3',
221
- text='autoscrape-labs/pydoll',
222
- timeout=10,
223
- )
224
- await first_result.click()
259
+ <details>
260
+ <summary><b>Page Bundles</b></summary>
261
+ <br>
225
262
 
226
- await tab.find(id='repository-container-header', timeout=10)
227
- print(f"Page loaded: {await tab.title}")
263
+ Save the current page and all its assets (CSS, JS, images, fonts) as a `.zip` bundle for offline viewing. Optionally inline everything into a single HTML file.
228
264
 
229
- asyncio.run(google_search('pydoll site:github.com'))
265
+ ```python
266
+ await tab.save_bundle('page.zip')
267
+ await tab.save_bundle('page-inline.zip', inline_assets=True)
230
268
  ```
231
269
 
232
- ## Features
270
+ [Screenshots, PDFs & Bundles Docs](https://pydoll.tech/docs/features/automation/screenshots-and-pdfs/)
271
+ </details>
233
272
 
234
273
  <details>
235
274
  <summary><b>Hybrid Automation (UI + API)</b></summary>
@@ -22,39 +22,37 @@
22
22
  <a href="#support">Support</a>
23
23
  </p>
24
24
 
25
- Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues.
25
+ Pydoll automates Chromium-based browsers (Chrome, Edge) by connecting directly to the Chrome DevTools Protocol over WebSocket. **No WebDriver binary, no `navigator.webdriver` flag, no compatibility issues.**
26
26
 
27
- It combines a high-level API for common tasks with low-level CDP access for fine-grained control over network, fingerprinting, and browser behavior. The entire codebase is async-native and fully type-checked with mypy.
27
+ It combines a high-level API for stealthy automation with low-level CDP access for fine-grained control over network, fingerprinting, and browser behavior. And with its new **Pydantic-powered extraction engine**, it maps the DOM directly to structured Python objects, delivering an unmatched Developer Experience (DX).
28
28
 
29
- ### Sponsors
29
+ ### Top Sponsors
30
30
 
31
- <a href="https://www.thordata.com/?ls=github&lk=pydoll">
32
- <img alt="Thordata" src="public/images/thordata.png" />
31
+ <a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">
32
+ <img src="public/images/banner-the-webscraping-club.png" alt="The Web Scraping Club" />
33
33
  </a>
34
34
 
35
- Pydoll is proudly sponsored by **[Thordata](https://www.thordata.com/?ls=github&lk=pydoll)**: a residential proxy network built for serious web scraping and automation. With **190+ real residential and ISP locations**, fully encrypted connections, and infrastructure optimized for high-performance workflows, Thordata is an excellent choice for scaling your Pydoll automations.
36
-
37
- **[Sign up through our link](https://www.thordata.com/?ls=github&lk=pydoll)** to support the project and get **1GB free** to get started.
38
-
39
- ---
40
-
41
- <a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc">
42
- <img alt="CapSolver" src="public/images/capsolver.jpeg" />
43
- </a>
35
+ <sub>Read a full review of Pydoll on <b><a href="https://substack.thewebscraping.club/p/pydoll-webdriver-scraping?utm_source=github&utm_medium=repo&utm_campaign=pydoll">The Web Scraping Club</a></b>, the #1 newsletter dedicated to web scraping.</sub>
44
36
 
45
- Pydoll excels at behavioral evasion, but it doesn't solve captchas. That's where **[CapSolver](https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc)** comes in. An AI-powered service that handles reCAPTCHA, Cloudflare challenges, and more, seamlessly integrating with your automation workflows.
37
+ ### Sponsors
46
38
 
47
- **[Register with our invite code](https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc)** and use code **PYDOLL** to get an extra **6% balance bonus**.
39
+ <table>
40
+ <tr>
41
+ <td><a href="https://www.thordata.com/?ls=github&lk=pydoll"><img src="public/images/Thordata-logo.png" height="30" alt="Thordata" /></a></td>
42
+ <td><a href="https://dashboard.capsolver.com/passport/register?inviteCode=WPhTbOsbXEpc"><img src="public/images/capsolver-logo.png" height="40" alt="CapSolver" /></a></td>
43
+ <td><a href="https://www.testmuai.com/?utm_medium=sponsor&utm_source=pydoll"><img src="public/images/logo-lamda-test.svg" height="30" width="130" alt="LambdaTest" /></a></td>
44
+ </tr>
45
+ </table>
48
46
 
49
- ---
47
+ <sub>[Learn more about our sponsors](SPONSORS.md) &middot; [Become a sponsor](https://github.com/sponsors/thalissonvs)</sub>
50
48
 
51
49
  ### Why Pydoll
52
50
 
53
- - **Stealth-first**: Human-like mouse movement, realistic typing, and granular [browser preference](https://pydoll.tech/docs/features/configuration/browser-preferences/) control for fingerprint management.
51
+ - **Structured extraction**: Define a [Pydantic](https://docs.pydantic.dev/) model, call `tab.extract()`, get typed and validated data back. No manual element-by-element querying.
54
52
  - **Async and typed**: Built on `asyncio` from the ground up, 100% type-checked with `mypy`. Full IDE autocompletion and static error checking.
53
+ - **Stealth built in**: Human-like mouse movement, realistic typing, and granular [browser preference](https://pydoll.tech/docs/features/configuration/browser-preferences/) control for fingerprint management.
55
54
  - **Network control**: [Intercept](https://pydoll.tech/docs/features/network/interception/) requests to block ads/trackers, [monitor](https://pydoll.tech/docs/features/network/monitoring/) traffic for API discovery, and make [authenticated HTTP requests](https://pydoll.tech/docs/features/network/http-requests/) that inherit the browser session.
56
55
  - **Shadow DOM and iframes**: Full support for [shadow roots](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/) (including closed) and cross-origin iframes. Discover, query, and interact with elements inside them using the same API.
57
- - **Ergonomic API**: `tab.find()` for most cases, `tab.query()` for complex [CSS/XPath selectors](https://pydoll.tech/docs/deep-dive/guides/selectors-guide/).
58
56
 
59
57
  ## Installation
60
58
 
@@ -64,55 +62,124 @@ pip install pydoll-python
64
62
 
65
63
  No WebDriver binaries or external dependencies required.
66
64
 
67
- ## What's New
65
+ ## Getting Started
68
66
 
69
- <details>
70
- <summary><b>HAR Network Recording</b></summary>
71
- <br>
67
+ ### 1. Stateful Automation & Evasion
72
68
 
73
- Record network activity during a browser session and export as HAR 1.2. Replay recorded requests to reproduce exact API sequences.
69
+ When you need to navigate, bypass challenges, or interact with dynamic UI, Pydoll's imperative API handles it with humanized timing by default.
74
70
 
75
71
  ```python
76
- from pydoll.browser.chromium import Chrome
72
+ import asyncio
73
+ from pydoll.browser import Chrome
74
+ from pydoll.constants import Key
77
75
 
78
- async with Chrome() as browser:
79
- tab = await browser.start()
76
+ async def google_search(query: str):
77
+ async with Chrome() as browser:
78
+ tab = await browser.start()
79
+ await tab.go_to('https://www.google.com')
80
80
 
81
- async with tab.request.record() as capture:
82
- await tab.go_to('https://example.com')
81
+ # Find elements and interact with human-like timing
82
+ search_box = await tab.find(tag_name='textarea', name='q')
83
+ await search_box.insert_text(query)
84
+ await tab.keyboard.press(Key.ENTER)
83
85
 
84
- capture.save('flow.har')
85
- print(f'Captured {len(capture.entries)} requests')
86
+ first_result = await tab.find(
87
+ tag_name='h3',
88
+ text='autoscrape-labs/pydoll',
89
+ timeout=10,
90
+ )
91
+ await first_result.click()
92
+ print(f"Page loaded: {await tab.title}")
86
93
 
87
- responses = await tab.request.replay('flow.har')
94
+ asyncio.run(google_search('pydoll site:github.com'))
88
95
  ```
89
96
 
90
- Filter by resource type:
97
+ ### 2. Structured Data Extraction
98
+
99
+ Once you reach the target page, switch to the declarative engine. Define what you want with a model, and Pydoll extracts it — typed, validated, and ready to use.
91
100
 
92
101
  ```python
93
- from pydoll.protocol.network.types import ResourceType
102
+ import asyncio
+ from pydoll.browser.chromium import Chrome
103
+ from pydoll.extractor import ExtractionModel, Field
104
+
105
+ class Quote(ExtractionModel):
106
+ text: str = Field(selector='.text', description='The quote text')
107
+ author: str = Field(selector='.author', description='Who said it')
108
+ tags: list[str] = Field(selector='.tag', description='Tags')
109
+ year: int | None = Field(selector='.year', description='Year', default=None)
110
+
111
+ async def extract_quotes():
112
+ async with Chrome() as browser:
113
+ tab = await browser.start()
114
+ await tab.go_to('https://quotes.toscrape.com')
115
+
116
+ quotes = await tab.extract_all(Quote, scope='.quote', timeout=5)
94
117
 
95
- async with tab.request.record(
96
- resource_types=[ResourceType.FETCH, ResourceType.XHR]
97
- ) as capture:
98
- await tab.go_to('https://example.com')
118
+ for q in quotes:
119
+ print(f'{q.author}: {q.text}') # fully typed, IDE autocomplete works
120
+ print(q.tags) # list[str], not a raw element
121
+ print(q.model_dump_json()) # pydantic serialization built-in
122
+
123
+ asyncio.run(extract_quotes())
99
124
  ```
100
125
 
101
- [HAR Recording Docs](https://pydoll.tech/docs/features/network/network-recording/)
126
+ Models support CSS/XPath auto-detection, HTML attribute targeting, custom transforms, and nested models.
127
+
128
+ <details>
129
+ <summary><b>Nested models, transforms, and attribute extraction</b></summary>
130
+ <br>
131
+
132
+ ```python
133
+ from datetime import datetime
134
+ from pydoll.extractor import ExtractionModel, Field
135
+
136
+ def parse_date(raw: str) -> datetime:
137
+ return datetime.strptime(raw.strip(), '%B %d, %Y')
138
+
139
+ class Author(ExtractionModel):
140
+ name: str = Field(selector='.author-title')
141
+ born: datetime = Field(
142
+ selector='.author-born-date',
143
+ transform=parse_date,
144
+ )
145
+
146
+ class Article(ExtractionModel):
147
+ title: str = Field(selector='h1')
148
+ url: str = Field(selector='.source-link', attribute='href')
149
+ author: Author = Field(selector='.author-card', description='Nested model')
150
+
151
+ article = await tab.extract(Article, timeout=5)
152
+ article.author.born.year # int — types are preserved all the way down
153
+ ```
102
154
  </details>
103
155
 
156
+ ## Features
157
+
104
158
  <details>
105
- <summary><b>Page Bundles</b></summary>
159
+ <summary><b>Humanized Mouse Movement</b></summary>
106
160
  <br>
107
161
 
108
- Save the current page and all its assets (CSS, JS, images, fonts) as a `.zip` bundle for offline viewing. Optionally inline everything into a single HTML file.
162
+ Mouse operations produce human-like cursor movement by default:
163
+
164
+ - **Bezier curve paths** with asymmetric control points
165
+ - **Fitts's Law timing**: duration scales with distance
166
+ - **Minimum-jerk velocity**: bell-shaped speed profile
167
+ - **Physiological tremor**: Gaussian noise scaled with velocity
168
+ - **Overshoot correction**: ~70% chance on fast movements, then corrects back
109
169
 
110
170
  ```python
111
- await tab.save_bundle('page.zip')
112
- await tab.save_bundle('page-inline.zip', inline_assets=True)
171
+ await tab.mouse.move(500, 300)
172
+ await tab.mouse.click(500, 300)
173
+ await tab.mouse.drag(100, 200, 500, 400)
174
+
175
+ button = await tab.find(id='submit')
176
+ await button.click()
177
+
178
+ # Opt out when speed matters
179
+ await tab.mouse.click(500, 300, humanize=False)
113
180
  ```
114
181
 
115
- [Screenshots, PDFs & Bundles Docs](https://pydoll.tech/docs/features/automation/screenshots-and-pdfs/)
182
+ [Mouse Control Docs](https://pydoll.tech/docs/features/automation/mouse-control/)
116
183
  </details>
117
184
 
118
185
  <details>
@@ -141,75 +208,46 @@ Highlights:
141
208
  - `deep=True` traverses cross-origin iframes (OOPIFs)
142
209
  - Standard `find()`, `query()`, `click()` API inside shadow roots
143
210
 
144
- ```python
145
- # Cloudflare Turnstile inside a cross-origin iframe
146
- shadow_roots = await tab.find_shadow_roots(deep=True, timeout=10)
147
- for sr in shadow_roots:
148
- checkbox = await sr.query('input[type="checkbox"]', raise_exc=False)
149
- if checkbox:
150
- await checkbox.click()
151
- ```
152
-
153
211
  [Shadow DOM Docs](https://pydoll.tech/docs/deep-dive/architecture/shadow-dom/)
154
212
  </details>
155
213
 
156
214
  <details>
157
- <summary><b>Humanized Mouse Movement</b></summary>
215
+ <summary><b>HAR Network Recording</b></summary>
158
216
  <br>
159
217
 
160
- Mouse operations produce human-like cursor movement by default:
161
-
162
- - **Bezier curve paths** with asymmetric control points
163
- - **Fitts's Law timing**: duration scales with distance
164
- - **Minimum-jerk velocity**: bell-shaped speed profile
165
- - **Physiological tremor**: Gaussian noise scaled with velocity
166
- - **Overshoot correction**: ~70% chance on fast movements, then corrects back
218
+ Record network activity during a browser session and export as HAR 1.2. Replay recorded requests to reproduce exact API sequences.
167
219
 
168
220
  ```python
169
- await tab.mouse.move(500, 300)
170
- await tab.mouse.click(500, 300)
171
- await tab.mouse.drag(100, 200, 500, 400)
172
-
173
- button = await tab.find(id='submit')
174
- await button.click()
175
-
176
- # Opt out when speed matters
177
- await tab.mouse.click(500, 300, humanize=False)
178
- ```
221
+ from pydoll.browser.chromium import Chrome
179
222
 
180
- [Mouse Control Docs](https://pydoll.tech/docs/features/automation/mouse-control/)
181
- </details>
223
+ async with Chrome() as browser:
224
+ tab = await browser.start()
182
225
 
183
- ## Getting Started
226
+ async with tab.request.record() as capture:
227
+ await tab.go_to('https://example.com')
184
228
 
185
- ```python
186
- import asyncio
187
- from pydoll.browser import Chrome
188
- from pydoll.constants import Key
229
+ capture.save('flow.har')
230
+ print(f'Captured {len(capture.entries)} requests')
189
231
 
190
- async def google_search(query: str):
191
- async with Chrome() as browser:
192
- tab = await browser.start()
193
- await tab.go_to('https://www.google.com')
232
+ responses = await tab.request.replay('flow.har')
233
+ ```
194
234
 
195
- search_box = await tab.find(tag_name='textarea', name='q')
196
- await search_box.insert_text(query)
197
- await tab.keyboard.press(Key.ENTER)
235
+ [HAR Recording Docs](https://pydoll.tech/docs/features/network/network-recording/)
236
+ </details>
198
237
 
199
- first_result = await tab.find(
200
- tag_name='h3',
201
- text='autoscrape-labs/pydoll',
202
- timeout=10,
203
- )
204
- await first_result.click()
238
+ <details>
239
+ <summary><b>Page Bundles</b></summary>
240
+ <br>
205
241
 
206
- await tab.find(id='repository-container-header', timeout=10)
207
- print(f"Page loaded: {await tab.title}")
242
+ Save the current page and all its assets (CSS, JS, images, fonts) as a `.zip` bundle for offline viewing. Optionally inline everything into a single HTML file.
208
243
 
209
- asyncio.run(google_search('pydoll site:github.com'))
244
+ ```python
245
+ await tab.save_bundle('page.zip')
246
+ await tab.save_bundle('page-inline.zip', inline_assets=True)
210
247
  ```
211
248
 
212
- ## Features
249
+ [Screenshots, PDFs & Bundles Docs](https://pydoll.tech/docs/features/automation/screenshots-and-pdfs/)
250
+ </details>
213
251
 
214
252
  <details>
215
253
  <summary><b>Hybrid Automation (UI + API)</b></summary>