webtap-tool 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webtap-tool might be problematic. Click here for more details.
- webtap/VISION.md +234 -0
- webtap/__init__.py +56 -0
- webtap/api.py +222 -0
- webtap/app.py +76 -0
- webtap/cdp/README.md +268 -0
- webtap/cdp/__init__.py +14 -0
- webtap/cdp/query.py +107 -0
- webtap/cdp/schema/README.md +41 -0
- webtap/cdp/schema/cdp_protocol.json +32785 -0
- webtap/cdp/schema/cdp_version.json +8 -0
- webtap/cdp/session.py +365 -0
- webtap/commands/DEVELOPER_GUIDE.md +314 -0
- webtap/commands/TIPS.md +153 -0
- webtap/commands/__init__.py +7 -0
- webtap/commands/_builders.py +127 -0
- webtap/commands/_errors.py +108 -0
- webtap/commands/_tips.py +147 -0
- webtap/commands/_utils.py +227 -0
- webtap/commands/body.py +161 -0
- webtap/commands/connection.py +168 -0
- webtap/commands/console.py +69 -0
- webtap/commands/events.py +109 -0
- webtap/commands/fetch.py +219 -0
- webtap/commands/filters.py +224 -0
- webtap/commands/inspect.py +146 -0
- webtap/commands/javascript.py +87 -0
- webtap/commands/launch.py +86 -0
- webtap/commands/navigation.py +199 -0
- webtap/commands/network.py +85 -0
- webtap/commands/setup.py +127 -0
- webtap/filters.py +289 -0
- webtap/services/README.md +83 -0
- webtap/services/__init__.py +15 -0
- webtap/services/body.py +113 -0
- webtap/services/console.py +116 -0
- webtap/services/fetch.py +397 -0
- webtap/services/main.py +175 -0
- webtap/services/network.py +105 -0
- webtap/services/setup.py +219 -0
- webtap_tool-0.1.1.dist-info/METADATA +427 -0
- webtap_tool-0.1.1.dist-info/RECORD +43 -0
- webtap_tool-0.1.1.dist-info/WHEEL +4 -0
- webtap_tool-0.1.1.dist-info/entry_points.txt +2 -0
webtap/cdp/README.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# Chrome DevTools Protocol (CDP) Integration
|
|
2
|
+
|
|
3
|
+
This module handles the core CDP connection and event management for WebTap.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
The CDP module provides:
|
|
8
|
+
- WebSocket connection to Chrome's debugging port
|
|
9
|
+
- Event capture and storage in DuckDB
|
|
10
|
+
- Dynamic field discovery for flexible querying
|
|
11
|
+
- Native event storage (no transformation)
|
|
12
|
+
|
|
13
|
+
## Architecture
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
Chrome Browser
|
|
17
|
+
↓ (WebSocket)
|
|
18
|
+
CDPSession (session.py)
|
|
19
|
+
├── WebSocketApp (connection management)
|
|
20
|
+
├── DuckDB (event storage)
|
|
21
|
+
└── Field Discovery (dynamic paths)
|
|
22
|
+
↓
|
|
23
|
+
Query Builder (query.py)
|
|
24
|
+
└── SQL Generation
|
|
25
|
+
↓
|
|
26
|
+
WebTap Commands
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Core Components
|
|
30
|
+
|
|
31
|
+
### session.py
|
|
32
|
+
The main CDP session manager:
|
|
33
|
+
- Establishes WebSocket connection
|
|
34
|
+
- Stores events as-is in DuckDB
|
|
35
|
+
- Discovers field paths dynamically
|
|
36
|
+
- Handles CDP command execution
|
|
37
|
+
|
|
38
|
+
### query.py
|
|
39
|
+
Dynamic query builder:
|
|
40
|
+
- Fuzzy field matching
|
|
41
|
+
- SQL generation for JSON queries
|
|
42
|
+
- Cross-event correlation
|
|
43
|
+
|
|
44
|
+
### schema/
|
|
45
|
+
CDP protocol reference:
|
|
46
|
+
- Protocol version information
|
|
47
|
+
- Domain definitions (future)
|
|
48
|
+
|
|
49
|
+
## Philosophy: Native Storage
|
|
50
|
+
|
|
51
|
+
We store CDP events exactly as received:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
# CDP sends this
|
|
55
|
+
{
|
|
56
|
+
"method": "Network.responseReceived",
|
|
57
|
+
"params": {
|
|
58
|
+
"requestId": "123.456",
|
|
59
|
+
"response": {
|
|
60
|
+
"status": 200,
|
|
61
|
+
"headers": {...}
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
# We store it as-is in DuckDB
|
|
67
|
+
# No transformation, no data loss
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Event Domains
|
|
71
|
+
|
|
72
|
+
Currently capturing events from:
|
|
73
|
+
|
|
74
|
+
### Network Domain
|
|
75
|
+
- `Network.requestWillBeSent`
|
|
76
|
+
- `Network.responseReceived`
|
|
77
|
+
- `Network.loadingFinished`
|
|
78
|
+
- `Network.loadingFailed`
|
|
79
|
+
|
|
80
|
+
### Page Domain
|
|
81
|
+
- `Page.frameNavigated`
|
|
82
|
+
- `Page.domContentEventFired`
|
|
83
|
+
- `Page.loadEventFired`
|
|
84
|
+
|
|
85
|
+
### Runtime Domain
|
|
86
|
+
- `Runtime.consoleAPICalled`
|
|
87
|
+
- `Runtime.exceptionThrown`
|
|
88
|
+
|
|
89
|
+
### Fetch Domain
|
|
90
|
+
- `Fetch.requestPaused`
|
|
91
|
+
- `Fetch.authRequired`
|
|
92
|
+
|
|
93
|
+
### Storage Domain
|
|
94
|
+
- `Storage.cookiesChanged`
|
|
95
|
+
- `Storage.cacheStorageContentUpdated`
|
|
96
|
+
|
|
97
|
+
## Database Schema
|
|
98
|
+
|
|
99
|
+
### events table
|
|
100
|
+
```sql
|
|
101
|
+
CREATE TABLE events (
|
|
102
|
+
rowid INTEGER PRIMARY KEY,
|
|
103
|
+
event JSON,
|
|
104
|
+
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Query Examples
|
|
109
|
+
|
|
110
|
+
```sql
|
|
111
|
+
-- Find all 404 responses
|
|
112
|
+
SELECT * FROM events
|
|
113
|
+
WHERE json_extract_string(event, '$.params.response.status') = '404'
|
|
114
|
+
|
|
115
|
+
-- Get request/response pairs
|
|
116
|
+
SELECT
|
|
117
|
+
e1.rowid as request_row,
|
|
118
|
+
e2.rowid as response_row,
|
|
119
|
+
json_extract_string(e1.event, '$.params.request.url') as url
|
|
120
|
+
FROM events e1
|
|
121
|
+
JOIN events e2 ON
|
|
122
|
+
json_extract_string(e1.event, '$.params.requestId') =
|
|
123
|
+
json_extract_string(e2.event, '$.params.requestId')
|
|
124
|
+
WHERE
|
|
125
|
+
json_extract_string(e1.event, '$.method') = 'Network.requestWillBeSent'
|
|
126
|
+
AND json_extract_string(e2.event, '$.method') = 'Network.responseReceived'
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Field Discovery
|
|
130
|
+
|
|
131
|
+
The system automatically discovers all field paths:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# When we see this event:
|
|
135
|
+
{
|
|
136
|
+
"method": "Network.responseReceived",
|
|
137
|
+
"params": {
|
|
138
|
+
"response": {
|
|
139
|
+
"status": 200,
|
|
140
|
+
"url": "https://example.com"
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
# We discover these paths:
|
|
146
|
+
# - method
|
|
147
|
+
# - params.response.status
|
|
148
|
+
# - params.response.url
|
|
149
|
+
|
|
150
|
+
# Users can then query with fuzzy matching:
|
|
151
|
+
events(status=200) # Finds params.response.status
|
|
152
|
+
events(url="example") # Finds params.response.url
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Connection Management
|
|
156
|
+
|
|
157
|
+
### Initialization
|
|
158
|
+
```python
|
|
159
|
+
cdp = CDPSession()
|
|
160
|
+
await cdp.connect("localhost", 9222, page_id)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Event Flow
|
|
164
|
+
1. Chrome sends event over WebSocket
|
|
165
|
+
2. CDPSession receives it in `on_message()`
|
|
166
|
+
3. Event stored in DuckDB immediately
|
|
167
|
+
4. Field paths extracted for discovery
|
|
168
|
+
5. Event available for querying
|
|
169
|
+
|
|
170
|
+
## CDP Command Execution
|
|
171
|
+
|
|
172
|
+
Direct command execution:
|
|
173
|
+
```python
|
|
174
|
+
# Get response body
|
|
175
|
+
result = cdp.execute("Network.getResponseBody", {
|
|
176
|
+
"requestId": "123.456"
|
|
177
|
+
})
|
|
178
|
+
|
|
179
|
+
# Evaluate JavaScript
|
|
180
|
+
result = cdp.execute("Runtime.evaluate", {
|
|
181
|
+
"expression": "document.title"
|
|
182
|
+
})
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Performance Considerations
|
|
186
|
+
|
|
187
|
+
- **Minimal Processing**: Events stored as-is
|
|
188
|
+
- **Lazy Evaluation**: Field discovery on-demand
|
|
189
|
+
- **Efficient Storage**: DuckDB's columnar format
|
|
190
|
+
- **Fast Queries**: JSON functions optimized in DuckDB
|
|
191
|
+
|
|
192
|
+
## Extension Points
|
|
193
|
+
|
|
194
|
+
### Adding New Domains
|
|
195
|
+
To capture events from additional CDP domains:
|
|
196
|
+
|
|
197
|
+
1. Enable the domain:
|
|
198
|
+
```python
|
|
199
|
+
cdp.execute("DOMStorage.enable")
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
2. Events automatically captured and stored
|
|
203
|
+
|
|
204
|
+
3. Query them:
|
|
205
|
+
```python
|
|
206
|
+
events(method="DOMStorage.*")
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Custom Event Processing
|
|
210
|
+
While we store events as-is, you can add custom processors:
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
def process_network_event(event):
|
|
214
|
+
# Custom logic here
|
|
215
|
+
pass
|
|
216
|
+
|
|
217
|
+
# Register processor
|
|
218
|
+
cdp.register_processor("Network.*", process_network_event)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Integration with SDP
|
|
222
|
+
|
|
223
|
+
The CDP module will work alongside the future SDP (Svelte Debug Protocol) module:
|
|
224
|
+
|
|
225
|
+
```
|
|
226
|
+
CDP Events (Network, DOM, Console)
|
|
227
|
+
+
|
|
228
|
+
SDP Events (State, Components, Reactivity)
|
|
229
|
+
↓
|
|
230
|
+
Unified Event Stream in DuckDB
|
|
231
|
+
↓
|
|
232
|
+
Correlated Analysis
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Best Practices
|
|
236
|
+
|
|
237
|
+
1. **Don't Transform**: Store CDP data as-is
|
|
238
|
+
2. **Query Don't Parse**: Use SQL for extraction
|
|
239
|
+
3. **Discover Don't Define**: Let field paths emerge
|
|
240
|
+
4. **Correlate Don't Duplicate**: Link events by IDs
|
|
241
|
+
|
|
242
|
+
## Debugging
|
|
243
|
+
|
|
244
|
+
### Enable verbose logging
|
|
245
|
+
```python
|
|
246
|
+
import logging
|
|
247
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Check connection
|
|
251
|
+
```python
|
|
252
|
+
cdp.connected # Should be True
|
|
253
|
+
cdp.ws.sock.connected # WebSocket status
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Inspect stored events
|
|
257
|
+
```python
|
|
258
|
+
cdp.query("SELECT COUNT(*) FROM events")
|
|
259
|
+
cdp.query("SELECT * FROM events ORDER BY rowid DESC LIMIT 5")
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## Future Enhancements
|
|
263
|
+
|
|
264
|
+
- [ ] Event compression for long sessions
|
|
265
|
+
- [ ] Streaming to external storage
|
|
266
|
+
- [ ] Real-time event subscriptions
|
|
267
|
+
- [ ] Custom domain definitions
|
|
268
|
+
- [ ] Event replay functionality
|
webtap/cdp/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Chrome DevTools Protocol client with native event storage.
|
|
2
|
+
|
|
3
|
+
Native CDP approach - store events as-is, query on-demand.
|
|
4
|
+
Built on WebSocketApp + DuckDB for minimal overhead.
|
|
5
|
+
|
|
6
|
+
PUBLIC API:
|
|
7
|
+
- CDPSession: Main CDP client with WebSocket connection and event storage
|
|
8
|
+
- build_query: Dynamic query builder with field discovery
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from webtap.cdp.query import build_query
|
|
12
|
+
from webtap.cdp.session import CDPSession
|
|
13
|
+
|
|
14
|
+
__all__ = ["CDPSession", "build_query"]
|
webtap/cdp/query.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Dynamic CDP query builder with field discovery.
|
|
2
|
+
|
|
3
|
+
PUBLIC API:
|
|
4
|
+
- build_query: Build SQL queries with automatic field discovery
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_query(
|
|
9
|
+
session, query: dict, event_type: str | list[str] | None = None, limit: int = 20
|
|
10
|
+
) -> tuple[str, dict[str, list[str]]]:
|
|
11
|
+
"""Build SQL queries with automatic CDP field discovery.
|
|
12
|
+
|
|
13
|
+
Uses CDPSession's live field_paths lookup built from actual events.
|
|
14
|
+
Supports filtering, wildcard matching, and multi-field extraction.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
session: CDPSession with field_paths lookup.
|
|
18
|
+
query: Field names and values - "*" extracts only, values filter.
|
|
19
|
+
event_type: Optional CDP event type(s) to filter.
|
|
20
|
+
limit: Maximum results. Defaults to 20.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
Tuple of (sql_query, discovered_fields_dict).
|
|
24
|
+
|
|
25
|
+
Examples:
|
|
26
|
+
build_query(session, {"url": "*"}) # Extract all URL fields
|
|
27
|
+
build_query(session, {"status": 200}) # Filter by status=200
|
|
28
|
+
build_query(session, {"url": "*youtube*", "status": 200}) # Multiple fields
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# Field discovery using live field_paths
|
|
32
|
+
discovered = {}
|
|
33
|
+
for key in query.keys():
|
|
34
|
+
if key in ["limit"]:
|
|
35
|
+
continue
|
|
36
|
+
discovered[key] = session.discover_field_paths(key)
|
|
37
|
+
|
|
38
|
+
# Handle case where no fields found
|
|
39
|
+
if not any(discovered.values()):
|
|
40
|
+
return "SELECT NULL as no_fields_found FROM events LIMIT 0", discovered
|
|
41
|
+
|
|
42
|
+
# Build WHERE conditions
|
|
43
|
+
where_conditions = []
|
|
44
|
+
|
|
45
|
+
# Filter by CDP event type
|
|
46
|
+
if event_type:
|
|
47
|
+
if isinstance(event_type, str):
|
|
48
|
+
where_conditions.append(f"json_extract_string(event, '$.method') = '{event_type}'")
|
|
49
|
+
elif isinstance(event_type, list):
|
|
50
|
+
types_str = ", ".join(f"'{t}'" for t in event_type)
|
|
51
|
+
where_conditions.append(f"json_extract_string(event, '$.method') IN ({types_str})")
|
|
52
|
+
|
|
53
|
+
# Build field filters using discovered paths
|
|
54
|
+
for key, value in query.items():
|
|
55
|
+
if key in ["limit"] or value == "*":
|
|
56
|
+
continue
|
|
57
|
+
|
|
58
|
+
paths = discovered.get(key, [])
|
|
59
|
+
if not paths:
|
|
60
|
+
continue
|
|
61
|
+
|
|
62
|
+
# Create filter conditions for each path
|
|
63
|
+
path_conditions = []
|
|
64
|
+
for path in paths:
|
|
65
|
+
# Remove event type prefix for JSON path
|
|
66
|
+
actual_path = path.split(":", 1)[1] if ":" in path else path
|
|
67
|
+
json_path = "$." + actual_path
|
|
68
|
+
|
|
69
|
+
if isinstance(value, str):
|
|
70
|
+
# Convert wildcards to SQL LIKE patterns
|
|
71
|
+
if "*" in value or "?" in value:
|
|
72
|
+
pattern = value.replace("*", "%").replace("?", "_")
|
|
73
|
+
else:
|
|
74
|
+
pattern = value
|
|
75
|
+
path_conditions.append(f"json_extract_string(event, '{json_path}') LIKE '{pattern}'")
|
|
76
|
+
elif isinstance(value, (int, float)):
|
|
77
|
+
path_conditions.append(f"CAST(json_extract_string(event, '{json_path}') AS NUMERIC) = {value}")
|
|
78
|
+
elif isinstance(value, bool):
|
|
79
|
+
path_conditions.append(f"json_extract_string(event, '{json_path}') = '{str(value).lower()}'")
|
|
80
|
+
elif value is None:
|
|
81
|
+
path_conditions.append(f"json_extract_string(event, '{json_path}') IS NULL")
|
|
82
|
+
|
|
83
|
+
# OR conditions between different paths for same field
|
|
84
|
+
if path_conditions:
|
|
85
|
+
if len(path_conditions) == 1:
|
|
86
|
+
where_conditions.append(path_conditions[0])
|
|
87
|
+
else:
|
|
88
|
+
where_conditions.append(f"({' OR '.join(path_conditions)})")
|
|
89
|
+
|
|
90
|
+
# Build SELECT clause with rowid and discovered fields
|
|
91
|
+
select_parts = ["rowid"]
|
|
92
|
+
for key, paths in discovered.items():
|
|
93
|
+
for path in paths:
|
|
94
|
+
# Use actual path for JSON, full path for column alias
|
|
95
|
+
actual_path = path.split(":", 1)[1] if ":" in path else path
|
|
96
|
+
json_path = "$." + actual_path
|
|
97
|
+
select_parts.append(f"json_extract_string(event, '{json_path}') as \"{path}\"")
|
|
98
|
+
|
|
99
|
+
# Assemble final SQL query
|
|
100
|
+
sql = f"SELECT {', '.join(select_parts)} FROM events"
|
|
101
|
+
|
|
102
|
+
if where_conditions:
|
|
103
|
+
sql += " WHERE " + " AND ".join(where_conditions)
|
|
104
|
+
|
|
105
|
+
sql += f" ORDER BY rowid DESC LIMIT {limit}"
|
|
106
|
+
|
|
107
|
+
return sql, discovered
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Chrome DevTools Protocol Schema
|
|
2
|
+
|
|
3
|
+
This directory contains the CDP protocol schema and version information fetched from Chrome.
|
|
4
|
+
|
|
5
|
+
## Files
|
|
6
|
+
|
|
7
|
+
- `cdp_protocol.json` - Full CDP protocol schema with all domains, commands, events, and types
|
|
8
|
+
- `cdp_version.json` - Chrome version and protocol version information
|
|
9
|
+
|
|
10
|
+
## Fetching Latest Schema
|
|
11
|
+
|
|
12
|
+
To update these files with the latest protocol from your Chrome instance:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# Ensure Chrome is running with debugging enabled:
|
|
16
|
+
# google-chrome --remote-debugging-port=9222
|
|
17
|
+
|
|
18
|
+
# Fetch protocol schema
|
|
19
|
+
curl -s http://localhost:9222/json/protocol > cdp_protocol.json
|
|
20
|
+
|
|
21
|
+
# Fetch version info
|
|
22
|
+
curl -s http://localhost:9222/json/version | jq '.' > cdp_version.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Using the Schema
|
|
26
|
+
|
|
27
|
+
These files are useful for:
|
|
28
|
+
- Understanding available CDP commands and their parameters
|
|
29
|
+
- Debugging protocol issues
|
|
30
|
+
- Validating command usage
|
|
31
|
+
- Discovering new CDP features
|
|
32
|
+
|
|
33
|
+
## Example: Finding Fetch Commands
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# List all Fetch domain commands
|
|
37
|
+
cat cdp_protocol.json | jq '.domains[] | select(.domain == "Fetch") | .commands[].name'
|
|
38
|
+
|
|
39
|
+
# Get details for a specific command
|
|
40
|
+
cat cdp_protocol.json | jq '.domains[] | select(.domain == "Fetch") | .commands[] | select(.name == "continueResponse")'
|
|
41
|
+
```
|