webtap-tool 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webtap-tool might be problematic. Click here for more details.
- webtap_tool-0.1.1/.gitignore +3 -0
- webtap_tool-0.1.1/ARCHITECTURE.md +150 -0
- webtap_tool-0.1.1/CHANGELOG.md +69 -0
- webtap_tool-0.1.1/PKG-INFO +427 -0
- webtap_tool-0.1.1/README.md +400 -0
- webtap_tool-0.1.1/data/filters.json +92 -0
- webtap_tool-0.1.1/extension/manifest.json +12 -0
- webtap_tool-0.1.1/extension/popup.html +181 -0
- webtap_tool-0.1.1/extension/popup.js +298 -0
- webtap_tool-0.1.1/llms.txt +310 -0
- webtap_tool-0.1.1/pyproject.toml +50 -0
- webtap_tool-0.1.1/src/webtap/VISION.md +234 -0
- webtap_tool-0.1.1/src/webtap/__init__.py +56 -0
- webtap_tool-0.1.1/src/webtap/api.py +222 -0
- webtap_tool-0.1.1/src/webtap/app.py +76 -0
- webtap_tool-0.1.1/src/webtap/cdp/README.md +268 -0
- webtap_tool-0.1.1/src/webtap/cdp/__init__.py +14 -0
- webtap_tool-0.1.1/src/webtap/cdp/query.py +107 -0
- webtap_tool-0.1.1/src/webtap/cdp/schema/README.md +41 -0
- webtap_tool-0.1.1/src/webtap/cdp/schema/cdp_protocol.json +32785 -0
- webtap_tool-0.1.1/src/webtap/cdp/schema/cdp_version.json +8 -0
- webtap_tool-0.1.1/src/webtap/cdp/session.py +365 -0
- webtap_tool-0.1.1/src/webtap/commands/DEVELOPER_GUIDE.md +314 -0
- webtap_tool-0.1.1/src/webtap/commands/TIPS.md +153 -0
- webtap_tool-0.1.1/src/webtap/commands/__init__.py +7 -0
- webtap_tool-0.1.1/src/webtap/commands/_builders.py +127 -0
- webtap_tool-0.1.1/src/webtap/commands/_errors.py +108 -0
- webtap_tool-0.1.1/src/webtap/commands/_tips.py +147 -0
- webtap_tool-0.1.1/src/webtap/commands/_utils.py +227 -0
- webtap_tool-0.1.1/src/webtap/commands/body.py +161 -0
- webtap_tool-0.1.1/src/webtap/commands/connection.py +168 -0
- webtap_tool-0.1.1/src/webtap/commands/console.py +69 -0
- webtap_tool-0.1.1/src/webtap/commands/events.py +109 -0
- webtap_tool-0.1.1/src/webtap/commands/fetch.py +219 -0
- webtap_tool-0.1.1/src/webtap/commands/filters.py +224 -0
- webtap_tool-0.1.1/src/webtap/commands/inspect.py +146 -0
- webtap_tool-0.1.1/src/webtap/commands/javascript.py +87 -0
- webtap_tool-0.1.1/src/webtap/commands/launch.py +86 -0
- webtap_tool-0.1.1/src/webtap/commands/navigation.py +199 -0
- webtap_tool-0.1.1/src/webtap/commands/network.py +85 -0
- webtap_tool-0.1.1/src/webtap/commands/setup.py +127 -0
- webtap_tool-0.1.1/src/webtap/filters.py +289 -0
- webtap_tool-0.1.1/src/webtap/services/README.md +83 -0
- webtap_tool-0.1.1/src/webtap/services/__init__.py +15 -0
- webtap_tool-0.1.1/src/webtap/services/body.py +113 -0
- webtap_tool-0.1.1/src/webtap/services/console.py +116 -0
- webtap_tool-0.1.1/src/webtap/services/fetch.py +397 -0
- webtap_tool-0.1.1/src/webtap/services/main.py +175 -0
- webtap_tool-0.1.1/src/webtap/services/network.py +105 -0
- webtap_tool-0.1.1/src/webtap/services/setup.py +219 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# WebTap Architecture
|
|
2
|
+
|
|
3
|
+
Implementation guide for WebTap commands following the VISION.
|
|
4
|
+
|
|
5
|
+
## Core Components
|
|
6
|
+
|
|
7
|
+
### CDPSession (cdp/session.py)
|
|
8
|
+
- WebSocket connection to Chrome
|
|
9
|
+
- DuckDB in-memory storage: `CREATE TABLE events (event JSON)`
|
|
10
|
+
- Events stored AS-IS: `INSERT INTO events VALUES (?)`
|
|
11
|
+
- Query interface: `query(sql)` - returns result rows
|
|
12
|
+
- Body fetching: `fetch_body(request_id)` - CDP call on-demand
|
|
13
|
+
|
|
14
|
+
### Command Pattern
|
|
15
|
+
|
|
16
|
+
Commands query DuckDB and return data for Replkit2 display.
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
@app.command
|
|
20
|
+
def network(state, query: dict = None):
|
|
21
|
+
"""Query network events with flexible filtering."""
|
|
22
|
+
|
|
23
|
+
# Default query
|
|
24
|
+
default = {
|
|
25
|
+
'limit': 20,
|
|
26
|
+
'exclude_static': True, # Skip images/fonts
|
|
27
|
+
'exclude_tracking': True # Skip analytics
|
|
28
|
+
}
|
|
29
|
+
q = {**default, **(query or {})}
|
|
30
|
+
|
|
31
|
+
# Build SQL from query dict
|
|
32
|
+
sql = build_network_sql(q)
|
|
33
|
+
|
|
34
|
+
# Return for Replkit2 display
|
|
35
|
+
return state.cdp.query(sql)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Command Implementation Guide
|
|
39
|
+
|
|
40
|
+
### network(query: dict)
|
|
41
|
+
```python
|
|
42
|
+
# Query dict can contain:
|
|
43
|
+
# - id: Single request detail
|
|
44
|
+
# - status: Filter by status code
|
|
45
|
+
# - method: Filter by HTTP method
|
|
46
|
+
# - url_contains: Substring match
|
|
47
|
+
# - limit: Result limit
|
|
48
|
+
# - exclude_static: Hide images/css/fonts
|
|
49
|
+
# - exclude_tracking: Hide analytics
|
|
50
|
+
|
|
51
|
+
# Build SQL:
|
|
52
|
+
SELECT
|
|
53
|
+
json_extract_string(event, '$.params.requestId') as id,
|
|
54
|
+
json_extract_string(event, '$.params.response.status') as status,
|
|
55
|
+
json_extract_string(event, '$.params.response.url') as url
|
|
56
|
+
FROM events
|
|
57
|
+
WHERE json_extract_string(event, '$.method') = 'Network.responseReceived'
|
|
58
|
+
AND [additional filters from query dict]
|
|
59
|
+
LIMIT 20
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### console(query: dict)
|
|
63
|
+
```python
|
|
64
|
+
# Query dict can contain:
|
|
65
|
+
# - level: 'error', 'warn', 'log'
|
|
66
|
+
# - source: 'console', 'network', 'security'
|
|
67
|
+
# - contains: Text search in message
|
|
68
|
+
# - limit: Result limit
|
|
69
|
+
|
|
70
|
+
# Build SQL:
|
|
71
|
+
SELECT
|
|
72
|
+
json_extract_string(event, '$.params.type') as level,
|
|
73
|
+
json_extract_string(event, '$.params.args[0].value') as message,
|
|
74
|
+
json_extract_string(event, '$.params.timestamp') as time
|
|
75
|
+
FROM events
|
|
76
|
+
WHERE json_extract_string(event, '$.method') IN ('Runtime.consoleAPICalled', 'Log.entryAdded')
|
|
77
|
+
AND [additional filters]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### body(id: str, expr: str = None)
|
|
81
|
+
```python
|
|
82
|
+
# Fetch body on-demand
|
|
83
|
+
result = state.cdp.fetch_body(id)
|
|
84
|
+
|
|
85
|
+
if not expr:
|
|
86
|
+
return result['body'] # Raw body
|
|
87
|
+
|
|
88
|
+
# Evaluate Python expression on body (like inspect command)
|
|
89
|
+
context = {
|
|
90
|
+
'data': result['body'],
|
|
91
|
+
'json': json.loads(result['body']) if parseable else None,
|
|
92
|
+
're': __import__('re')
|
|
93
|
+
}
|
|
94
|
+
return eval(expr, {}, context)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### inspect(query: dict, expr: str)
|
|
98
|
+
```python
|
|
99
|
+
# Query events then apply Python expression
|
|
100
|
+
events = state.cdp.query(build_sql(query))
|
|
101
|
+
|
|
102
|
+
# Apply expression to each event
|
|
103
|
+
results = []
|
|
104
|
+
for event in events:
|
|
105
|
+
context = {'event': json.loads(event[0])}
|
|
106
|
+
results.append(eval(expr, {}, context))
|
|
107
|
+
return results
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## SQL Patterns
|
|
111
|
+
|
|
112
|
+
### Fuzzy field matching
|
|
113
|
+
```sql
|
|
114
|
+
-- Match a 404 status at either known 'status' field path
|
|
115
|
+
SELECT * FROM events
|
|
116
|
+
WHERE json_extract_string(event, '$.params.response.status') = '404'
|
|
117
|
+
OR json_extract_string(event, '$.params.status') = '404'
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Correlation by requestId
|
|
121
|
+
```sql
|
|
122
|
+
-- Get all events for a request
|
|
123
|
+
SELECT event FROM events
|
|
124
|
+
WHERE json_extract_string(event, '$.params.requestId') = ?
|
|
125
|
+
ORDER BY rowid
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Exclude noise
|
|
129
|
+
```sql
|
|
130
|
+
-- Skip tracking/analytics and static assets (images, fonts, stylesheets)
|
|
131
|
+
WHERE json_extract_string(event, '$.params.request.url') NOT LIKE '%google-analytics%'
|
|
132
|
+
AND json_extract_string(event, '$.params.request.url') NOT LIKE '%doubleclick%'
|
|
133
|
+
AND json_extract_string(event, '$.params.type') NOT IN ('Image', 'Font', 'Stylesheet')
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Display Strategy
|
|
137
|
+
|
|
138
|
+
- **Lists**: Return list of dicts for Replkit2 table display
|
|
139
|
+
- **Details**: Return single dict for box display
|
|
140
|
+
- **Raw**: Return JSON strings for inspect/debug
|
|
141
|
+
|
|
142
|
+
Commands should NOT format output - let Replkit2 handle display based on `@app.command(display="table"|"markdown"|"raw")`.
|
|
143
|
+
|
|
144
|
+
## Future Commands
|
|
145
|
+
|
|
146
|
+
- `storage()` - Query cookies/localStorage via CDP
|
|
147
|
+
- `api()` - Discover API endpoints from traffic
|
|
148
|
+
- `har()` - Export to HAR format
|
|
149
|
+
- `intercept()` - Modify requests (requires Fetch domain)
|
|
150
|
+
- `timeline()` - Request/response correlation view
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
### Fixed
|
|
15
|
+
|
|
16
|
+
### Removed
|
|
17
|
+
|
|
18
|
+
## [0.1.1] - 2025-09-05
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
|
|
26
|
+
### Removed
|
|
27
|
+
|
|
28
|
+
## [0.1.0] - 2025-09-05
|
|
29
|
+
|
|
30
|
+
### Added
|
|
31
|
+
- Chrome DevTools Protocol (CDP) integration for browser debugging
|
|
32
|
+
- Native CDP Storage architecture using DuckDB for event storage
|
|
33
|
+
- Dynamic field discovery with fuzzy matching across all CDP events
|
|
34
|
+
- Network request/response monitoring with on-demand body fetching
|
|
35
|
+
- Console message capture with error tracking
|
|
36
|
+
- JavaScript execution in browser context via `js()` command
|
|
37
|
+
- Request interception and modification via `fetch()` command
|
|
38
|
+
- Chrome extension for visual page selection and debugging
|
|
39
|
+
- Bootstrap commands for downloading filters and extension (`setup-filters`, `setup-extension`)
|
|
40
|
+
- Chrome launcher command (`launch-chrome`) for debugging-enabled browser startup
|
|
41
|
+
- FastAPI server on port 8765 for Chrome extension integration
|
|
42
|
+
- Comprehensive filter system (ads, tracking, analytics, CDN, consent, monitoring)
|
|
43
|
+
- Events query system for flexible CDP event exploration
|
|
44
|
+
- Inspect command with Python environment for data analysis
|
|
45
|
+
- Svelte Debug Protocol (SDP) experimental support for Svelte app debugging
|
|
46
|
+
- Service layer architecture with clean dependency injection
|
|
47
|
+
- Markdown-based output formatting for all commands
|
|
48
|
+
- MCP (Model Context Protocol) support via ReplKit2
|
|
49
|
+
- CLI mode with Typer integration
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
- **BREAKING**: Removed single `bootstrap` command, replaced with separate setup commands
|
|
53
|
+
- **BREAKING**: `eval()` and `exec()` commands replaced by unified `js()` command
|
|
54
|
+
- **BREAKING**: All commands now return markdown dictionaries instead of plain text
|
|
55
|
+
- Aligned with ReplKit2 v0.11.0 API changes (`typer_config` instead of `cli_config`)
|
|
56
|
+
- Store CDP events as-is without transformation (Native CDP Storage philosophy)
|
|
57
|
+
- Connection errors return error responses instead of raising exceptions
|
|
58
|
+
- Standardized command pattern with unified builders and error handling
|
|
59
|
+
|
|
60
|
+
### Fixed
|
|
61
|
+
- CLI mode parameter handling for dict/list types
|
|
62
|
+
- Type checking errors with proper null checks
|
|
63
|
+
- Import order issues in CLI mode
|
|
64
|
+
- Shell completion options properly hidden in CLI mode
|
|
65
|
+
|
|
66
|
+
<!--
|
|
67
|
+
When you run 'relkit bump', the [Unreleased] section will automatically
|
|
68
|
+
become the new version section. Make sure to add your changes above!
|
|
69
|
+
-->
|
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: webtap-tool
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Terminal-based web page inspector for AI debugging sessions
|
|
5
|
+
Author-email: Fredrik Angelsen <fredrikangelsen@gmail.com>
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
8
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
|
|
9
|
+
Classifier: Topic :: Software Development :: Debuggers
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Requires-Dist: beautifulsoup4>=4.13.5
|
|
12
|
+
Requires-Dist: cryptography>=45.0.6
|
|
13
|
+
Requires-Dist: duckdb>=1.3.2
|
|
14
|
+
Requires-Dist: fastapi>=0.116.1
|
|
15
|
+
Requires-Dist: httpx>=0.28.1
|
|
16
|
+
Requires-Dist: lxml>=6.0.1
|
|
17
|
+
Requires-Dist: msgpack-python>=0.5.6
|
|
18
|
+
Requires-Dist: protobuf>=6.32.0
|
|
19
|
+
Requires-Dist: pyjwt>=2.10.1
|
|
20
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
21
|
+
Requires-Dist: replkit2[all]>=0.11.0
|
|
22
|
+
Requires-Dist: requests>=2.32.4
|
|
23
|
+
Requires-Dist: uvicorn>=0.35.0
|
|
24
|
+
Requires-Dist: websocket-client>=1.8.0
|
|
25
|
+
Requires-Dist: websockets>=15.0.1
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# WebTap
|
|
29
|
+
|
|
30
|
+
Browser debugging via Chrome DevTools Protocol with native event storage and dynamic querying.
|
|
31
|
+
|
|
32
|
+
## Overview
|
|
33
|
+
|
|
34
|
+
WebTap connects to Chrome's debugging protocol and stores CDP events as-is in DuckDB, enabling powerful SQL queries and dynamic field discovery without complex transformations.
|
|
35
|
+
|
|
36
|
+
## Key Features
|
|
37
|
+
|
|
38
|
+
- **Native CDP Storage** - Events stored exactly as received in DuckDB
|
|
39
|
+
- **Dynamic Field Discovery** - Automatically indexes all field paths from events
|
|
40
|
+
- **Smart Filtering** - Built-in filters for ads, tracking, analytics noise
|
|
41
|
+
- **SQL Querying** - Direct DuckDB access for complex analysis
|
|
42
|
+
- **Chrome Extension** - Visual page selector and connection management
|
|
43
|
+
- **Python Inspection** - Full Python environment for data exploration
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Install with uv
|
|
49
|
+
uv tool install webtap-tool
|
|
50
|
+
|
|
51
|
+
# Or from source
|
|
52
|
+
cd packages/webtap
|
|
53
|
+
uv sync
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
1. **Start Chrome with debugging**
|
|
59
|
+
```bash
|
|
60
|
+
# macOS
|
|
61
|
+
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
|
|
62
|
+
|
|
63
|
+
# Linux
|
|
64
|
+
google-chrome --remote-debugging-port=9222
|
|
65
|
+
|
|
66
|
+
# Windows
|
|
67
|
+
chrome.exe --remote-debugging-port=9222
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
2. **Launch WebTap**
|
|
71
|
+
```bash
|
|
72
|
+
webtap
|
|
73
|
+
|
|
74
|
+
# You'll see:
|
|
75
|
+
================================================================================
|
|
76
|
+
WebTap - Chrome DevTools Protocol REPL
|
|
77
|
+
--------------------------------------------------------------------------------
|
|
78
|
+
Type help() for available commands
|
|
79
|
+
>>>
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
3. **Connect and explore**
|
|
83
|
+
```python
|
|
84
|
+
>>> pages() # List available Chrome pages
|
|
85
|
+
>>> connect(0) # Connect to first page
|
|
86
|
+
>>> network() # View network requests (filtered)
|
|
87
|
+
>>> console() # View console messages
|
|
88
|
+
>>> events({"url": "*api*"}) # Query any CDP field dynamically
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Core Commands
|
|
92
|
+
|
|
93
|
+
### Connection & Navigation
|
|
94
|
+
```python
|
|
95
|
+
pages() # List Chrome pages
|
|
96
|
+
connect(0) # Connect by index (shorthand)
|
|
97
|
+
connect(page=1) # Connect by index (explicit)
|
|
98
|
+
connect(page_id="xyz") # Connect by page ID
|
|
99
|
+
disconnect() # Disconnect from current page
|
|
100
|
+
navigate("https://...") # Navigate to URL
|
|
101
|
+
reload(ignore_cache=False) # Reload page
|
|
102
|
+
back() / forward() # Navigate history
|
|
103
|
+
page() # Show current page info
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Dynamic Event Querying
|
|
107
|
+
```python
|
|
108
|
+
# Query ANY field across ALL event types using dict filters
|
|
109
|
+
events({"url": "*github*"}) # Find GitHub requests
|
|
110
|
+
events({"status": 404}) # Find all 404s
|
|
111
|
+
events({"type": "xhr", "method": "POST"}) # Find AJAX POSTs
|
|
112
|
+
events({"headers": "*"}) # Extract all headers
|
|
113
|
+
|
|
114
|
+
# Field names are fuzzy-matched and case-insensitive
|
|
115
|
+
events({"URL": "*api*"}) # Works! Finds 'url', 'URL', 'documentURL'
|
|
116
|
+
events({"err": "*"}) # Finds 'error', 'errorText', 'err'
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Network Monitoring
|
|
120
|
+
```python
|
|
121
|
+
network() # Filtered network requests (default)
|
|
122
|
+
network(no_filters=True) # Show everything (noisy!)
|
|
123
|
+
network(filters=["ads", "tracking"]) # Specific filter categories
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Filter Management
|
|
127
|
+
```python
|
|
128
|
+
# Manage noise filters
|
|
129
|
+
filters() # Show current filters (default action="list")
|
|
130
|
+
filters(action="load") # Load from .webtap/filters.json
|
|
131
|
+
filters(action="add", config={"domain": "*doubleclick*", "category": "ads"})
|
|
132
|
+
filters(action="save") # Persist to disk
|
|
133
|
+
filters(action="toggle", config={"category": "ads"}) # Toggle category
|
|
134
|
+
|
|
135
|
+
# Built-in categories: ads, tracking, analytics, telemetry, cdn, fonts, images
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Data Inspection
|
|
139
|
+
```python
|
|
140
|
+
# Inspect events by rowid
|
|
141
|
+
inspect(49) # View event details by rowid
|
|
142
|
+
inspect(50, expr="data['params']['response']['headers']") # Extract field
|
|
143
|
+
|
|
144
|
+
# Response body inspection with Python expressions
|
|
145
|
+
body(49) # Get response body
|
|
146
|
+
body(49, expr="import json; json.loads(body)") # Parse JSON
|
|
147
|
+
body(49, expr="len(body)") # Check size
|
|
148
|
+
|
|
149
|
+
# Request interception
|
|
150
|
+
fetch("enable") # Enable request interception
|
|
151
|
+
fetch("disable") # Disable request interception
|
|
152
|
+
requests() # Show paused requests
|
|
153
|
+
resume(123) # Continue paused request by ID
|
|
154
|
+
fail(123) # Fail paused request by ID
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Console & JavaScript
|
|
158
|
+
```python
|
|
159
|
+
console() # View console messages
|
|
160
|
+
js("document.title") # Evaluate JavaScript (returns value)
|
|
161
|
+
js("console.log('Hello')", wait_return=False) # Execute without waiting
|
|
162
|
+
clear() # Clear events (default)
|
|
163
|
+
clear(console=True) # Clear browser console
|
|
164
|
+
clear(events=True, console=True, cache=True) # Clear everything
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Architecture
|
|
168
|
+
|
|
169
|
+
### Native CDP Storage Philosophy
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
Chrome Tab
|
|
173
|
+
↓ CDP Events (WebSocket)
|
|
174
|
+
DuckDB Storage (events table)
|
|
175
|
+
↓ SQL Queries + Field Discovery
|
|
176
|
+
Service Layer (WebTapService)
|
|
177
|
+
├── NetworkService - Request filtering
|
|
178
|
+
├── ConsoleService - Message handling
|
|
179
|
+
├── FetchService - Request interception
|
|
180
|
+
└── BodyService - Response caching
|
|
181
|
+
↓
|
|
182
|
+
Commands (Thin Wrappers)
|
|
183
|
+
├── events() - Query any field
|
|
184
|
+
├── network() - Filtered requests
|
|
185
|
+
├── console() - Messages
|
|
186
|
+
├── body() - Response bodies
|
|
187
|
+
└── js() - JavaScript execution
|
|
188
|
+
↓
|
|
189
|
+
API Server (FastAPI on :8765)
|
|
190
|
+
└── Chrome Extension Integration
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### How It Works
|
|
194
|
+
|
|
195
|
+
1. **Events stored as-is** - No transformation, full CDP data preserved
|
|
196
|
+
2. **Field paths indexed** - Every unique path like `params.response.status` tracked
|
|
197
|
+
3. **Dynamic discovery** - Fuzzy matching finds fields without schemas
|
|
198
|
+
4. **SQL generation** - User queries converted to DuckDB JSON queries
|
|
199
|
+
5. **On-demand fetching** - Bodies, cookies fetched only when needed
|
|
200
|
+
|
|
201
|
+
## Advanced Usage
|
|
202
|
+
|
|
203
|
+
### Direct SQL Queries
|
|
204
|
+
```python
|
|
205
|
+
# Access DuckDB directly
|
|
206
|
+
sql = """
|
|
207
|
+
SELECT json_extract_string(event, '$.params.response.url') as url,
|
|
208
|
+
json_extract_string(event, '$.params.response.status') as status
|
|
209
|
+
FROM events
|
|
210
|
+
WHERE json_extract_string(event, '$.method') = 'Network.responseReceived'
|
|
211
|
+
"""
|
|
212
|
+
results = state.cdp.query(sql)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Field Discovery
|
|
216
|
+
```python
|
|
217
|
+
# See what fields are available
|
|
218
|
+
state.cdp.field_paths.keys() # All discovered field names
|
|
219
|
+
|
|
220
|
+
# Find all paths for a field
|
|
221
|
+
state.cdp.discover_field_paths("url")
|
|
222
|
+
# Returns: ['params.request.url', 'params.response.url', 'params.documentURL', ...]
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Direct CDP Access
|
|
226
|
+
```python
|
|
227
|
+
# Send CDP commands directly
|
|
228
|
+
state.cdp.execute("Network.getResponseBody", {"requestId": "123"})
|
|
229
|
+
state.cdp.execute("Storage.getCookies", {})
|
|
230
|
+
state.cdp.execute("Runtime.evaluate", {"expression": "window.location.href"})
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
### Chrome Extension
|
|
234
|
+
|
|
235
|
+
Install the extension from `packages/webtap/extension/`:
|
|
236
|
+
1. Open `chrome://extensions/`
|
|
237
|
+
2. Enable Developer mode
|
|
238
|
+
3. Load unpacked → Select extension folder
|
|
239
|
+
4. Click extension icon to connect to pages
|
|
240
|
+
|
|
241
|
+
## Examples
|
|
242
|
+
|
|
243
|
+
### List and Connect to Pages
|
|
244
|
+
```python
|
|
245
|
+
>>> pages()
|
|
246
|
+
## Chrome Pages
|
|
247
|
+
|
|
248
|
+
| Index | Title | URL | ID | Connected |
|
|
249
|
+
|:------|:---------------------|:-------------------------------|:-------|:----------|
|
|
250
|
+
| 0 | Messenger | https://www.m...1743198803269/ | DC8... | No |
|
|
251
|
+
| 1 | GitHub - replkit2 | https://githu...elsen/replkit2 | DD4... | No |
|
|
252
|
+
| 2 | YouTube Music | https://music.youtube.com/ | F83... | No |
|
|
253
|
+
|
|
254
|
+
_3 pages available_
|
|
255
|
+
<pages: 1 fields>
|
|
256
|
+
|
|
257
|
+
>>> connect(1)
|
|
258
|
+
## Connection Established
|
|
259
|
+
|
|
260
|
+
**Page:** GitHub - angelsen/replkit2
|
|
261
|
+
|
|
262
|
+
**URL:** https://github.com/angelsen/replkit2
|
|
263
|
+
<connect: 1 fields>
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Monitor Network Traffic
|
|
267
|
+
```python
|
|
268
|
+
>>> network()
|
|
269
|
+
## Network Requests
|
|
270
|
+
|
|
271
|
+
| ID | ReqID | Method | Status | URL | Type | Size |
|
|
272
|
+
|:-----|:-------------|:-------|:-------|:------------------------------------------------|:---------|:-----|
|
|
273
|
+
| 3264 | 682214.9033 | GET | 200 | https://api.github.com/graphql | Fetch | 22KB |
|
|
274
|
+
| 2315 | 682214.8985 | GET | 200 | https://api.github.com/repos/angelsen/replkit2 | Fetch | 16KB |
|
|
275
|
+
| 359 | 682214.8638 | GET | 200 | https://github.githubassets.com/assets/app.js | Script | 21KB |
|
|
276
|
+
|
|
277
|
+
_3 requests_
|
|
278
|
+
|
|
279
|
+
### Next Steps
|
|
280
|
+
|
|
281
|
+
- **Analyze responses:** `body(3264)` - fetch response body
|
|
282
|
+
- **Parse HTML:** `body(3264, "bs4(body, 'html.parser').find('title').text")`
|
|
283
|
+
- **Extract JSON:** `body(3264, "json.loads(body)['data']")`
|
|
284
|
+
- **Find patterns:** `body(3264, "re.findall(r'/api/\\w+', body)")`
|
|
285
|
+
- **Decode JWT:** `body(3264, "jwt.decode(body, options={'verify_signature': False})")`
|
|
286
|
+
- **Search events:** `events({'url': '*api*'})` - find all API calls
|
|
287
|
+
- **Intercept traffic:** `fetch('enable')` then `requests()` - pause and modify
|
|
288
|
+
<network: 1 fields>
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### View Console Messages
|
|
292
|
+
```python
|
|
293
|
+
>>> console()
|
|
294
|
+
## Console Messages
|
|
295
|
+
|
|
296
|
+
| ID | Level | Source | Message | Time |
|
|
297
|
+
|:-----|:-----------|:---------|:----------------------------------------------------------------|:---------|
|
|
298
|
+
| 5939 | WARNING | security | An iframe which has both allow-scripts and allow-same-origin... | 11:42:46 |
|
|
299
|
+
| 2319 | LOG | console | API request completed | 11:42:40 |
|
|
300
|
+
| 32 | ERROR | network | Failed to load resource: the server responded with a status... | 12:47:41 |
|
|
301
|
+
|
|
302
|
+
_3 messages_
|
|
303
|
+
|
|
304
|
+
### Next Steps
|
|
305
|
+
|
|
306
|
+
- **Inspect error:** `inspect(32)` - view full stack trace
|
|
307
|
+
- **Find all errors:** `events({'level': 'error'})` - filter console errors
|
|
308
|
+
- **Extract stack:** `inspect(32, "data.get('stackTrace', {})")`
|
|
309
|
+
- **Search messages:** `events({'message': '*failed*'})` - pattern match
|
|
310
|
+
- **Check network:** `network()` - may show failed requests causing errors
|
|
311
|
+
<console: 1 fields>
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Find and Analyze API Calls
|
|
315
|
+
```python
|
|
316
|
+
>>> events({"url": "*api*", "method": "POST"})
|
|
317
|
+
## Query Results
|
|
318
|
+
|
|
319
|
+
| RowID | Method | URL | Status |
|
|
320
|
+
|:------|:----------------------------|:--------------------------------|:-------|
|
|
321
|
+
| 49 | Network.requestWillBeSent | https://api.github.com/graphql | - |
|
|
322
|
+
| 50 | Network.responseReceived | https://api.github.com/graphql | 200 |
|
|
323
|
+
|
|
324
|
+
_2 events_
|
|
325
|
+
<events: 1 fields>
|
|
326
|
+
|
|
327
|
+
>>> body(50, expr="import json; json.loads(body)['data']")
|
|
328
|
+
{'viewer': {'login': 'octocat', 'name': 'The Octocat'}}
|
|
329
|
+
|
|
330
|
+
>>> inspect(49) # View full request details
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
### Debug Failed Requests
|
|
334
|
+
```python
|
|
335
|
+
>>> events({"status": 404})
|
|
336
|
+
## Query Results
|
|
337
|
+
|
|
338
|
+
| RowID | Method | URL | Status |
|
|
339
|
+
|:------|:-------------------------|:----------------------------------|:-------|
|
|
340
|
+
| 32 | Network.responseReceived | https://api.example.com/missing | 404 |
|
|
341
|
+
| 29 | Network.responseReceived | https://api.example.com/notfound | 404 |
|
|
342
|
+
|
|
343
|
+
_2 events_
|
|
344
|
+
<events: 1 fields>
|
|
345
|
+
|
|
346
|
+
>>> events({"errorText": "*"}) # Find network errors
|
|
347
|
+
>>> events({"type": "Failed"}) # Find failed resources
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
### Monitor Specific Domains
|
|
351
|
+
```python
|
|
352
|
+
>>> events({"url": "*myapi.com*"}) # Your API
|
|
353
|
+
>>> events({"url": "*localhost*"}) # Local development
|
|
354
|
+
>>> events({"url": "*stripe*"}) # Payment APIs
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
### Extract Headers and Cookies
|
|
358
|
+
```python
|
|
359
|
+
>>> events({"headers": "*authorization*"}) # Find auth headers
|
|
360
|
+
>>> state.cdp.execute("Storage.getCookies", {}) # Get all cookies
|
|
361
|
+
>>> events({"setCookie": "*"}) # Find Set-Cookie headers
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
## Filter Configuration
|
|
365
|
+
|
|
366
|
+
WebTap includes aggressive default filters to reduce noise. Customize in `.webtap/filters.json`:
|
|
367
|
+
|
|
368
|
+
```json
|
|
369
|
+
{
|
|
370
|
+
"ads": {
|
|
371
|
+
"domains": ["*doubleclick*", "*googlesyndication*", "*adsystem*"],
|
|
372
|
+
"types": ["Ping", "Beacon"]
|
|
373
|
+
},
|
|
374
|
+
"tracking": {
|
|
375
|
+
"domains": ["*google-analytics*", "*segment*", "*mixpanel*"],
|
|
376
|
+
"types": ["Image", "Script"]
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
## Design Principles
|
|
382
|
+
|
|
383
|
+
1. **Store AS-IS** - No transformation of CDP events
|
|
384
|
+
2. **Query On-Demand** - Extract only what's needed
|
|
385
|
+
3. **Dynamic Discovery** - No predefined schemas
|
|
386
|
+
4. **SQL-First** - Leverage DuckDB's JSON capabilities
|
|
387
|
+
5. **Minimal Memory** - Store only CDP data
|
|
388
|
+
|
|
389
|
+
## Requirements
|
|
390
|
+
|
|
391
|
+
- Chrome/Chromium with debugging enabled
|
|
392
|
+
- Python 3.12+
|
|
393
|
+
- Dependencies: websocket-client, duckdb, replkit2, fastapi, uvicorn, beautifulsoup4
|
|
394
|
+
|
|
395
|
+
## Development
|
|
396
|
+
|
|
397
|
+
```bash
|
|
398
|
+
# Run from source
|
|
399
|
+
cd packages/webtap
|
|
400
|
+
uv run webtap
|
|
401
|
+
|
|
402
|
+
# API server starts automatically on port 8765
|
|
403
|
+
# Chrome extension connects to http://localhost:8765
|
|
404
|
+
|
|
405
|
+
# Type checking and linting
|
|
406
|
+
basedpyright packages/webtap/src/webtap
|
|
407
|
+
ruff check --fix packages/webtap/src/webtap
|
|
408
|
+
ruff format packages/webtap/src/webtap
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
## API Server
|
|
412
|
+
|
|
413
|
+
WebTap automatically starts a FastAPI server on port 8765 for Chrome extension integration:
|
|
414
|
+
|
|
415
|
+
- `GET /status` - Connection status
|
|
416
|
+
- `GET /pages` - List available Chrome pages
|
|
417
|
+
- `POST /connect` - Connect to a page
|
|
418
|
+
- `POST /disconnect` - Disconnect from current page
|
|
419
|
+
- `POST /clear` - Clear events/console/cache
|
|
420
|
+
- `GET /fetch/paused` - Get paused requests
|
|
421
|
+
- `POST /filters/toggle/{category}` - Toggle filter categories
|
|
422
|
+
|
|
423
|
+
The API server runs in a background thread and doesn't block the REPL.
|
|
424
|
+
|
|
425
|
+
## License
|
|
426
|
+
|
|
427
|
+
MIT - See [LICENSE](../../LICENSE) for details.
|