webtap-tool 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webtap-tool might be problematic. Click here for more details.

@@ -0,0 +1,127 @@
1
+ """Setup commands for WebTap components."""
2
+
3
+ from webtap.app import app
4
+ from webtap.services.setup import SetupService
5
+
6
+
7
@app.command(
    display="markdown",
    typer={"name": "setup-filters", "help": "Download filter configuration from GitHub"},
    fastmcp={"enabled": False},
)
def setup_filters(state, force: bool = False) -> dict:
    """Install the filter configuration at ./.webtap/filters.json.

    Delegates the installation to ``SetupService.install_filters`` and renders
    the outcome with ``_format_setup_result``.

    Args:
        state: Application state passed in by the command framework (not used here).
        force: Overwrite an existing file (default: False)

    Returns:
        Markdown element dict carrying the success/error message.
    """
    outcome = SetupService().install_filters(force=force)
    return _format_setup_result(outcome, "filters")
24
+
25
+
26
@app.command(
    display="markdown",
    typer={"name": "setup-extension", "help": "Download Chrome extension from GitHub"},
    fastmcp={"enabled": False},
)
def setup_extension(state, force: bool = False) -> dict:
    """Install the Chrome extension under ~/.config/webtap/extension/.

    Delegates the installation to ``SetupService.install_extension`` and renders
    the outcome with ``_format_setup_result``.

    Args:
        state: Application state passed in by the command framework (not used here).
        force: Overwrite existing files (default: False)

    Returns:
        Markdown element dict carrying the success/error message.
    """
    outcome = SetupService().install_extension(force=force)
    return _format_setup_result(outcome, "extension")
43
+
44
+
45
@app.command(
    display="markdown",
    typer={"name": "setup-chrome", "help": "Install Chrome wrapper script for debugging"},
    fastmcp={"enabled": False},
)
def setup_chrome(state, force: bool = False) -> dict:
    """Install the Chrome wrapper at ~/.local/bin/wrappers/google-chrome-stable.

    Delegates the installation to ``SetupService.install_chrome_wrapper`` and
    renders the outcome with ``_format_setup_result``.

    Args:
        state: Application state passed in by the command framework (not used here).
        force: Overwrite an existing script (default: False)

    Returns:
        Markdown element dict carrying the success/error message.
    """
    outcome = SetupService().install_chrome_wrapper(force=force)
    return _format_setup_result(outcome, "chrome")
62
+
63
+
64
+ def _format_setup_result(result: dict, component: str) -> dict:
65
+ """Format setup result as markdown."""
66
+ elements = []
67
+
68
+ # Main message as alert (using "message" key for consistency)
69
+ level = "success" if result["success"] else "error"
70
+ elements.append({"type": "alert", "message": result["message"], "level": level})
71
+
72
+ # Add details if present
73
+ if result.get("path"):
74
+ elements.append({"type": "text", "content": f"**Location:** `{result['path']}`"})
75
+ if result.get("details"):
76
+ elements.append({"type": "text", "content": f"**Details:** {result['details']}"})
77
+
78
+ # Component-specific next steps
79
+ if result["success"]:
80
+ if component == "filters":
81
+ elements.append({"type": "text", "content": "\n**Next steps:**"})
82
+ elements.append(
83
+ {
84
+ "type": "list",
85
+ "items": [
86
+ "Run `filters('load')` to load the filters",
87
+ "Run `filters()` to see loaded categories",
88
+ ],
89
+ }
90
+ )
91
+ elif component == "extension":
92
+ elements.append({"type": "text", "content": "\n**To install in Chrome:**"})
93
+ elements.append(
94
+ {
95
+ "type": "list",
96
+ "items": [
97
+ "Open chrome://extensions/",
98
+ "Enable Developer mode",
99
+ "Click 'Load unpacked'",
100
+ f"Select {result['path']}",
101
+ ],
102
+ }
103
+ )
104
+ elif component == "chrome":
105
+ if "Add to PATH" in result.get("details", ""):
106
+ elements.append({"type": "text", "content": "\n**Setup PATH:**"})
107
+ elements.append(
108
+ {
109
+ "type": "code_block",
110
+ "language": "bash",
111
+ "content": 'export PATH="$HOME/.local/bin/wrappers:$PATH"',
112
+ }
113
+ )
114
+ elements.append({"type": "text", "content": "Add to ~/.bashrc to make permanent"})
115
+ else:
116
+ elements.append({"type": "text", "content": "\n**Usage:**"})
117
+ elements.append(
118
+ {
119
+ "type": "list",
120
+ "items": [
121
+ "Run `google-chrome-stable` to start Chrome with debugging",
122
+ "Or use `run-chrome` command for direct launch",
123
+ ],
124
+ }
125
+ )
126
+
127
+ return {"elements": elements}
webtap/filters.py ADDED
@@ -0,0 +1,289 @@
1
+ """Network request filter management for WebTap.
2
+
3
+ PUBLIC API:
4
+ - FilterManager: Main filter management class
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ from pathlib import Path
10
+ from typing import Dict, List, Any
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class FilterManager:
16
+ """Manages network request filters for noise reduction.
17
+
18
+ Provides filtering of CDP network events based on domain patterns and resource
19
+ types. Filters are organized into categories that can be enabled/disabled
20
+ independently. Supports wildcard patterns and generates SQL WHERE clauses
21
+ for efficient event filtering.
22
+
23
+ Attributes:
24
+ filter_path: Path to the filters.json file.
25
+ filters: Dict mapping category names to filter patterns.
26
+ enabled_categories: Set of currently enabled filter categories.
27
+ """
28
+
29
+ def __init__(self, filter_path: Path | None = None):
30
+ """Initialize filter manager.
31
+
32
+ Args:
33
+ filter_path: Path to filters.json file. Defaults to .webtap/filters.json.
34
+ """
35
+ self.filter_path = filter_path or (Path.cwd() / ".webtap" / "filters.json")
36
+ self.filters: Dict[str, Dict[str, List[str]]] = {}
37
+ self.enabled_categories: set[str] = set()
38
+
39
+ def load(self) -> bool:
40
+ """Load filters from disk.
41
+
42
+ Loads filter configuration from the JSON file and enables all categories
43
+ by default. Creates empty filter dict if file doesn't exist or fails to load.
44
+
45
+ Returns:
46
+ True if loaded successfully, False otherwise.
47
+ """
48
+ if self.filter_path.exists():
49
+ try:
50
+ with open(self.filter_path) as f:
51
+ self.filters = json.load(f)
52
+ # Enable all categories by default
53
+ self.enabled_categories = set(self.filters.keys())
54
+ logger.info(f"Loaded {len(self.filters)} filter categories from {self.filter_path}")
55
+ return True
56
+ except Exception as e:
57
+ logger.error(f"Failed to load filters: {e}")
58
+ self.filters = {}
59
+ return False
60
+ else:
61
+ logger.info(f"No filters found at {self.filter_path}")
62
+ self.filters = {}
63
+ return False
64
+
65
+ def save(self) -> bool:
66
+ """Save current filters to disk.
67
+
68
+ Creates the parent directory if it doesn't exist and writes the filter
69
+ configuration as JSON with indentation.
70
+
71
+ Returns:
72
+ True if saved successfully, False on error.
73
+ """
74
+ try:
75
+ self.filter_path.parent.mkdir(parents=True, exist_ok=True)
76
+ with open(self.filter_path, "w") as f:
77
+ json.dump(self.filters, f, indent=2)
78
+ logger.info(f"Saved filters to {self.filter_path}")
79
+ return True
80
+ except Exception as e:
81
+ logger.error(f"Failed to save filters: {e}")
82
+ return False
83
+
84
+ def add_pattern(self, pattern: str, category: str, pattern_type: str = "domain") -> bool:
85
+ """Add a filter pattern to a category.
86
+
87
+ Creates the category if it doesn't exist and enables it. Supports wildcard
88
+ patterns using * for matching. Patterns are deduplicated within categories.
89
+
90
+ Args:
91
+ pattern: Pattern to add (e.g., "*ads*", "googletagmanager.com").
92
+ category: Category name (e.g., "ads", "tracking").
93
+ pattern_type: "domain" or "type". Defaults to "domain".
94
+
95
+ Returns:
96
+ True if pattern was added, False if it already existed.
97
+ """
98
+ if category not in self.filters:
99
+ self.filters[category] = {"domains": [], "types": []}
100
+ self.enabled_categories.add(category)
101
+
102
+ key = "domains" if pattern_type == "domain" else "types"
103
+ if pattern not in self.filters[category][key]:
104
+ self.filters[category][key].append(pattern)
105
+ return True
106
+ return False
107
+
108
+ def remove_pattern(self, pattern: str, pattern_type: str = "domain") -> str:
109
+ """Remove a pattern from all categories.
110
+
111
+ Searches through all categories to find and remove the specified pattern.
112
+ Only removes the first occurrence found.
113
+
114
+ Args:
115
+ pattern: Pattern to remove.
116
+ pattern_type: "domain" or "type". Defaults to "domain".
117
+
118
+ Returns:
119
+ Category name it was removed from, or empty string if not found.
120
+ """
121
+ key = "domains" if pattern_type == "domain" else "types"
122
+ for category, filters in self.filters.items():
123
+ if pattern in filters.get(key, []):
124
+ filters[key].remove(pattern)
125
+ return category
126
+ return ""
127
+
128
+ def update_category(self, category: str, domains: List[str] | None = None, types: List[str] | None = None):
129
+ """Update or create a category with new patterns.
130
+
131
+ Creates the category if it doesn't exist and enables it. If patterns are
132
+ provided, they completely replace the existing patterns for that type.
133
+
134
+ Args:
135
+ category: Category name.
136
+ domains: List of domain patterns. None leaves existing unchanged.
137
+ types: List of type patterns. None leaves existing unchanged.
138
+ """
139
+ if category not in self.filters:
140
+ self.filters[category] = {"domains": [], "types": []}
141
+
142
+ if domains is not None:
143
+ self.filters[category]["domains"] = domains
144
+ if types is not None:
145
+ self.filters[category]["types"] = types
146
+
147
+ self.enabled_categories.add(category)
148
+
149
+ def delete_category(self, category: str) -> bool:
150
+ """Delete a filter category.
151
+
152
+ Removes the category and all its patterns. Also removes it from the
153
+ enabled categories set.
154
+
155
+ Args:
156
+ category: Category name to delete.
157
+
158
+ Returns:
159
+ True if category was deleted, False if it didn't exist.
160
+ """
161
+ if category in self.filters:
162
+ del self.filters[category]
163
+ self.enabled_categories.discard(category)
164
+ return True
165
+ return False
166
+
167
+ def set_enabled_categories(self, categories: List[str] | None = None):
168
+ """Set which categories are enabled for filtering.
169
+
170
+ Only enabled categories are used when generating SQL filter clauses.
171
+ Invalid category names are silently ignored.
172
+
173
+ Args:
174
+ categories: List of category names to enable. None enables all categories.
175
+ """
176
+ if categories is None:
177
+ self.enabled_categories = set(self.filters.keys())
178
+ else:
179
+ self.enabled_categories = set(categories) & set(self.filters.keys())
180
+
181
+ def get_filter_sql(self, use_all: bool = True, categories: List[str] | None = None) -> str:
182
+ """Generate SQL WHERE clause for filtering CDP events.
183
+
184
+ Creates SQL conditions to exclude network requests matching the filter
185
+ patterns. Handles wildcard patterns by converting them to SQL LIKE patterns
186
+ and properly escapes SQL strings.
187
+
188
+ Args:
189
+ use_all: Use all enabled categories. Defaults to True.
190
+ categories: Specific categories to use (overrides use_all).
191
+
192
+ Returns:
193
+ SQL WHERE clause string, or empty string if no filters apply.
194
+ """
195
+ if not self.filters:
196
+ return ""
197
+
198
+ # Determine which categories to use
199
+ if categories:
200
+ active_categories = set(categories) & set(self.filters.keys())
201
+ elif use_all:
202
+ active_categories = self.enabled_categories
203
+ else:
204
+ return ""
205
+
206
+ if not active_categories:
207
+ return ""
208
+
209
+ # Collect all patterns
210
+ all_domains = []
211
+ all_types = []
212
+
213
+ for category in active_categories:
214
+ all_domains.extend(self.filters[category].get("domains", []))
215
+ all_types.extend(self.filters[category].get("types", []))
216
+
217
+ # Build filter conditions - exclude matching items
218
+ exclude_conditions = []
219
+
220
+ # Domain filtering - exclude URLs matching these patterns
221
+ if all_domains:
222
+ for pattern in all_domains:
223
+ # Convert wildcard to SQL LIKE pattern, escape single quotes for SQL safety
224
+ sql_pattern = pattern.replace("'", "''").replace("*", "%")
225
+ # For Network.responseReceived events - filter on what's actually there
226
+ exclude_conditions.append(
227
+ f"json_extract_string(event, '$.params.response.url') NOT LIKE '{sql_pattern}'"
228
+ )
229
+
230
+ # Type filtering - exclude these types
231
+ if all_types:
232
+ # Escape single quotes in types for SQL safety
233
+ escaped_types = [t.replace("'", "''") for t in all_types]
234
+ type_list = ", ".join(f"'{t}'" for t in escaped_types)
235
+ # Use COALESCE to handle NULL types properly, exclude matching types
236
+ exclude_conditions.append(
237
+ f"(COALESCE(json_extract_string(event, '$.params.type'), '') NOT IN ({type_list}) OR "
238
+ f"json_extract_string(event, '$.params.type') IS NULL)"
239
+ )
240
+
241
+ if exclude_conditions:
242
+ # Use AND to ensure ALL conditions are met (item doesn't match ANY filter)
243
+ return f"({' AND '.join(exclude_conditions)})"
244
+
245
+ return ""
246
+
247
+ def get_status(self) -> Dict[str, Any]:
248
+ """Get current filter status and statistics.
249
+
250
+ Provides comprehensive information about loaded filters including
251
+ category counts, enabled status, and file path.
252
+
253
+ Returns:
254
+ Dict with filter information including loaded status, categories,
255
+ enabled categories, pattern counts, and file path.
256
+ """
257
+ return {
258
+ "loaded": bool(self.filters),
259
+ "categories": list(self.filters.keys()),
260
+ "enabled": list(self.enabled_categories),
261
+ "total_domains": sum(len(f.get("domains", [])) for f in self.filters.values()),
262
+ "total_types": sum(len(f.get("types", [])) for f in self.filters.values()),
263
+ "path": str(self.filter_path),
264
+ }
265
+
266
+ def get_display_info(self) -> str:
267
+ """Get formatted filter information for display.
268
+
269
+ Creates a human-readable summary of all filter categories with their
270
+ enabled status and pattern counts.
271
+
272
+ Returns:
273
+ Formatted multiline string with filter details.
274
+ """
275
+ if not self.filters:
276
+ return f"No filters loaded (would load from {self.filter_path})"
277
+
278
+ lines = [f"Loaded filters from {self.filter_path}:"]
279
+ for category in sorted(self.filters.keys()):
280
+ filters = self.filters[category]
281
+ enabled = "✓" if category in self.enabled_categories else "✗"
282
+ domains = len(filters.get("domains", []))
283
+ types = len(filters.get("types", []))
284
+ lines.append(f" {enabled} {category}: {domains} domains, {types} types")
285
+
286
+ return "\n".join(lines)
287
+
288
+
289
+ __all__ = ["FilterManager"]
@@ -0,0 +1,83 @@
1
+ # WebTap Services Layer
2
+
3
+ The services layer provides clean, reusable interfaces for querying and managing CDP events stored in DuckDB.
4
+
5
+ ## Architecture
6
+
7
+ ```
8
+ commands/ → services/ → cdp/session → DuckDB
9
+ ↓ ↓
10
+ API Properties/Methods
11
+ ```
12
+
13
+ ## Services
14
+
15
+ ### WebTapService (`main.py`)
16
+ Main orchestrator that manages all domain-specific services and CDP connection.
17
+
18
+ **Key Properties:**
19
+ - `event_count` - Total CDP events stored
20
+
21
+ **Key Methods:**
22
+ - `connect_to_page()` - Connect and enable CDP domains
23
+ - `disconnect()` - Clean disconnection
24
+ - `get_status()` - Comprehensive status with metrics from all services
25
+
26
+ ### FetchService (`fetch.py`)
27
+ Manages HTTP request/response interception.
28
+
29
+ **Key Properties:**
30
+ - `paused_count` - Number of paused requests
31
+
32
+ **Key Methods:**
33
+ - `get_paused_rowids()` - List of paused request rowids
34
+ - `enable()` / `disable()` - Control interception
35
+ - `continue_request()` / `fail_request()` - Process paused requests
36
+
37
+ ### NetworkService (`network.py`)
38
+ Queries network events (requests/responses).
39
+
40
+ **Key Properties:**
41
+ - `request_count` - Total network requests
42
+
43
+ **Key Methods:**
44
+ - `get_recent_requests()` - Network events with filter support
45
+ - `get_failed_requests()` - 4xx/5xx errors
46
+ - `get_request_by_id()` - All events for a request
47
+
48
+ ### ConsoleService (`console.py`)
49
+ Queries console messages and browser logs.
50
+
51
+ **Key Properties:**
52
+ - `message_count` - Total console messages
53
+ - `error_count` - Console errors only
54
+
55
+ **Key Methods:**
56
+ - `get_recent_messages()` - Console events with level filter
57
+ - `get_errors()` / `get_warnings()` - Filtered queries
58
+ - `clear_browser_console()` - CDP command to clear console
59
+
60
+ ## Design Principles
61
+
62
+ 1. **Rowid-Native**: All queries return rowid as primary identifier
63
+ 2. **Direct Queries**: No caching, query DuckDB on-demand
64
+ 3. **Properties for Counts**: Common counts exposed as properties
65
+ 4. **Methods for Queries**: Complex queries as methods with parameters
66
+ 5. **Service Isolation**: Each service manages its domain independently
67
+
68
+ ## Usage
69
+
70
+ Services are accessed through the WebTapState:
71
+
72
+ ```python
73
+ # In commands
74
+ @app.command()
75
+ def network(state):
76
+ results = state.service.network.get_recent_requests(limit=20)
77
+ count = state.service.network.request_count
78
+
79
+ # In API
80
+ @api.get("/status")
81
+ async def status():
82
+ return app_state.service.get_status()
83
+ ```
@@ -0,0 +1,15 @@
1
+ """WebTap service layer for managing CDP state and operations.
2
+
3
+ The service layer provides a clean interface between REPL commands/API endpoints
4
+ and the underlying CDP session. Services encapsulate domain-specific queries and
5
+ operations, making them reusable across different interfaces.
6
+
7
+ PUBLIC API:
8
+ - WebTapService: Main service orchestrating all domain services
9
+ - SetupService: Service for installing WebTap components
10
+ """
11
+
12
+ from webtap.services.main import WebTapService
13
+ from webtap.services.setup import SetupService
14
+
15
+ __all__ = ["WebTapService", "SetupService"]
@@ -0,0 +1,113 @@
1
+ """Body fetching service for response content."""
2
+
3
+ import base64
4
+ import json
5
+ import logging
6
+ from typing import TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from webtap.cdp import CDPSession
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class BodyService:
15
+ """Internal service for response body fetching and caching."""
16
+
17
+ def __init__(self):
18
+ """Initialize body service."""
19
+ self.cdp: CDPSession | None = None
20
+ self._body_cache: dict[str, dict] = {}
21
+
22
+ def get_response_body(self, rowid: int, use_cache: bool = True) -> dict:
23
+ """Fetch response body for a response.
24
+
25
+ Args:
26
+ rowid: Row ID from events table (Network or Fetch response)
27
+ use_cache: Whether to use cached body if available
28
+ """
29
+ if not self.cdp:
30
+ return {"error": "No CDP session"}
31
+
32
+ # Get event from DB to extract requestId
33
+ result = self.cdp.query("SELECT event FROM events WHERE rowid = ?", [rowid])
34
+
35
+ if not result:
36
+ return {"error": f"Event with rowid {rowid} not found"}
37
+
38
+ try:
39
+ event_data = json.loads(result[0][0])
40
+ except json.JSONDecodeError:
41
+ return {"error": "Failed to parse event data"}
42
+
43
+ method = event_data.get("method", "")
44
+ params = event_data.get("params", {})
45
+
46
+ # Handle both Fetch and Network events
47
+ if method == "Fetch.requestPaused":
48
+ # Fetch interception - verify it's response stage
49
+ if "responseStatusCode" not in params:
50
+ return {"error": "Not a response stage event (no responseStatusCode)"}
51
+ request_id = params.get("requestId")
52
+ domain = "Fetch"
53
+ elif method == "Network.responseReceived":
54
+ # Regular network response
55
+ request_id = params.get("requestId")
56
+ domain = "Network"
57
+ else:
58
+ return {"error": f"Not a response event (method: {method})"}
59
+
60
+ if not request_id:
61
+ return {"error": "No requestId in event"}
62
+
63
+ # Check cache
64
+ if use_cache and request_id in self._body_cache:
65
+ logger.debug(f"Using cached body for {request_id}")
66
+ return self._body_cache[request_id]
67
+
68
+ try:
69
+ # Fetch body from CDP using appropriate domain
70
+ logger.debug(f"Fetching body for {request_id} using {domain}.getResponseBody")
71
+ result = self.cdp.execute(f"{domain}.getResponseBody", {"requestId": request_id})
72
+
73
+ body_data = {"body": result.get("body", ""), "base64Encoded": result.get("base64Encoded", False)}
74
+
75
+ # Cache it for this request
76
+ if use_cache:
77
+ self._body_cache[request_id] = body_data
78
+ logger.debug(f"Cached body for {request_id}")
79
+
80
+ return body_data
81
+
82
+ except Exception as e:
83
+ logger.error(f"Failed to fetch body for {request_id}: {e}")
84
+ return {"error": str(e)}
85
+
86
+ def clear_cache(self):
87
+ """Clear all cached bodies."""
88
+ count = len(self._body_cache)
89
+ self._body_cache.clear()
90
+ logger.info(f"Cleared {count} cached bodies")
91
+ return count
92
+
93
+ def decode_body(self, body_content: str, is_base64: bool) -> str | bytes:
94
+ """Decode body content if base64 encoded.
95
+
96
+ Args:
97
+ body_content: The body content (possibly base64)
98
+ is_base64: Whether the content is base64 encoded
99
+ """
100
+ if not is_base64:
101
+ return body_content
102
+
103
+ try:
104
+ decoded = base64.b64decode(body_content)
105
+ # Try to decode as UTF-8 text
106
+ try:
107
+ return decoded.decode("utf-8")
108
+ except UnicodeDecodeError:
109
+ # Return as bytes for binary content
110
+ return decoded
111
+ except Exception as e:
112
+ logger.error(f"Failed to decode base64 body: {e}")
113
+ return body_content # Return original if decode fails