PyPI - webtap-tool - Versions diffs - 0.1.1__py3-none-any.whl - Mend

webtap-tool 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of webtap-tool might be problematic. Click here for more details.

Files changed (43) hide show

webtap/VISION.md +234 -0
webtap/__init__.py +56 -0
webtap/api.py +222 -0
webtap/app.py +76 -0
webtap/cdp/README.md +268 -0
webtap/cdp/__init__.py +14 -0
webtap/cdp/query.py +107 -0
webtap/cdp/schema/README.md +41 -0
webtap/cdp/schema/cdp_protocol.json +32785 -0
webtap/cdp/schema/cdp_version.json +8 -0
webtap/cdp/session.py +365 -0
webtap/commands/DEVELOPER_GUIDE.md +314 -0
webtap/commands/TIPS.md +153 -0
webtap/commands/__init__.py +7 -0
webtap/commands/_builders.py +127 -0
webtap/commands/_errors.py +108 -0
webtap/commands/_tips.py +147 -0
webtap/commands/_utils.py +227 -0
webtap/commands/body.py +161 -0
webtap/commands/connection.py +168 -0
webtap/commands/console.py +69 -0
webtap/commands/events.py +109 -0
webtap/commands/fetch.py +219 -0
webtap/commands/filters.py +224 -0
webtap/commands/inspect.py +146 -0
webtap/commands/javascript.py +87 -0
webtap/commands/launch.py +86 -0
webtap/commands/navigation.py +199 -0
webtap/commands/network.py +85 -0
webtap/commands/setup.py +127 -0
webtap/filters.py +289 -0
webtap/services/README.md +83 -0
webtap/services/__init__.py +15 -0
webtap/services/body.py +113 -0
webtap/services/console.py +116 -0
webtap/services/fetch.py +397 -0
webtap/services/main.py +175 -0
webtap/services/network.py +105 -0
webtap/services/setup.py +219 -0
webtap_tool-0.1.1.dist-info/METADATA +427 -0
webtap_tool-0.1.1.dist-info/RECORD +43 -0
webtap_tool-0.1.1.dist-info/WHEEL +4 -0
webtap_tool-0.1.1.dist-info/entry_points.txt +2 -0

webtap/cdp/schema/cdp_version.json ADDED Viewed

@@ -0,0 +1,8 @@
+{
+  "Browser": "Chrome/139.0.7258.138",
+  "Protocol-Version": "1.3",
+  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
+  "V8-Version": "13.9.205.20",
+  "WebKit-Version": "537.36 (@884e54ea8d42947ed636779015c5b4815e069838)",
+  "webSocketDebuggerUrl": "ws://localhost:9222/devtools/browser/e2c22d46-fafc-483e-a512-caccea649b20"
+}

webtap/cdp/session.py ADDED Viewed

@@ -0,0 +1,365 @@
+"""CDP Session with native event storage.
+PUBLIC API:
+  - CDPSession: WebSocket-based CDP client with DuckDB event storage
+"""
+import json
+import logging
+import threading
+from concurrent.futures import Future, TimeoutError
+from typing import Any
+import duckdb
+import requests
+import websocket
+logger = logging.getLogger(__name__)
+class CDPSession:
+    """WebSocket-based CDP client with native event storage.
+    Stores CDP events as-is in DuckDB for minimal overhead and maximum flexibility.
+    Provides field discovery and query capabilities for dynamic data exploration.
+    Attributes:
+        port: Chrome debugging port.
+        timeout: Default timeout for execute() calls.
+        db: DuckDB connection for event storage.
+        field_paths: Live field lookup for query building.
+    """
+    def __init__(self, port: int = 9222, timeout: float = 30):
+        """Initialize CDP session with WebSocket and DuckDB storage.
+        Args:
+            port: Chrome debugging port. Defaults to 9222.
+            timeout: Default timeout for execute() calls. Defaults to 30.
+        """
+        self.port = port
+        self.timeout = timeout
+        # WebSocketApp instance
+        self.ws_app: websocket.WebSocketApp | None = None
+        self.ws_thread: threading.Thread | None = None
+        # Connection state
+        self.connected = threading.Event()
+        self.page_info: dict | None = None
+        # CDP request/response tracking
+        self._next_id = 1
+        self._pending: dict[int, Future] = {}
+        self._lock = threading.Lock()
+        # DuckDB storage - store events AS-IS
+        self.db = duckdb.connect(":memory:")
+        self.db.execute("CREATE TABLE events (event JSON)")
+        # Live field path lookup for fast discovery
+        # Maps lowercase field names to their full paths with original case
+        self.field_paths: dict[str, set[str]] = {}
+    def list_pages(self) -> list[dict]:
+        """List available Chrome pages via HTTP API.
+        Returns:
+            List of page dictionaries with webSocketDebuggerUrl.
+        """
+        try:
+            resp = requests.get(f"http://localhost:{self.port}/json", timeout=2)
+            resp.raise_for_status()
+            pages = resp.json()
+            return [p for p in pages if p.get("type") == "page" and "webSocketDebuggerUrl" in p]
+        except Exception as e:
+            logger.error(f"Failed to list pages: {e}")
+            return []
+    def connect(self, page_index: int | None = None, page_id: str | None = None) -> None:
+        """Connect to Chrome page via WebSocket.
+        Establishes WebSocket connection and starts event collection.
+        Does not auto-enable CDP domains - use execute() for that.
+        Args:
+            page_index: Index of page to connect to. Defaults to 0.
+            page_id: Stable page ID across tab reordering.
+        Raises:
+            RuntimeError: If already connected or no pages available.
+            ValueError: If page_id not found.
+            IndexError: If page_index out of range.
+            TimeoutError: If connection fails within 5 seconds.
+        """
+        if self.ws_app:
+            raise RuntimeError("Already connected")
+        pages = self.list_pages()
+        if not pages:
+            raise RuntimeError("No pages available")
+        # Find the page by ID or index
+        if page_id:
+            page = next((p for p in pages if p.get("id") == page_id), None)
+            if not page:
+                raise ValueError(f"Page with ID {page_id} not found")
+        elif page_index is not None:
+            if page_index >= len(pages):
+                raise IndexError(f"Page {page_index} out of range")
+            page = pages[page_index]
+        else:
+            # Default to first page
+            page = pages[0]
+        ws_url = page["webSocketDebuggerUrl"]
+        self.page_info = page
+        # Create WebSocketApp with callbacks
+        self.ws_app = websocket.WebSocketApp(
+            ws_url, on_open=self._on_open, on_message=self._on_message, on_error=self._on_error, on_close=self._on_close
+        )
+        # Let WebSocketApp handle everything in a thread
+        self.ws_thread = threading.Thread(
+            target=self.ws_app.run_forever,
+            kwargs={
+                "ping_interval": 30,  # Ping every 30s
+                "ping_timeout": 10,  # Wait 10s for pong
+                "reconnect": 5,  # Auto-reconnect with max 5s delay
+                "skip_utf8_validation": True,  # Faster
+            },
+        )
+        self.ws_thread.daemon = True
+        self.ws_thread.start()
+        # Wait for connection
+        if not self.connected.wait(timeout=5):
+            self.disconnect()
+            raise TimeoutError("Failed to connect to Chrome")
+    def disconnect(self) -> None:
+        """Disconnect WebSocket and clean up resources."""
+        if self.ws_app:
+            self.ws_app.close()
+            self.ws_app = None
+        if self.ws_thread and self.ws_thread.is_alive():
+            self.ws_thread.join(timeout=2)
+            self.ws_thread = None
+        self.connected.clear()
+        self.page_info = None
+    def send(self, method: str, params: dict | None = None) -> Future:
+        """Send CDP command asynchronously.
+        Args:
+            method: CDP method like "Page.navigate" or "Network.enable".
+            params: Optional command parameters.
+        Returns:
+            Future containing CDP response 'result' field.
+        Raises:
+            RuntimeError: If not connected to Chrome.
+        """
+        if not self.ws_app:
+            raise RuntimeError("Not connected")
+        with self._lock:
+            msg_id = self._next_id
+            self._next_id += 1
+            future = Future()
+            self._pending[msg_id] = future
+        # Send CDP command
+        message = {"id": msg_id, "method": method}
+        if params:
+            message["params"] = params
+        self.ws_app.send(json.dumps(message))
+        return future
+    def execute(self, method: str, params: dict | None = None, timeout: float | None = None) -> Any:
+        """Send CDP command synchronously.
+        Args:
+            method: CDP method like "Page.navigate" or "Network.enable".
+            params: Optional command parameters.
+            timeout: Override default timeout.
+        Returns:
+            CDP response 'result' field.
+        Raises:
+            TimeoutError: If command times out.
+            RuntimeError: If CDP returns error or not connected.
+        """
+        future = self.send(method, params)
+        try:
+            return future.result(timeout=timeout or self.timeout)
+        except TimeoutError:
+            # Clean up the pending future
+            with self._lock:
+                for msg_id, f in list(self._pending.items()):
+                    if f is future:
+                        self._pending.pop(msg_id, None)
+                        break
+            raise TimeoutError(f"Command {method} timed out")
+    def _on_open(self, ws):
+        """WebSocket connection established.
+        Sets the ``connected`` event, which releases the wait in connect().
+        Args:
+            ws: Object passed in by the WebSocket callback; not used here.
+        """
+        logger.info("WebSocket connected")
+        self.connected.set()
+    def _on_message(self, ws, message):
+        """Handle CDP messages - store events as-is, resolve command futures."""
+        try:
+            data = json.loads(message)
+            # Command response - resolve future
+            if "id" in data:
+                msg_id = data["id"]
+                with self._lock:
+                    future = self._pending.pop(msg_id, None)
+                if future:
+                    if "error" in data:
+                        future.set_exception(RuntimeError(data["error"]))
+                    else:
+                        future.set_result(data.get("result", {}))
+            # CDP event - store AS-IS in DuckDB and update field lookup
+            elif "method" in data:
+                self.db.execute("INSERT INTO events VALUES (?)", [json.dumps(data)])
+                self._update_field_lookup(data)
+        except Exception as e:
+            logger.error(f"Error handling message: {e}")
+    def _on_error(self, ws, error):
+        """Handle WebSocket errors.
+        The error is only logged; connection state is left untouched here.
+        Args:
+            ws: Object passed in by the WebSocket callback; not used here.
+            error: Error value supplied by the callback.
+        """
+        logger.error(f"WebSocket error: {error}")
+    def _on_close(self, ws, code, reason):
+        """Handle WebSocket closure and cleanup."""
+        logger.info(f"WebSocket closed: {code} {reason}")
+        self.connected.clear()
+        # Fail pending commands
+        with self._lock:
+            for future in self._pending.values():
+                future.set_exception(RuntimeError("Connection closed"))
+            self._pending.clear()
+    def _extract_paths(self, obj, parent_key=""):
+        """Extract all JSON paths from nested dict structure.
+        Args:
+            obj: Dictionary to extract paths from.
+            parent_key: Current path prefix.
+        """
+        paths = []
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                new_key = f"{parent_key}.{k}" if parent_key else k
+                paths.append(new_key)
+                if isinstance(v, dict):
+                    paths.extend(self._extract_paths(v, new_key))
+        return paths
+    def _update_field_lookup(self, data):
+        """Update field_paths lookup with new event data.
+        Args:
+            data: CDP event dictionary.
+        """
+        event_type = data.get("method", "unknown")
+        paths = self._extract_paths(data)
+        for path in paths:
+            # Store with event type prefix using colon separator
+            full_path = f"{event_type}:{path}"
+            # Index by each part of the path for flexible searching
+            parts = path.split(".")
+            for part in parts:
+                key = part.lower()
+                if key not in self.field_paths:
+                    self.field_paths[key] = set()
+                self.field_paths[key].add(full_path)  # Store with event type and original case
+    def discover_field_paths(self, search_key: str) -> list[str]:
+        """Discover all JSON paths containing the search key.
+        Used by build_query for dynamic field discovery.
+        Args:
+            search_key: Field name to search for like "url" or "status".
+        Returns:
+            Sorted list of full paths with event type prefixes.
+        """
+        search_key = search_key.lower()
+        paths = set()
+        # Find all field names that contain our search key
+        for field_name, field_paths in self.field_paths.items():
+            if search_key in field_name:
+                paths.update(field_paths)
+        return sorted(list(paths))  # Sort for consistent results
+    def clear_events(self) -> None:
+        """Clear all stored events and reset field lookup.
+        Deletes every row from the ``events`` table, then empties
+        ``field_paths``; the table itself is kept.
+        """
+        self.db.execute("DELETE FROM events")
+        self.field_paths.clear()
+    def query(self, sql: str, params: list | None = None) -> list:
+        """Query stored CDP events using DuckDB SQL.
+        Events are stored in 'events' table with single JSON 'event' column.
+        Use json_extract_string() for accessing nested fields.
+        Args:
+            sql: DuckDB SQL query string.
+            params: Optional query parameters.
+        Returns:
+            List of result rows.
+        Examples:
+            query("SELECT * FROM events WHERE json_extract_string(event, '$.method') = 'Network.responseReceived'")
+            query("SELECT json_extract_string(event, '$.params.request.url') as url FROM events")
+        """
+        result = self.db.execute(sql, params or [])
+        return result.fetchall() if result else []
+    def fetch_body(self, request_id: str) -> dict | None:
+        """Fetch response body via Network.getResponseBody CDP call.
+        Args:
+            request_id: Network request ID from CDP events.
+        Returns:
+            Dict with 'body' and 'base64Encoded' keys, or None if failed.
+        """
+        try:
+            return self.execute("Network.getResponseBody", {"requestId": request_id})
+        except Exception as e:
+            logger.debug(f"Failed to fetch body for {request_id}: {e}")
+            return None
+    @property
+    def is_connected(self) -> bool:
+        """Check if WebSocket connection is active.
+        Reflects the ``connected`` event, which is set in _on_open and cleared
+        in _on_close and disconnect().
+        Returns:
+            True if connected to Chrome page.
+        """
+        return self.connected.is_set()

webtap/commands/DEVELOPER_GUIDE.md ADDED Viewed

@@ -0,0 +1,314 @@
+# WebTap Commands Developer Guide
+This guide documents the patterns and conventions for developing WebTap commands with MCP compatibility.
+## Command Patterns (Post-Refinement)
+### 1. Simple Commands (No Parameters)
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def disconnect(state) -> dict:
+    """Disconnect from Chrome."""
+    # Implementation
+```
+### 2. Single Required Parameter
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def navigate(state, url: str) -> dict:
+    """Navigate to URL."""
+    # Implementation
+```
+### 3. Optional Boolean/Simple Parameters (Direct)
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def reload(state, ignore_cache: bool = False) -> dict:
+    """Reload current page."""
+    # Implementation
+# Multiple boolean flags
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def clear(state, events: bool = True, console: bool = False, cache: bool = False) -> dict:
+    """Clear various data stores."""
+    # Implementation
+```
+### 4. Mutually Exclusive Parameters (Direct)
+Use direct parameters when you have different ways to identify the same thing:
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def connect(state, page: int = None, page_id: str = None) -> dict:
+    """Connect to Chrome page.
+    Args:
+        page: Connect by page index (0-based)
+        page_id: Connect by page ID
+    Note: Cannot specify both page and page_id.
+    """
+    if page is not None and page_id is not None:
+        return error_response("invalid_parameters",
+            "Cannot specify both 'page' and 'page_id'")
+    # Implementation
+```
+### 5. Multiple Optional Parameters (Direct)
+Use direct parameters for cleaner API when parameters are well-defined:
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def network(state, limit: int = 20, filters: list = None, no_filters: bool = False) -> dict:
+    """Show network requests.
+    Args:
+        limit: Maximum results to show
+        filters: Specific filter categories to apply
+        no_filters: Show everything unfiltered
+    """
+    # Implementation
+# With expression evaluation
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def body(state, response: int, expr: str = None, decode: bool = True, cache: bool = True) -> dict:
+    """Get response body for network request."""
+    # Implementation
+```
+### 6. Mixed Parameters (Direct + Dict)
+Use dict only for complex/variable configurations:
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def resume(state, request: int, wait: float = 0.5, modifications: dict = None) -> dict:
+    """Resume a paused request.
+    Args:
+        request: Request row ID
+        wait: Wait time for next event
+        modifications: Request/response modifications
+            - {"url": "..."} - Change URL
+            - {"method": "POST"} - Change method
+    """
+    mods = modifications or {}
+    # Implementation
+```
+### 7. Dynamic Field Discovery (Keep Dict)
+Use dict when field names are dynamic/unknown:
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def events(state, filters: dict = None, limit: int = 20) -> dict:
+    """Query CDP events by field values.
+    Args:
+        filters: Field filters (any CDP field name)
+            - {"method": "Network.*"}
+            - {"status": 200}
+            - {"url": "*api*"}
+    """
+    # Fields are discovered dynamically from CDP events
+```
+### 8. Action + Config Pattern (Complex Operations)
+Keep for commands with varied operations:
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def filters(state, action: str = "list", config: dict = None) -> dict:
+    """Manage filters.
+    Args:
+        action: Operation to perform
+            - "list" - Show all filters
+            - "add" - Add filter
+            - "remove" - Remove filter
+        config: Action-specific configuration
+            - For add: {"category": "ads", "patterns": ["*ad*"]}
+            - For remove: {"patterns": ["*ad*"]}
+    """
+    cfg = config or {}
+    if action == "add":
+        category = cfg.get("category", "custom")
+        patterns = cfg.get("patterns", [])
+        # Implementation
+```
+## MCP Type Requirements
+### ❌ Avoid These (Not MCP Compatible)
+```python
+# Union types
+def command(state, param: str | None = None)
+# Optional types
+from typing import Optional
+def command(state, param: Optional[str] = None)
+# Complex types
+from typing import Dict, List
+def command(state, data: Dict[str, List[str]])
+# **kwargs
+def command(state, **fields)
+```
+### ✅ Use These Instead
+```python
+# Simple defaults
+def command(state, param: str = "default")
+def command(state, param: dict = None)  # pyright: ignore[reportArgumentType]
+def command(state, param: list = None)  # pyright: ignore[reportArgumentType]
+def command(state, param: bool = False)
+def command(state, param: int = 0)
+```
+## Response Patterns
+### Resources (Read-Only Data)
+```python
+@app.command(display="markdown", fastmcp={"type": "resource", "mime_type": "text/markdown"})
+def pages(state) -> dict:
+    """List available pages."""
+    return build_table_response(
+        title="Chrome Pages",
+        headers=["Index", "Title", "URL"],
+        rows=rows,
+        summary=f"{len(rows)} pages"
+    )
+```
+### Tools (Actions with Side Effects)
+```python
+@app.command(display="markdown", fastmcp={"type": "tool"})
+def navigate(state, url: str) -> dict:
+    """Navigate to URL."""
+    # Perform action
+    return build_info_response(
+        title="Navigation Complete",
+        fields={"URL": url, "Status": "Success"}
+    )
+```
+## Error Handling
+Always use the error utilities from `_errors.py`:
+```python
+from webtap.commands._errors import check_connection, error_response
+def my_command(state, ...):
+    # Check connection first for commands that need it
+    if error := check_connection(state):
+        return error
+    # Validate parameters
+    if not valid:
+        return error_response("invalid_param", "Parameter X must be Y")
+    # Custom errors
+    return error_response("custom", custom_message="Specific error message")
+```
+## Utility Functions
+Use helpers from `_utils.py`:
+```python
+from webtap.commands._utils import (
+    build_table_response,    # For tables
+    build_info_response,     # For key-value info
+    parse_options,          # Parse dict with defaults
+    extract_option,         # Extract single option
+    truncate_string,        # Truncate long strings
+    format_size,           # Format byte sizes
+    format_id,             # Format IDs
+)
+```
+## Text Over Symbols
+Use explicit text instead of symbols for clarity:
+```python
+# Status text
+"Connected" / "Disconnected"
+"Enabled" / "Disabled"
+"Yes" / "No"
+# For empty values
+"-" or "None" or ""
+# Descriptive status
+"3 requests paused" instead of symbols
+"Request Failed" instead of error symbols
+```
+## Decision Tree for Parameter Patterns (Updated)
+1. **No parameters?** → Simple command
+2. **One required param?** → Single parameter
+3. **Few well-defined params?** → Direct parameters with defaults
+4. **Multiple ways to identify same thing?** → Direct mutually exclusive params
+5. **Dynamic/unknown field names?** → Dict for filters
+6. **Complex variable config?** → Dict for modifications
+7. **Different operations based on input?** → Action + config pattern
+### When to Use Direct Parameters
+- Parameters are well-defined and limited (< 5)
+- Parameters are commonly used
+- Makes the API more intuitive
+- Boolean flags or simple types
+### When to Keep Dict Parameters
+- Field names are dynamic (like CDP event fields)
+- Configuration varies significantly by action
+- Many optional parameters rarely used together
+- Complex nested structures needed
+## Examples by Category (Current Implementation)
+### Navigation Commands
+- `navigate(url: str)` - Single required parameter
+- `reload(ignore_cache: bool = False)` - Optional boolean
+- `back()`, `forward()` - No parameters
+### Query Commands
+- `network(limit: int = 20, filters: list = None, no_filters: bool = False)` - Direct params
+- `events(filters: dict = None, limit: int = 20)` - Dict for dynamic fields + limit
+- `inspect(event: int = None, expr: str = None)` - Direct optional params
+- `body(response: int, expr: str = None, decode: bool = True, cache: bool = True)` - Mixed direct params
+### Management Commands
+- `connect(page: int = None, page_id: str = None)` - Mutually exclusive direct params
+- `clear(events: bool = True, console: bool = False, cache: bool = False)` - Boolean flags
+- `filters(action: str = "list", config: dict = None)` - Action + config pattern
+### JavaScript & Fetch Commands
+- `js(code: str, wait_return: bool = True, await_promise: bool = False)` - Direct params
+- `fetch(action: str, options: dict = None)` - Action pattern
+- `resume(request: int, wait: float = 0.5, modifications: dict = None)` - Direct + dict
+## Testing Your Command
+1. **Type checking**: Run `basedpyright` to ensure types are correct
+2. **Linting**: Run `ruff check` for code style
+3. **REPL mode**: Test with `webtap` command
+4. **MCP mode**: Test with `webtap --mcp` command
+5. **Markdown rendering**: Verify output displays correctly
+## Checklist for New Commands
+- [ ] Use `@app.command()` decorator with `display="markdown"`
+- [ ] Add `fastmcp` metadata (type: "resource" or "tool")
+- [ ] Use simple types only (no unions, no Optional)
+- [ ] Add `# pyright: ignore[reportArgumentType]` for `dict = None` and `list = None` defaults
+- [ ] Import utilities from `_utils.py` and `_errors.py`
+- [ ] Use `build_table_response()` or `build_info_response()`
+- [ ] Check connection with `check_connection()` if needed
+- [ ] Document parameters clearly in docstring
+- [ ] Provide usage examples in docstring
+- [ ] Test in both REPL and MCP modes