logtap 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
logtap/core/runs.py ADDED
@@ -0,0 +1,433 @@
+ """Run store for ingested log streams.
+
+ Provides append-only storage with in-memory tail cache and cursor management.
+ """
+
+ import threading
+ import time
+ from collections import deque
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Dict, Iterator, List, Optional
+
+ TAG_KEY_PATTERN = r"^[a-zA-Z0-9_.-]+$"
+ TAG_VALUE_MAX_LEN = 256
+
+
+ @dataclass
+ class RunLine:
+     """A single log line with cursor, timestamp, and optional tags."""
+
+     cursor: int
+     line: str
+     ts: datetime
+     tags: Dict[str, str] = field(default_factory=dict)
+
+
+ @dataclass
+ class RunMetadata:
+     """Metadata for a run."""
+
+     id: str
+     tags: Dict[str, str] = field(default_factory=dict)
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     last_activity: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     cursor_start: int = 0  # First cursor ever (immutable)
+     cursor_latest: int = -1  # Latest cursor written
+     lines_count: int = 0
+     bytes_on_disk: int = 0
+     active: bool = True
+
+
+ class Run:
+     """A single run with append-only file storage and in-memory tail cache."""
+
+     def __init__(self, run_id: str, data_dir: Path, buffer_lines: int = 100_000):
+         self.id = run_id
+         self.data_dir = data_dir
+         self.buffer_lines = buffer_lines
+         self._lock = threading.RLock()
+
+         # In-memory tail cache (deque for O(1) append and popleft)
+         self._cache: deque[RunLine] = deque(maxlen=buffer_lines)
+         self._cache_start_cursor: int = 0  # Cursor of first item in cache
+
+         # Run directory and files
+         self.run_dir = data_dir / run_id
+         self.log_file = self.run_dir / "log.txt"
+         self.meta_file = self.run_dir / "meta.json"
+
+         # Metadata
+         self.metadata: RunMetadata
+
+         # Initialize or load
+         if self.run_dir.exists():
+             self._load()
+         else:
+             self._create()
+
+     def _create(self) -> None:
+         """Create a new run."""
+         self.run_dir.mkdir(parents=True, exist_ok=True)
+         self.metadata = RunMetadata(id=self.id)
+         self._save_metadata()
+
+     def _load(self) -> None:
+         """Load existing run from disk."""
+         import json
+
+         # Load metadata
+         if self.meta_file.exists():
+             with open(self.meta_file, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+             self.metadata = RunMetadata(
+                 id=data["id"],
+                 tags=data.get("tags", {}),
+                 created_at=datetime.fromisoformat(data["created_at"]),
+                 last_activity=datetime.fromisoformat(data["last_activity"]),
+                 cursor_start=data.get("cursor_start", 0),
+                 cursor_latest=data.get("cursor_latest", -1),
+                 lines_count=data.get("lines_count", 0),
+                 bytes_on_disk=data.get("bytes_on_disk", 0),
+                 active=data.get("active", True),
+             )
+         else:
+             self.metadata = RunMetadata(id=self.id)
+
+         # Populate cache from end of log file
+         if self.log_file.exists():
+             self._populate_cache_from_disk()
+
+     def _populate_cache_from_disk(self) -> None:
+         """Load last N lines from disk into cache."""
+         import json
+
+         if not self.log_file.exists():
+             return
+
+         run_lines: List[RunLine] = []
+         with open(self.log_file, "r", encoding="utf-8", errors="replace") as f:
+             for raw_line in f:
+                 raw_line = raw_line.rstrip("\n")
+                 if not raw_line:
+                     continue
+
+                 # Try JSONL format first
+                 if raw_line.startswith("{"):
+                     try:
+                         record = json.loads(raw_line)
+                         run_lines.append(
+                             RunLine(
+                                 cursor=record["c"],
+                                 line=record["l"],
+                                 ts=datetime.fromisoformat(record["t"]),
+                                 tags=record.get("g", {}),
+                             )
+                         )
+                         continue
+                     except (json.JSONDecodeError, KeyError):
+                         pass
+
+                 # Legacy plain text format
+                 run_lines.append(
+                     RunLine(
+                         cursor=len(run_lines),
+                         line=raw_line,
+                         ts=self.metadata.last_activity,
+                         tags={},
+                     )
+                 )
+
+         # Only keep last buffer_lines
+         if len(run_lines) > self.buffer_lines:
+             run_lines = run_lines[-self.buffer_lines :]
+
+         self._cache_start_cursor = run_lines[0].cursor if run_lines else 0
+         self._cache.clear()
+         for rl in run_lines:
+             self._cache.append(rl)
+
+     def _save_metadata(self) -> None:
+         """Save metadata to disk."""
+         import json
+
+         with open(self.meta_file, "w", encoding="utf-8") as f:
+             json.dump(
+                 {
+                     "id": self.metadata.id,
+                     "tags": self.metadata.tags,
+                     "created_at": self.metadata.created_at.isoformat(),
+                     "last_activity": self.metadata.last_activity.isoformat(),
+                     "cursor_start": self.metadata.cursor_start,
+                     "cursor_latest": self.metadata.cursor_latest,
+                     "lines_count": self.metadata.lines_count,
+                     "bytes_on_disk": self.metadata.bytes_on_disk,
+                     "active": self.metadata.active,
+                 },
+                 f,
+             )
+
+     def append(self, line: str, tags: Optional[Dict[str, str]] = None) -> RunLine:
+         """Append a line to the run. Returns the line with assigned cursor."""
+         import json
+
+         with self._lock:
+             now = datetime.now(timezone.utc)
+             cursor = self.metadata.cursor_latest + 1
+
+             run_line = RunLine(cursor=cursor, line=line, ts=now, tags=tags or {})
+
+             # Append to disk as JSONL
+             record = {
+                 "c": cursor,
+                 "l": line,
+                 "t": now.isoformat(),
+             }
+             if tags:
+                 record["g"] = tags  # g for tags (short key)
+             with open(self.log_file, "a", encoding="utf-8") as f:
+                 written = f.write(json.dumps(record, separators=(",", ":")) + "\n")
+             self.metadata.bytes_on_disk += written
+
+             # Update cache
+             if len(self._cache) >= self.buffer_lines:
+                 self._cache_start_cursor += 1
+             self._cache.append(run_line)
+
+             # Update metadata
+             self.metadata.cursor_latest = cursor
+             self.metadata.lines_count += 1
+             self.metadata.last_activity = now
+
+             return run_line
+
+     def append_batch(
+         self, lines: List[str], tags: Optional[Dict[str, str]] = None
+     ) -> List[RunLine]:
+         """Append multiple lines under a single lock acquisition; metadata is persisted once at the end."""
+         with self._lock:
+             result = []
+             for line in lines:
+                 result.append(self.append(line, tags))
+             self._save_metadata()
+             return result
+
+     def set_tags(self, tags: Dict[str, str]) -> Optional[str]:
+         """Validate tags. Returns error message on invalid tag, None on success.
+
+         Note: Tags are now stored per-line, not per-run. This method just validates
+         and tracks known tag keys in run metadata for discoverability.
+         """
+         import re
+
+         with self._lock:
+             for key, value in tags.items():
+                 # Validate key
+                 if not re.match(TAG_KEY_PATTERN, key):
+                     return f"Invalid tag key: {key}"
+                 # Validate value length
+                 if len(value) > TAG_VALUE_MAX_LEN:
+                     return f"Tag value too long: {key}"
+
+             # Track tag keys in metadata (last value wins, just for discoverability)
+             self.metadata.tags.update(tags)
+             self._save_metadata()
+             return None
+
+     @property
+     def cursor_earliest(self) -> int:
+         """Earliest cursor still held in the in-memory cache (0 if the cache is empty)."""
+         with self._lock:
+             if self._cache:
+                 return self._cache[0].cursor
+             return 0
+
+     @property
+     def cursor_latest(self) -> int:
+         """Latest cursor written."""
+         with self._lock:
+             return self.metadata.cursor_latest
+
+     def get_lines(
+         self,
+         since: Optional[int] = None,
+         tail: int = 50,
+         limit: int = 1000,
+         tag_filter: Optional[Dict[str, str]] = None,
+     ) -> tuple[List[RunLine], bool]:
+         """
+         Get lines from run.
+
+         Args:
+             since: Cursor to start from (exclusive). If None, returns last `tail` lines.
+             tail: Number of recent lines if since is None.
+             limit: Maximum lines to return.
+             tag_filter: Filter lines by tags (AND semantics).
+
+         Returns:
+             Tuple of (lines, gap_detected).
+             gap_detected is True if since < cursor_earliest.
+         """
+         with self._lock:
+             gap = False
+
+             if since is not None:
+                 # Resume from cursor
+                 if since < self.cursor_earliest:
+                     gap = True
+                     # Start from earliest available
+                     start_cursor = self.cursor_earliest
+                 else:
+                     start_cursor = since + 1  # Exclusive
+
+                 # Get lines from cache
+                 lines = [ln for ln in self._cache if ln.cursor >= start_cursor]
+             else:
+                 # Tail mode - get last N lines
+                 lines = list(self._cache)[-tail:]
+
+             # Filter by tags (AND semantics)
+             if tag_filter:
+                 lines = [
+                     ln for ln in lines if all(ln.tags.get(k) == v for k, v in tag_filter.items())
+                 ]
+
+             # Apply limit, keeping the oldest lines so cursor-based resume stays contiguous
+             if len(lines) > limit:
+                 lines = lines[:limit]
+
+             return lines, gap
+
+     def tail_iter(self, since: Optional[int] = None) -> Iterator[RunLine]:
+         """
+         Iterator that yields new lines as they arrive.
+
+         Args:
+             since: Cursor to start from (exclusive). If None, starts from latest.
+         """
+         last_cursor = since if since is not None else self.cursor_latest
+
+         while True:
+             with self._lock:
+                 new_lines = [ln for ln in self._cache if ln.cursor > last_cursor]
+
+             for line in new_lines:
+                 last_cursor = line.cursor
+                 yield line
+
+             if not new_lines:
+                 time.sleep(0.1)  # Poll interval
+
+     def close(self) -> None:
+         """Mark run as inactive and save metadata."""
+         with self._lock:
+             self.metadata.active = False
+             self._save_metadata()
+
+
+ class RunStore:
+     """Manages all runs with disk persistence."""
+
+     def __init__(
+         self,
+         data_dir: Path,
+         buffer_lines: int = 100_000,
+         max_disk_mb: int = 1000,
+         retention_hours: int = 72,
+     ):
+         self.data_dir = Path(data_dir).expanduser()
+         self.buffer_lines = buffer_lines
+         self.max_disk_bytes = max_disk_mb * 1024 * 1024
+         self.retention_seconds = retention_hours * 3600
+         self._runs: Dict[str, Run] = {}
+         self._lock = threading.RLock()
+
+         # Create data directory
+         self.data_dir.mkdir(parents=True, exist_ok=True)
+
+         # Load existing runs
+         self._load_existing_runs()
+
+     def _load_existing_runs(self) -> None:
+         """Load existing runs from disk."""
+         if not self.data_dir.exists():
+             return
+
+         for run_dir in self.data_dir.iterdir():
+             if run_dir.is_dir() and (run_dir / "meta.json").exists():
+                 try:
+                     run = Run(run_dir.name, self.data_dir, self.buffer_lines)
+                     self._runs[run_dir.name] = run
+                 except Exception:
+                     pass  # Skip corrupted runs
+
+     def get_or_create(self, run_id: str) -> tuple[Run, bool]:
+         """Get existing run or create new one. Returns (run, created)."""
+         with self._lock:
+             if run_id in self._runs:
+                 return self._runs[run_id], False
+
+             run = Run(run_id, self.data_dir, self.buffer_lines)
+             self._runs[run_id] = run
+             return run, True
+
+     def get(self, run_id: str) -> Optional[Run]:
+         """Get run by ID, or None if not found."""
+         with self._lock:
+             return self._runs.get(run_id)
+
+     def list_runs(self, since_hours: Optional[int] = None) -> List[Run]:
+         """List all runs, optionally filtered by recent activity."""
+         with self._lock:
+             runs = list(self._runs.values())
+
+             if since_hours is not None:
+                 cutoff = datetime.now(timezone.utc).timestamp() - (since_hours * 3600)
+                 runs = [r for r in runs if r.metadata.last_activity.timestamp() >= cutoff]
+
+             # Sort by last activity (most recent first)
+             runs.sort(key=lambda r: r.metadata.last_activity, reverse=True)
+             return runs
+
+     def total_disk_usage(self) -> int:
+         """Get total disk usage across all runs in bytes."""
+         with self._lock:
+             return sum(r.metadata.bytes_on_disk for r in self._runs.values())
+
+     def enforce_retention(self) -> None:
+         """Remove runs older than retention period."""
+         with self._lock:
+             cutoff = datetime.now(timezone.utc).timestamp() - self.retention_seconds
+             to_remove = [
+                 run_id
+                 for run_id, run in self._runs.items()
+                 if run.metadata.last_activity.timestamp() < cutoff
+             ]
+
+             for run_id in to_remove:
+                 self._delete_run(run_id)
+
+     def enforce_disk_limit(self) -> None:
+         """Remove oldest runs if disk limit exceeded."""
+         with self._lock:
+             while self.total_disk_usage() > self.max_disk_bytes and self._runs:
+                 # Find oldest run
+                 oldest = min(self._runs.values(), key=lambda r: r.metadata.last_activity)
+                 self._delete_run(oldest.id)
+
+     def _delete_run(self, run_id: str) -> None:
+         """Delete a run from disk and memory."""
+         import shutil
+
+         if run_id in self._runs:
+             run = self._runs.pop(run_id)
+             run_dir = run.run_dir
+             if run_dir.exists():
+                 shutil.rmtree(run_dir)
+
+     def check_storage(self) -> Optional[str]:
+         """Check if storage is available. Returns error message if not."""
+         if self.total_disk_usage() >= self.max_disk_bytes:
+             return "insufficient_storage"
+         return None
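
A minimal usage sketch of the new run store, based only on the API shown in this diff; the data directory and run id are illustrative:

```python
from pathlib import Path

from logtap.core.runs import RunStore

# One directory per run is created under the store's data_dir.
store = RunStore(Path("~/.logtap/data"), buffer_lines=10_000, max_disk_mb=100)

run, created = store.get_or_create("build-42")  # hypothetical run id
run.append_batch(["starting", "compiling", "done"], tags={"stage": "build"})

# Cursors are assigned 0, 1, 2; since is exclusive, so this returns 1 and 2.
lines, gap = run.get_lines(since=0)
for ln in lines:
    print(ln.cursor, ln.line, ln.tags)

# gap would be True if cursor 0 had already been evicted from the tail cache.
run.close()
```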
logtap/core/validation.py CHANGED
@@ -3,8 +3,140 @@ Input validation functions for logtap.
 
  These functions validate user input to prevent security issues
  like path traversal attacks and DoS via overly large inputs.
+
+ Path Traversal Prevention Model
+ ===============================
+ 1. Input validation: reject NUL bytes, control chars, path separators, ".."
+ 2. Join filename to base directory
+ 3. Resolve to canonical absolute path (follows symlinks)
+ 4. Containment check: commonpath([base, resolved]) == base
+ 5. File type check: must be regular file (not dir, device, etc.)
+
+ This prevents:
+ - Directory traversal (../)
+ - Absolute path injection (/etc/passwd)
+ - Symlink escape attacks
+ - Null byte injection
+ - Path prefix collisions (/var/log vs /var/logs)
  """
 
+ import os
+ import stat
+ from typing import Optional, Tuple
+
+
+ def resolve_safe_path(base_dir: str, filename: str, require_exists: bool = False) -> Optional[str]:
+     """
+     Safely resolve a filename within a base directory.
+
+     Security guarantees:
+     - Resolved path is always within base_dir (symlink-safe)
+     - No path traversal via "..", separators, or absolute paths
+     - No NUL bytes or control characters
+     - Containment verified via os.path.commonpath
+
+     Args:
+         base_dir: The base directory that files must be within.
+         filename: The user-provided filename (single component, no path separators).
+         require_exists: If True, also verify the file exists and is a regular file.
+
+     Returns:
+         The resolved filepath if safe, None if validation fails.
+     """
+     # Reject empty filenames
+     if not filename:
+         return None
+
+     # Reject NUL bytes (can truncate paths in some contexts)
+     if "\x00" in filename:
+         return None
+
+     # Reject control characters (0x00-0x1F, 0x7F)
+     if any(ord(c) < 0x20 or ord(c) == 0x7F for c in filename):
+         return None
+
+     # Reject special directory entries
+     # Note: ".." substring check removed - it over-blocks valid names like "my..log".
+     # Traversal requires separators, which we reject below; the containment check is authoritative.
+     if filename in {".", ".."}:
+         return None
+
+     # Reject path separators - filename must be a single component
+     if "/" in filename or "\\" in filename:
+         return None
+
+     # Reject absolute paths (Unix and Windows)
+     if filename.startswith("/") or filename.startswith("\\"):
+         return None
+     # Windows drive letters (C:, D:, etc.)
+     if len(filename) >= 2 and filename[1] == ":":
+         return None
+
+     # Resolve base directory to canonical absolute form
+     base_resolved = os.path.realpath(base_dir)
+
+     # Join and resolve to canonical absolute path (follows symlinks)
+     filepath = os.path.join(base_resolved, filename)
+     filepath_resolved = os.path.realpath(filepath)
+
+     # Containment check using commonpath
+     # This is the authoritative check - handles prefix collisions correctly
+     # e.g., base=/var/log, candidate=/var/logs/evil will fail
+     try:
+         common = os.path.commonpath([base_resolved, filepath_resolved])
+         if common != base_resolved:
+             return None
+     except ValueError:
+         # Paths on different drives (Windows) or other path issues
+         return None
+
+     # Optional: verify file exists and is a regular file
+     if require_exists:
+         try:
+             file_stat = os.stat(filepath_resolved)
+             if not stat.S_ISREG(file_stat.st_mode):
+                 return None
+         except OSError:
+             return None
+
+     return filepath_resolved
+
+
+ def resolve_safe_path_checked(base_dir: str, filename: str) -> Tuple[Optional[str], str]:
+     """
+     Resolve a safe path and return a detailed error reason if validation fails.
+
+     Returns:
+         Tuple of (resolved_path, error_reason). If path is valid, error_reason is empty.
+     """
+     if not filename:
+         return None, "empty filename"
+     if "\x00" in filename:
+         return None, "filename contains NUL byte"
+     if any(ord(c) < 0x20 or ord(c) == 0x7F for c in filename):
+         return None, "filename contains control character"
+     if filename in {".", ".."}:
+         return None, "filename is special directory entry"
+     if "/" in filename or "\\" in filename:
+         return None, "filename contains path separator"
+     if filename.startswith("/") or filename.startswith("\\"):
+         return None, "filename is absolute path"
+     if len(filename) >= 2 and filename[1] == ":":
+         return None, "filename contains Windows drive letter"
+
+     base_resolved = os.path.realpath(base_dir)
+     filepath = os.path.join(base_resolved, filename)
+     filepath_resolved = os.path.realpath(filepath)
+
+     try:
+         common = os.path.commonpath([base_resolved, filepath_resolved])
+         if common != base_resolved:
+             return None, "resolved path escapes base directory"
+     except ValueError as e:
+         return None, f"path resolution error: {e}"
+
+     return filepath_resolved, ""
+
 
  def is_filename_valid(filename: str) -> bool:
      """
@@ -1,6 +1,7 @@
  """Response models for logtap API."""
 
- from typing import List, Optional
+ from datetime import datetime
+ from typing import Dict, List, Optional
 
  from pydantic import BaseModel, Field
 
@@ -63,3 +64,55 @@ class HealthResponse(BaseModel):
 
      status: str = Field(default="healthy", description="Service status")
      version: str = Field(description="logtap version")
+     mode: Optional[str] = Field(default=None, description="Server mode: serve, collect, or both")
+     features: Optional[List[str]] = Field(default=None, description="Available features")
+     runs: Optional[int] = Field(default=None, description="Number of active runs (collect mode)")
+     uptime_seconds: Optional[int] = Field(default=None, description="Server uptime in seconds")
+
+
+ # Run-related models for collector mode
+
+
+ class RunInfo(BaseModel):
+     """Information about a single run."""
+
+     id: str = Field(description="Run identifier")
+     lines: int = Field(description="Total lines ingested")
+     cursor_earliest: int = Field(description="Earliest available cursor")
+     cursor_latest: int = Field(description="Latest cursor")
+     tags: Dict[str, str] = Field(default_factory=dict, description="Run tags")
+     created_at: datetime = Field(description="When the run was created")
+     last_activity: datetime = Field(description="Last activity timestamp")
+     active: bool = Field(description="Whether the run is actively receiving data")
+     bytes_on_disk: Optional[int] = Field(default=None, description="Disk usage in bytes")
+
+
+ class RunListResponse(BaseModel):
+     """Response for listing runs."""
+
+     runs: List[RunInfo] = Field(description="List of runs")
+
+
+ class IngestResponse(BaseModel):
+     """Response after ingest completes."""
+
+     run_id: str = Field(description="Run identifier")
+     lines_ingested: int = Field(description="Number of lines ingested in this request")
+     cursor_end: int = Field(description="Final cursor after ingest")
+
+
+ class StreamMetaEvent(BaseModel):
+     """Meta event sent at start of stream."""
+
+     cursor_earliest: int = Field(description="Earliest available cursor")
+     cursor_latest: int = Field(description="Latest cursor")
+     gap: bool = Field(default=False, description="Whether a gap was detected")
+     missed: Optional[int] = Field(default=None, description="Number of missed lines if gap")
+
+
+ class StreamLineEvent(BaseModel):
+     """Line event in stream."""
+
+     cursor: int = Field(description="Line cursor")
+     line: str = Field(description="Log line content")
+     ts: datetime = Field(description="Timestamp when line was ingested")
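
A sketch of how a client might decode these stream models from JSON lines. The import path and the "presence of a line field" discriminator are assumptions for illustration; this diff does not show the file's module path or the wire protocol:

```python
import json

# Hypothetical import path; the diff omits this file's header line.
from logtap.models import StreamLineEvent, StreamMetaEvent


def decode_event(raw: str):
    """Decode one JSON line into a meta or line event (illustrative heuristic)."""
    payload = json.loads(raw)
    if "line" in payload:
        return StreamLineEvent(**payload)  # pydantic validates and coerces fields
    return StreamMetaEvent(**payload)


meta = decode_event('{"cursor_earliest": 0, "cursor_latest": 2, "gap": false}')
line = decode_event('{"cursor": 2, "line": "done", "ts": "2024-01-01T00:00:00+00:00"}')
print(meta.gap, line.cursor, line.line)
```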