logtap 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
logtap/core/runs.py ADDED
@@ -0,0 +1,433 @@
+ """Run store for ingested log streams.
+
+ Provides append-only storage with in-memory tail cache and cursor management.
+ """
+
+ import threading
+ import time
+ from collections import deque
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Dict, Iterator, List, Optional
+
+ TAG_KEY_PATTERN = r"^[a-zA-Z0-9_.-]+$"
+ TAG_VALUE_MAX_LEN = 256
+
+
+ @dataclass
+ class RunLine:
+     """A single log line with cursor, timestamp, and optional tags."""
+
+     cursor: int
+     line: str
+     ts: datetime
+     tags: Dict[str, str] = field(default_factory=dict)
+
+
+ @dataclass
+ class RunMetadata:
+     """Metadata for a run."""
+
+     id: str
+     tags: Dict[str, str] = field(default_factory=dict)
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     last_activity: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     cursor_start: int = 0  # First cursor ever (immutable)
+     cursor_latest: int = -1  # Latest cursor written
+     lines_count: int = 0
+     bytes_on_disk: int = 0
+     active: bool = True
+
+
+ class Run:
+     """A single run with append-only file storage and in-memory tail cache."""
+
+     def __init__(self, run_id: str, data_dir: Path, buffer_lines: int = 100_000):
+         self.id = run_id
+         self.data_dir = data_dir
+         self.buffer_lines = buffer_lines
+         self._lock = threading.RLock()
+
+         # In-memory tail cache (deque for O(1) append and popleft)
+         self._cache: deque[RunLine] = deque(maxlen=buffer_lines)
+         self._cache_start_cursor: int = 0  # Cursor of first item in cache
+
+         # Run directory and files
+         self.run_dir = data_dir / run_id
+         self.log_file = self.run_dir / "log.txt"
+         self.meta_file = self.run_dir / "meta.json"
+
+         # Metadata
+         self.metadata: RunMetadata
+
+         # Initialize or load
+         if self.run_dir.exists():
+             self._load()
+         else:
+             self._create()
+
+     def _create(self) -> None:
+         """Create a new run."""
+         self.run_dir.mkdir(parents=True, exist_ok=True)
+         self.metadata = RunMetadata(id=self.id)
+         self._save_metadata()
+
+     def _load(self) -> None:
+         """Load existing run from disk."""
+         import json
+
+         # Load metadata
+         if self.meta_file.exists():
+             with open(self.meta_file, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+             self.metadata = RunMetadata(
+                 id=data["id"],
+                 tags=data.get("tags", {}),
+                 created_at=datetime.fromisoformat(data["created_at"]),
+                 last_activity=datetime.fromisoformat(data["last_activity"]),
+                 cursor_start=data.get("cursor_start", 0),
+                 cursor_latest=data.get("cursor_latest", -1),
+                 lines_count=data.get("lines_count", 0),
+                 bytes_on_disk=data.get("bytes_on_disk", 0),
+                 active=data.get("active", True),
+             )
+         else:
+             self.metadata = RunMetadata(id=self.id)
+
+         # Populate cache from end of log file
+         if self.log_file.exists():
+             self._populate_cache_from_disk()
+
+     def _populate_cache_from_disk(self) -> None:
+         """Load last N lines from disk into cache."""
+         import json
+
+         if not self.log_file.exists():
+             return
+
+         run_lines: List[RunLine] = []
+         with open(self.log_file, "r", encoding="utf-8", errors="replace") as f:
+             for raw_line in f:
+                 raw_line = raw_line.rstrip("\n")
+                 if not raw_line:
+                     continue
+
+                 # Try JSONL format first
+                 if raw_line.startswith("{"):
+                     try:
+                         record = json.loads(raw_line)
+                         run_lines.append(
+                             RunLine(
+                                 cursor=record["c"],
+                                 line=record["l"],
+                                 ts=datetime.fromisoformat(record["t"]),
+                                 tags=record.get("g", {}),
+                             )
+                         )
+                         continue
+                     except (json.JSONDecodeError, KeyError):
+                         pass
+
+                 # Legacy plain text format
+                 run_lines.append(
+                     RunLine(
+                         cursor=len(run_lines),
+                         line=raw_line,
+                         ts=self.metadata.last_activity,
+                         tags={},
+                     )
+                 )
+
+         # Only keep last buffer_lines
+         if len(run_lines) > self.buffer_lines:
+             run_lines = run_lines[-self.buffer_lines :]
+
+         self._cache_start_cursor = run_lines[0].cursor if run_lines else 0
+         self._cache.clear()
+         for rl in run_lines:
+             self._cache.append(rl)
+
+     def _save_metadata(self) -> None:
+         """Save metadata to disk."""
+         import json
+
+         with open(self.meta_file, "w", encoding="utf-8") as f:
+             json.dump(
+                 {
+                     "id": self.metadata.id,
+                     "tags": self.metadata.tags,
+                     "created_at": self.metadata.created_at.isoformat(),
+                     "last_activity": self.metadata.last_activity.isoformat(),
+                     "cursor_start": self.metadata.cursor_start,
+                     "cursor_latest": self.metadata.cursor_latest,
+                     "lines_count": self.metadata.lines_count,
+                     "bytes_on_disk": self.metadata.bytes_on_disk,
+                     "active": self.metadata.active,
+                 },
+                 f,
+             )
+
+     def append(self, line: str, tags: Optional[Dict[str, str]] = None) -> RunLine:
+         """Append a line to the run. Returns the line with assigned cursor."""
+         import json
+
+         with self._lock:
+             now = datetime.now(timezone.utc)
+             cursor = self.metadata.cursor_latest + 1
+
+             run_line = RunLine(cursor=cursor, line=line, ts=now, tags=tags or {})
+
+             # Append to disk as JSONL
+             record = {
+                 "c": cursor,
+                 "l": line,
+                 "t": now.isoformat(),
+             }
+             if tags:
+                 record["g"] = tags  # g for tags (short key)
+             with open(self.log_file, "a", encoding="utf-8") as f:
+                 written = f.write(json.dumps(record, separators=(",", ":")) + "\n")
+             self.metadata.bytes_on_disk += written
+
+             # Update cache
+             if len(self._cache) >= self.buffer_lines:
+                 self._cache_start_cursor += 1
+             self._cache.append(run_line)
+
+             # Update metadata
+             self.metadata.cursor_latest = cursor
+             self.metadata.lines_count += 1
+             self.metadata.last_activity = now
+
+             return run_line
+
+     def append_batch(
+         self, lines: List[str], tags: Optional[Dict[str, str]] = None
+     ) -> List[RunLine]:
+         """Append multiple lines under a single lock acquisition; metadata is persisted once at the end."""
+         with self._lock:
+             result = []
+             for line in lines:
+                 result.append(self.append(line, tags))
+             self._save_metadata()
+             return result
+
+     def set_tags(self, tags: Dict[str, str]) -> Optional[str]:
+         """Validate tags. Returns error message on invalid tag, None on success.
+
+         Note: Tags are now stored per-line, not per-run. This method just validates
+         and tracks known tag keys in run metadata for discoverability.
+         """
+         import re
+
+         with self._lock:
+             for key, value in tags.items():
+                 # Validate key
+                 if not re.match(TAG_KEY_PATTERN, key):
+                     return f"Invalid tag key: {key}"
+                 # Validate value length
+                 if len(value) > TAG_VALUE_MAX_LEN:
+                     return f"Tag value too long: {key}"
+
+             # Track tag keys in metadata (last value wins, just for discoverability)
+             self.metadata.tags.update(tags)
+             self._save_metadata()
+             return None
+
+     @property
+     def cursor_earliest(self) -> int:
+         """Earliest cursor still held in the in-memory cache (0 if the cache is empty)."""
+         with self._lock:
+             if self._cache:
+                 return self._cache[0].cursor
+             return 0
+
+     @property
+     def cursor_latest(self) -> int:
+         """Latest cursor written."""
+         with self._lock:
+             return self.metadata.cursor_latest
+
+     def get_lines(
+         self,
+         since: Optional[int] = None,
+         tail: int = 50,
+         limit: int = 1000,
+         tag_filter: Optional[Dict[str, str]] = None,
+     ) -> tuple[List[RunLine], bool]:
+         """
+         Get lines from run.
+
+         Args:
+             since: Cursor to start from (exclusive). If None, returns last `tail` lines.
+             tail: Number of recent lines if since is None.
+             limit: Maximum lines to return.
+             tag_filter: Filter lines by tags (AND semantics).
+
+         Returns:
+             Tuple of (lines, gap_detected).
+             gap_detected is True if since < cursor_earliest.
+         """
+         with self._lock:
+             gap = False
+
+             if since is not None:
+                 # Resume from cursor
+                 if since < self.cursor_earliest:
+                     gap = True
+                     # Start from earliest available
+                     start_cursor = self.cursor_earliest
+                 else:
+                     start_cursor = since + 1  # Exclusive
+
+                 # Get lines from cache
+                 lines = [ln for ln in self._cache if ln.cursor >= start_cursor]
+             else:
+                 # Tail mode - get last N lines
+                 lines = list(self._cache)[-tail:]
+
+             # Filter by tags (AND semantics)
+             if tag_filter:
+                 lines = [
+                     ln for ln in lines if all(ln.tags.get(k) == v for k, v in tag_filter.items())
+                 ]
+
+             # Apply limit, keeping the oldest lines so cursor-based resume stays contiguous
+             if len(lines) > limit:
+                 lines = lines[:limit]
+
+             return lines, gap
+
+     def tail_iter(self, since: Optional[int] = None) -> Iterator[RunLine]:
+         """
+         Iterator that yields new lines as they arrive.
+
+         Args:
+             since: Cursor to start from (exclusive). If None, starts from latest.
+         """
+         last_cursor = since if since is not None else self.cursor_latest
+
+         while True:
+             with self._lock:
+                 new_lines = [ln for ln in self._cache if ln.cursor > last_cursor]
+
+             for line in new_lines:
+                 last_cursor = line.cursor
+                 yield line
+
+             if not new_lines:
+                 time.sleep(0.1)  # Poll interval
+
+     def close(self) -> None:
+         """Mark run as inactive and save metadata."""
+         with self._lock:
+             self.metadata.active = False
+             self._save_metadata()
+
+
+ class RunStore:
+     """Manages all runs with disk persistence."""
+
+     def __init__(
+         self,
+         data_dir: Path,
+         buffer_lines: int = 100_000,
+         max_disk_mb: int = 1000,
+         retention_hours: int = 72,
+     ):
+         self.data_dir = Path(data_dir).expanduser()
+         self.buffer_lines = buffer_lines
+         self.max_disk_bytes = max_disk_mb * 1024 * 1024
+         self.retention_seconds = retention_hours * 3600
+         self._runs: Dict[str, Run] = {}
+         self._lock = threading.RLock()
+
+         # Create data directory
+         self.data_dir.mkdir(parents=True, exist_ok=True)
+
+         # Load existing runs
+         self._load_existing_runs()
+
+     def _load_existing_runs(self) -> None:
+         """Load existing runs from disk."""
+         if not self.data_dir.exists():
+             return
+
+         for run_dir in self.data_dir.iterdir():
+             if run_dir.is_dir() and (run_dir / "meta.json").exists():
+                 try:
+                     run = Run(run_dir.name, self.data_dir, self.buffer_lines)
+                     self._runs[run_dir.name] = run
+                 except Exception:
+                     pass  # Skip corrupted runs
+
+     def get_or_create(self, run_id: str) -> tuple[Run, bool]:
+         """Get existing run or create new one. Returns (run, created)."""
+         with self._lock:
+             if run_id in self._runs:
+                 return self._runs[run_id], False
+
+             run = Run(run_id, self.data_dir, self.buffer_lines)
+             self._runs[run_id] = run
+             return run, True
+
+     def get(self, run_id: str) -> Optional[Run]:
+         """Get run by ID, or None if not found."""
+         with self._lock:
+             return self._runs.get(run_id)
+
+     def list_runs(self, since_hours: Optional[int] = None) -> List[Run]:
+         """List all runs, optionally filtered by recent activity."""
+         with self._lock:
+             runs = list(self._runs.values())
+
+             if since_hours is not None:
+                 cutoff = datetime.now(timezone.utc).timestamp() - (since_hours * 3600)
+                 runs = [r for r in runs if r.metadata.last_activity.timestamp() >= cutoff]
+
+             # Sort by last activity (most recent first)
+             runs.sort(key=lambda r: r.metadata.last_activity, reverse=True)
+             return runs
+
+     def total_disk_usage(self) -> int:
+         """Get total disk usage across all runs in bytes."""
+         with self._lock:
+             return sum(r.metadata.bytes_on_disk for r in self._runs.values())
+
+     def enforce_retention(self) -> None:
+         """Remove runs older than retention period."""
+         with self._lock:
+             cutoff = datetime.now(timezone.utc).timestamp() - self.retention_seconds
+             to_remove = [
+                 run_id
+                 for run_id, run in self._runs.items()
+                 if run.metadata.last_activity.timestamp() < cutoff
+             ]
+
+             for run_id in to_remove:
+                 self._delete_run(run_id)
+
+     def enforce_disk_limit(self) -> None:
+         """Remove oldest runs if disk limit exceeded."""
+         with self._lock:
+             while self.total_disk_usage() > self.max_disk_bytes and self._runs:
+                 # Find oldest run
+                 oldest = min(self._runs.values(), key=lambda r: r.metadata.last_activity)
+                 self._delete_run(oldest.id)
+
+     def _delete_run(self, run_id: str) -> None:
+         """Delete a run from disk and memory."""
+         import shutil
+
+         if run_id in self._runs:
+             run = self._runs.pop(run_id)
+             run_dir = run.run_dir
+             if run_dir.exists():
+                 shutil.rmtree(run_dir)
+
+     def check_storage(self) -> Optional[str]:
+         """Check if storage is available. Returns error message if not."""
+         if self.total_disk_usage() >= self.max_disk_bytes:
+             return "insufficient_storage"
+         return None
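
A minimal usage sketch of the new run store, based only on the API shown in this diff; the data directory and run id are illustrative:

```python
from pathlib import Path

from logtap.core.runs import RunStore

# One directory per run is created under the store's data_dir.
store = RunStore(Path("~/.logtap/data"), buffer_lines=10_000, max_disk_mb=100)

run, created = store.get_or_create("build-42")  # hypothetical run id
run.append_batch(["starting", "compiling", "done"], tags={"stage": "build"})

# Cursors are assigned 0, 1, 2; since is exclusive, so this returns 1 and 2.
lines, gap = run.get_lines(since=0)
for ln in lines:
    print(ln.cursor, ln.line, ln.tags)

# gap would be True if cursor 0 had already been evicted from the tail cache.
run.close()
```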
logtap/core/validation.py CHANGED
@@ -3,8 +3,140 @@ Input validation functions for logtap.
 
  These functions validate user input to prevent security issues
  like path traversal attacks and DoS via overly large inputs.
+
+ Path Traversal Prevention Model
+ ===============================
+ 1. Input validation: reject NUL bytes, control chars, path separators, ".."
+ 2. Join filename to base directory
+ 3. Resolve to canonical absolute path (follows symlinks)
+ 4. Containment check: commonpath([base, resolved]) == base
+ 5. File type check: must be regular file (not dir, device, etc.)
+
+ This prevents:
+ - Directory traversal (../)
+ - Absolute path injection (/etc/passwd)
+ - Symlink escape attacks
+ - Null byte injection
+ - Path prefix collisions (/var/log vs /var/logs)
  """
 
+ import os
+ import stat
+ from typing import Optional, Tuple
+
+
+ def resolve_safe_path(base_dir: str, filename: str, require_exists: bool = False) -> Optional[str]:
+     """
+     Safely resolve a filename within a base directory.
+
+     Security guarantees:
+     - Resolved path is always within base_dir (symlink-safe)
+     - No path traversal via "..", separators, or absolute paths
+     - No NUL bytes or control characters
+     - Containment verified via os.path.commonpath
+
+     Args:
+         base_dir: The base directory that files must be within.
+         filename: The user-provided filename (single component, no path separators).
+         require_exists: If True, also verify the file exists and is a regular file.
+
+     Returns:
+         The resolved filepath if safe, None if validation fails.
+     """
+     # Reject empty filenames
+     if not filename:
+         return None
+
+     # Reject NUL bytes (can truncate paths in some contexts)
+     if "\x00" in filename:
+         return None
+
+     # Reject control characters (0x00-0x1F, 0x7F)
+     if any(ord(c) < 0x20 or ord(c) == 0x7F for c in filename):
+         return None
+
+     # Reject special directory entries
+     # Note: ".." substring check removed - it over-blocks valid names like "my..log".
+     # Traversal requires separators, which we reject below; the containment check is authoritative.
+     if filename in {".", ".."}:
+         return None
+
+     # Reject path separators - filename must be a single component
+     if "/" in filename or "\\" in filename:
+         return None
+
+     # Reject absolute paths (Unix and Windows)
+     if filename.startswith("/") or filename.startswith("\\"):
+         return None
+     # Windows drive letters (C:, D:, etc.)
+     if len(filename) >= 2 and filename[1] == ":":
+         return None
+
+     # Resolve base directory to canonical absolute form
+     base_resolved = os.path.realpath(base_dir)
+
+     # Join and resolve to canonical absolute path (follows symlinks)
+     filepath = os.path.join(base_resolved, filename)
+     filepath_resolved = os.path.realpath(filepath)
+
+     # Containment check using commonpath
+     # This is the authoritative check - handles prefix collisions correctly
+     # e.g., base=/var/log, candidate=/var/logs/evil will fail
+     try:
+         common = os.path.commonpath([base_resolved, filepath_resolved])
+         if common != base_resolved:
+             return None
+     except ValueError:
+         # Paths on different drives (Windows) or other path issues
+         return None
+
+     # Optional: verify file exists and is a regular file
+     if require_exists:
+         try:
+             file_stat = os.stat(filepath_resolved)
+             if not stat.S_ISREG(file_stat.st_mode):
+                 return None
+         except OSError:
+             return None
+
+     return filepath_resolved
+
+
+ def resolve_safe_path_checked(base_dir: str, filename: str) -> Tuple[Optional[str], str]:
+     """
+     Resolve a safe path and return a detailed error reason if validation fails.
+
+     Returns:
+         Tuple of (resolved_path, error_reason). If path is valid, error_reason is empty.
+     """
+     if not filename:
+         return None, "empty filename"
+     if "\x00" in filename:
+         return None, "filename contains NUL byte"
+     if any(ord(c) < 0x20 or ord(c) == 0x7F for c in filename):
+         return None, "filename contains control character"
+     if filename in {".", ".."}:
+         return None, "filename is special directory entry"
+     if "/" in filename or "\\" in filename:
+         return None, "filename contains path separator"
+     if filename.startswith("/") or filename.startswith("\\"):
+         return None, "filename is absolute path"
+     if len(filename) >= 2 and filename[1] == ":":
+         return None, "filename contains Windows drive letter"
+
+     base_resolved = os.path.realpath(base_dir)
+     filepath = os.path.join(base_resolved, filename)
+     filepath_resolved = os.path.realpath(filepath)
+
+     try:
+         common = os.path.commonpath([base_resolved, filepath_resolved])
+         if common != base_resolved:
+             return None, "resolved path escapes base directory"
+     except ValueError as e:
+         return None, f"path resolution error: {e}"
+
+     return filepath_resolved, ""
+
 
  def is_filename_valid(filename: str) -> bool:
      """
@@ -1,6 +1,7 @@
  """Response models for logtap API."""
 
- from typing import List, Optional
+ from datetime import datetime
+ from typing import Dict, List, Optional
 
  from pydantic import BaseModel, Field
 
@@ -63,3 +64,55 @@ class HealthResponse(BaseModel):
 
      status: str = Field(default="healthy", description="Service status")
      version: str = Field(description="logtap version")
+     mode: Optional[str] = Field(default=None, description="Server mode: serve, collect, or both")
+     features: Optional[List[str]] = Field(default=None, description="Available features")
+     runs: Optional[int] = Field(default=None, description="Number of active runs (collect mode)")
+     uptime_seconds: Optional[int] = Field(default=None, description="Server uptime in seconds")
+
+
+ # Run-related models for collector mode
+
+
+ class RunInfo(BaseModel):
+     """Information about a single run."""
+
+     id: str = Field(description="Run identifier")
+     lines: int = Field(description="Total lines ingested")
+     cursor_earliest: int = Field(description="Earliest available cursor")
+     cursor_latest: int = Field(description="Latest cursor")
+     tags: Dict[str, str] = Field(default_factory=dict, description="Run tags")
+     created_at: datetime = Field(description="When the run was created")
+     last_activity: datetime = Field(description="Last activity timestamp")
+     active: bool = Field(description="Whether the run is actively receiving data")
+     bytes_on_disk: Optional[int] = Field(default=None, description="Disk usage in bytes")
+
+
+ class RunListResponse(BaseModel):
+     """Response for listing runs."""
+
+     runs: List[RunInfo] = Field(description="List of runs")
+
+
+ class IngestResponse(BaseModel):
+     """Response after ingest completes."""
+
+     run_id: str = Field(description="Run identifier")
+     lines_ingested: int = Field(description="Number of lines ingested in this request")
+     cursor_end: int = Field(description="Final cursor after ingest")
+
+
+ class StreamMetaEvent(BaseModel):
+     """Meta event sent at start of stream."""
+
+     cursor_earliest: int = Field(description="Earliest available cursor")
+     cursor_latest: int = Field(description="Latest cursor")
+     gap: bool = Field(default=False, description="Whether a gap was detected")
+     missed: Optional[int] = Field(default=None, description="Number of missed lines if gap")
+
+
+ class StreamLineEvent(BaseModel):
+     """Line event in stream."""
+
+     cursor: int = Field(description="Line cursor")
+     line: str = Field(description="Log line content")
+     ts: datetime = Field(description="Timestamp when line was ingested")
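
A sketch of how a client might decode these stream models from JSON lines. The import path and the "presence of a line field" discriminator are assumptions for illustration; this diff does not show the file's module path or the wire protocol:

```python
import json

# Hypothetical import path; the diff omits this file's header line.
from logtap.models import StreamLineEvent, StreamMetaEvent


def decode_event(raw: str):
    """Decode one JSON line into a meta or line event (illustrative heuristic)."""
    payload = json.loads(raw)
    if "line" in payload:
        return StreamLineEvent(**payload)  # pydantic validates and coerces fields
    return StreamMetaEvent(**payload)


meta = decode_event('{"cursor_earliest": 0, "cursor_latest": 2, "gap": false}')
line = decode_event('{"cursor": 2, "line": "done", "ts": "2024-01-01T00:00:00+00:00"}')
print(meta.gap, line.cursor, line.line)
```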