codevigil 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codevigil/watcher.py ADDED
@@ -0,0 +1,456 @@
1
+ """Filesystem watcher: ``Source`` protocol and ``PollingSource`` implementation.
2
+
3
+ Turns a directory tree of session JSONL files into an iterator of typed
4
+ ``SourceEvent`` records. The aggregator is the only consumer and is unaware
5
+ of which concrete ``Source`` is feeding it — v0.1 ships ``PollingSource``;
6
+ v0.2+ may add inotify / fsevents backends behind the same protocol.
7
+
8
+ The ``Source`` protocol lives in this module rather than ``codevigil.types``
9
+ because it is watcher-internal vocabulary: the aggregator imports
10
+ ``codevigil.watcher.Source`` directly. Keeping it here means the watcher
11
+ module is self-contained and ``codevigil.types`` stays focused on the
12
+ parser/collector/renderer contracts that genuinely cross subsystem lines.
13
+
14
+ Five-case transition table (per ``docs/design.md`` §Watcher Design):
15
+
16
+ | Transition | Action |
17
+ | ----------------------- | ----------------------------------------- |
18
+ | unknown path | NEW_SESSION + APPEND per complete line |
19
+ | same inode, size grew | APPEND per complete line |
20
+ | same inode, size shrank | TRUNCATE, reset cursor, re-read |
21
+ | inode changed | ROTATE, reset cursor, re-read |
22
+ | path vanished | DELETE, evict cursor |
23
+
24
+ Partial trailing bytes (no newline) are buffered in the cursor's ``pending``
25
+ field and carried to the next poll, so a writer that flushes half a JSON
26
+ record never produces a torn line.
27
+
28
+ Note on ``SourceEvent`` shape: ``docs/design.md`` sketches a batched form
29
+ with ``lines: list[str]``; this implementation emits one ``SourceEvent`` per
30
+ complete line (``line: str | None``). The aggregator phase wires the events
31
+ into the parser one at a time anyway, and the per-line shape removes a layer
32
+ of unpacking at the call site without losing information.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import os
38
+ import stat
39
+ from collections.abc import Iterator
40
+ from dataclasses import dataclass, field
41
+ from datetime import UTC, datetime
42
+ from enum import Enum
43
+ from pathlib import Path
44
+ from typing import Protocol, runtime_checkable
45
+
46
+ from codevigil.errors import CodevigilError, ErrorLevel, ErrorSource, record
47
+ from codevigil.privacy import PrivacyViolationError
48
+
49
+ _CHUNK_SIZE: int = 1 * 1024 * 1024 # 1 MiB delta read chunk
50
+
51
+
52
class SourceEventKind(Enum):
    """Five filesystem-state transitions a ``Source`` can report."""

    # First sighting of a path: emitted once, before any APPEND for that file.
    NEW_SESSION = "new_session"
    # One complete newline-terminated line became readable.
    APPEND = "append"
    # Same path, different inode: the file was replaced in place.
    ROTATE = "rotate"
    # Same inode, smaller size: the file was rewritten from the start.
    TRUNCATE = "truncate"
    # Path no longer appears in the walk; its cursor has been evicted.
    DELETE = "delete"
60
+
61
+
62
@dataclass(frozen=True, slots=True)
class SourceEvent:
    """One typed record yielded by ``Source.poll()``.

    ``line`` is populated for ``APPEND`` events and is ``None`` for every
    other kind. ``inode`` is the device-local inode number captured at the
    moment the event was generated; for ``DELETE`` it carries the last
    observed inode so downstream consumers can correlate against an earlier
    cursor.
    """

    kind: SourceEventKind
    # Derived from the file name: ``path.stem`` of the ``*.jsonl`` file.
    session_id: str
    path: Path
    inode: int
    # UTF-8 decoded (undecodable bytes replaced) line without its trailing
    # newline; ``None`` for every kind other than APPEND.
    line: str | None
    # Timezone-aware UTC wall-clock time at event generation (see ``_now``).
    timestamp: datetime
79
+
80
+
81
@dataclass(slots=True)
class FileCursor:
    """Per-file watcher state.

    ``offset`` is the byte offset of the next unread byte; ``pending``
    carries bytes read past the last newline that have not yet completed a
    line. A line is only emitted when its terminating ``\\n`` arrives.
    """

    path: Path
    # Inode observed when the cursor was (re)created; a mismatch on a later
    # poll is reported as ROTATE.
    inode: int
    # File size recorded after the last successful read; a smaller stat
    # result on a later poll is reported as TRUNCATE.
    size: int
    # Byte offset of the next unread byte (file position after last read).
    offset: int
    # Bytes past the last newline, buffered until their line completes.
    pending: bytes = b""
    # Set after the first large-growth warning so it fires once per cursor.
    large_file_warned: bool = False
96
+
97
+
98
@runtime_checkable
class Source(Protocol):
    """Interface every watcher backend must honor.

    The aggregator calls ``poll()`` on its tick loop and consumes the
    iterator to exhaustion before returning to its own bookkeeping. ``poll``
    must not block. ``close`` releases any backend state; for the polling
    implementation it simply drops the cursor table.
    """

    def poll(self) -> Iterator[SourceEvent]:
        """Return events for every filesystem transition since the last call."""
        ...

    def close(self) -> None:
        """Release any backend-held state."""
        ...
111
+
112
+
113
@dataclass(slots=True)
class _WalkResult:
    """Outcome of one bounded directory walk under the watcher root."""

    # Sorted ``*.jsonl`` paths, capped at the source's ``max_files``.
    files: list[Path]
    # True when the cap was hit and at least one discovered file was dropped.
    overflowed: bool
    # Number of files dropped by the cap (0 when not overflowed).
    overflow_count: int
118
+
119
+
120
+ def _now() -> datetime:
121
+ return datetime.now(tz=UTC)
122
+
123
+
124
class PollingSource:
    """Stat-and-read polling implementation of the ``Source`` protocol.

    Holds a per-file ``FileCursor`` table in memory; on every ``poll()``
    call walks the configured ``root`` (capped at ``max_files``), stats each
    discovered ``*.jsonl`` file, and yields ``SourceEvent`` records for the
    transitions documented in the module docstring.

    Constructor requires ``root`` to resolve to a path inside the user's
    home directory; any path outside ``$HOME`` raises ``PrivacyViolationError``
    before the source is usable. This is the runtime half of the filesystem
    scope rule (``docs/design.md`` §Privacy Enforcement); a CRITICAL error
    is also recorded on the error channel so operators see the attempt in
    the JSONL log.
    """

    def __init__(
        self,
        root: Path,
        *,
        interval: float = 2.0,
        max_files: int = 2000,
        large_file_warn_bytes: int = 10 * 1024 * 1024,
    ) -> None:
        self._interval: float = interval
        self._max_files: int = max_files
        self._large_file_warn_bytes: int = large_file_warn_bytes
        self._cursors: dict[Path, FileCursor] = {}
        self._overflow_warned: bool = False
        # Validated last: a privacy violation raises before the instance is
        # usable, and the attributes above hold no external resources.
        self._root: Path = self._validate_root(root)

    @property
    def root(self) -> Path:
        """Resolved watch root (guaranteed inside ``$HOME``)."""
        return self._root

    @property
    def interval(self) -> float:
        """Suggested seconds between ``poll()`` calls (advisory only)."""
        return self._interval

    @property
    def max_files(self) -> int:
        """Cap on the number of files a single walk may return."""
        return self._max_files

    # ------------------------------------------------------------------ scope

    @staticmethod
    def _validate_root(root: Path) -> Path:
        """Resolve the root once and refuse anything outside ``$HOME``.

        Raises:
            PrivacyViolationError: if the resolved root is not under the
                user's home directory. A CRITICAL error is recorded first so
                the attempt is visible in the JSONL error log.
        """

        resolved_root = root.expanduser().resolve()
        home = Path.home().resolve()
        if not resolved_root.is_relative_to(home):
            err = CodevigilError(
                level=ErrorLevel.CRITICAL,
                source=ErrorSource.WATCHER,
                code="watcher.path_scope_violation",
                message=(
                    f"watcher root {str(resolved_root)!r} is outside the user "
                    f"home directory {str(home)!r}; refusing to walk"
                ),
                context={
                    "root": str(resolved_root),
                    "home": str(home),
                },
            )
            record(err)
            raise PrivacyViolationError(err.message)
        return resolved_root

    # ------------------------------------------------------------------- walk

    def _walk(self) -> _WalkResult:
        """Return the deterministic, capped list of session files under root.

        Walks the tree with ``os.scandir`` and collects every regular file
        ending in ``.jsonl``. Results are sorted by absolute path so the
        "first ``max_files``" slice is stable across polls and platforms.
        """

        discovered: list[Path] = []
        if not self._root.exists():
            return _WalkResult(files=[], overflowed=False, overflow_count=0)

        stack: list[Path] = [self._root]
        while stack:
            current = stack.pop()
            try:
                with os.scandir(current) as it:
                    entries = list(it)
            except (FileNotFoundError, NotADirectoryError, PermissionError):
                # Directory vanished or became unreadable mid-walk; skip it
                # rather than aborting the whole walk.
                continue
            for entry in entries:
                try:
                    # follow_symlinks=False: symlinked directories are not
                    # descended into and symlinked files are not collected,
                    # so the walk cannot escape the root via links.
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(Path(entry.path))
                    elif entry.is_file(follow_symlinks=False) and entry.name.endswith(".jsonl"):
                        discovered.append(Path(entry.path))
                except OSError:
                    continue

        discovered.sort(key=lambda p: str(p))
        if len(discovered) > self._max_files:
            return _WalkResult(
                files=discovered[: self._max_files],
                overflowed=True,
                overflow_count=len(discovered) - self._max_files,
            )
        return _WalkResult(files=discovered, overflowed=False, overflow_count=0)

    # ------------------------------------------------------------------- poll

    def poll(self) -> Iterator[SourceEvent]:
        """Yield one ``SourceEvent`` per state transition since the last call.

        The iterator is materialised eagerly into a list and returned via
        ``iter()``: the aggregator wants stable ordering and the disk reads
        happen inside this call, not lazily inside the consumer's loop.
        """

        events: list[SourceEvent] = []
        walk = self._walk()
        if walk.overflowed and not self._overflow_warned:
            # Warn once per source lifetime, not once per poll.
            self._overflow_warned = True
            record(
                CodevigilError(
                    level=ErrorLevel.WARN,
                    source=ErrorSource.WATCHER,
                    code="watcher.bounded_walk_overflow",
                    message=(
                        f"watcher walk exceeded max_files={self._max_files}; "
                        f"{walk.overflow_count} file(s) skipped"
                    ),
                    context={
                        "max_files": self._max_files,
                        "overflow_count": walk.overflow_count,
                        "root": str(self._root),
                    },
                )
            )

        seen_paths: set[Path] = set()
        for path in walk.files:
            seen_paths.add(path)
            try:
                st = os.stat(path)
            except OSError:
                # File vanished (or became unreadable) between scandir and
                # stat. Catching OSError rather than only FileNotFoundError
                # keeps a single chmod'd file or transient I/O error from
                # aborting the whole poll — _walk already tolerates
                # PermissionError the same way. A vanished path is reported
                # as DELETE on a later pass, once the walk stops returning it.
                continue
            if not stat.S_ISREG(st.st_mode):
                continue
            self._handle_path(path, st.st_ino, st.st_size, events)

        # Detect deletions: any cursored path that no longer appears in the
        # walk has been removed from the watched tree.
        deleted = [p for p in self._cursors if p not in seen_paths]
        for path in sorted(deleted, key=lambda p: str(p)):
            cursor = self._cursors.pop(path)
            events.append(
                SourceEvent(
                    kind=SourceEventKind.DELETE,
                    session_id=path.stem,
                    path=path,
                    inode=cursor.inode,
                    line=None,
                    timestamp=_now(),
                )
            )

        return iter(events)

    def close(self) -> None:
        """Drop the cursor table. No OS handles are held between polls."""

        self._cursors.clear()

    # --------------------------------------------------------------- internals

    def _handle_path(
        self,
        path: Path,
        inode: int,
        size: int,
        events: list[SourceEvent],
    ) -> None:
        """Dispatch one statted file through the five-case transition table."""

        cursor = self._cursors.get(path)
        if cursor is None:
            # Unknown path: NEW_SESSION, then APPEND per complete line.
            self._handle_new(path, inode, size, events)
            return
        if inode != cursor.inode:
            # Inode changed: the path was replaced (rotation). Reset the
            # cursor and re-read from byte 0; pending bytes from the old
            # inode are deliberately discarded along with the old cursor.
            events.append(
                SourceEvent(
                    kind=SourceEventKind.ROTATE,
                    session_id=path.stem,
                    path=path,
                    inode=inode,
                    line=None,
                    timestamp=_now(),
                )
            )
            self._cursors.pop(path, None)
            self._handle_new(path, inode, size, events, emit_new_session=False)
            return
        if size < cursor.size:
            # Same inode but smaller: rewritten in place (truncation).
            events.append(
                SourceEvent(
                    kind=SourceEventKind.TRUNCATE,
                    session_id=path.stem,
                    path=path,
                    inode=inode,
                    line=None,
                    timestamp=_now(),
                )
            )
            self._cursors.pop(path, None)
            self._handle_new(path, inode, size, events, emit_new_session=False)
            return
        if size > cursor.size:
            # Same inode, size grew: read the delta and emit APPENDs. Growth
            # is measured from the cursor offset (unread bytes), not from
            # the previous size.
            growth = size - cursor.offset
            self._maybe_warn_large_growth(path, cursor, growth)
            self._read_and_emit(path, cursor, size, events)
        # size == cursor.size: no transition; nothing to do this poll.

    def _handle_new(
        self,
        path: Path,
        inode: int,
        size: int,
        events: list[SourceEvent],
        *,
        emit_new_session: bool = True,
    ) -> None:
        """Install a fresh cursor for ``path`` and read it from byte 0.

        ``emit_new_session`` is False when called from the ROTATE/TRUNCATE
        branches, which have already emitted their own transition event.
        """

        cursor = FileCursor(path=path, inode=inode, size=0, offset=0, pending=b"")
        self._cursors[path] = cursor
        if emit_new_session:
            events.append(
                SourceEvent(
                    kind=SourceEventKind.NEW_SESSION,
                    session_id=path.stem,
                    path=path,
                    inode=inode,
                    line=None,
                    timestamp=_now(),
                )
            )
        if size > 0:
            # A brand-new cursor reads the entire file, so "growth" here is
            # the full current size.
            self._maybe_warn_large_growth(path, cursor, size)
            self._read_and_emit(path, cursor, size, events)

    def _maybe_warn_large_growth(
        self,
        path: Path,
        cursor: FileCursor,
        growth: int,
    ) -> None:
        """Record a one-shot WARN when a file grows past the threshold.

        The warning fires at most once per cursor (``large_file_warned``)
        and never blocks processing — the file is still read in full.
        """

        if growth <= self._large_file_warn_bytes or cursor.large_file_warned:
            return
        cursor.large_file_warned = True
        record(
            CodevigilError(
                level=ErrorLevel.WARN,
                source=ErrorSource.WATCHER,
                code="watcher.large_file_growth",
                message=(
                    f"file {str(path)!r} grew {growth} bytes in a single poll "
                    f"(threshold {self._large_file_warn_bytes}); processing anyway"
                ),
                context={
                    "path": str(path),
                    "growth": growth,
                    "threshold": self._large_file_warn_bytes,
                },
            )
        )

    def _read_and_emit(
        self,
        path: Path,
        cursor: FileCursor,
        new_size: int,
        events: list[SourceEvent],
    ) -> None:
        """Read from ``cursor.offset`` to ``new_size`` in 1 MiB chunks.

        Bytes read are appended to ``pending``; whenever ``pending`` contains
        a newline, every complete line is split off and emitted as an APPEND
        event. The trailing fragment (no newline yet) stays in ``pending``
        for the next poll.
        """

        try:
            handle = path.open("rb")
        except OSError:
            # Vanished or became unreadable between stat and open; a later
            # poll will observe and classify the new state. OSError (not
            # just FileNotFoundError) for consistency with poll()'s stat
            # handling.
            return
        with handle:
            handle.seek(cursor.offset)
            remaining = new_size - cursor.offset
            while remaining > 0:
                chunk = handle.read(min(_CHUNK_SIZE, remaining))
                if not chunk:
                    # Short read: the file shrank mid-read. Stop here; the
                    # next poll's stat will see the smaller size.
                    break
                cursor.pending += chunk
                remaining -= len(chunk)
                while b"\n" in cursor.pending:
                    line_bytes, _, rest = cursor.pending.partition(b"\n")
                    cursor.pending = rest
                    # errors="replace": a torn multi-byte sequence becomes
                    # U+FFFD instead of raising mid-poll.
                    line = line_bytes.decode("utf-8", errors="replace")
                    events.append(
                        SourceEvent(
                            kind=SourceEventKind.APPEND,
                            session_id=path.stem,
                            path=path,
                            inode=cursor.inode,
                            line=line,
                            timestamp=_now(),
                        )
                    )
            # Record the real file position rather than a computed offset:
            # after a short read, tell() is the true high-water mark.
            cursor.offset = handle.tell()
        cursor.size = new_size
444
+
445
+
446
# ``field`` is re-exported for symmetry with ``codevigil.types``: downstream
# phases that build on ``FileCursor`` may want ``dataclasses.field`` (e.g. a
# ``default_factory``) without taking a second dataclasses import.
# NOTE(review): exporting a stdlib helper from a watcher module is unusual —
# confirm downstream callers actually import it from here before removing.
__all__ = [
    "FileCursor",
    "PollingSource",
    "Source",
    "SourceEvent",
    "SourceEventKind",
    "field",
]