cc-transcript 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/Cargo.lock +1 -1
  2. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/PKG-INFO +4 -3
  3. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/README.md +1 -1
  4. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/__init__.py +1 -1
  5. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/discovery.py +23 -16
  6. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/filterspec.py +20 -0
  7. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/parser.py +19 -6
  8. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/sentiment/buckets.py +11 -0
  9. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/sentiment/lexicon.py +18 -0
  10. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/sentiment/messages.py +6 -0
  11. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/sentiment/scorespec.py +5 -0
  12. cc_transcript-0.5.0/cc_transcript/store.py +130 -0
  13. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/pyproject.toml +25 -4
  14. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/Cargo.toml +1 -1
  15. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/lib.rs +34 -16
  16. cc_transcript-0.4.0/cc_transcript/store.py +0 -118
  17. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/Cargo.toml +0 -0
  18. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/LICENSE +0 -0
  19. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/_parser_rs.pyi +0 -0
  20. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/backend.py +0 -0
  21. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/builders.py +0 -0
  22. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/filters.py +0 -0
  23. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/models.py +0 -0
  24. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/py.typed +0 -0
  25. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/rust.py +0 -0
  26. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/sentiment/__init__.py +0 -0
  27. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/cc_transcript/sentiment/engine.py +0 -0
  28. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/data/afinn-en-165.tsv +0 -0
  29. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/data/domain_overrides.tsv +0 -0
  30. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/event.rs +0 -0
  31. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/filter.rs +0 -0
  32. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/lexicon.rs +0 -0
  33. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/model.rs +0 -0
  34. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/score.rs +0 -0
  35. {cc_transcript-0.4.0 → cc_transcript-0.5.0}/rust/src/value.rs +0 -0
@@ -66,7 +66,7 @@ dependencies = [
66
66
 
67
67
  [[package]]
68
68
  name = "cc_transcript_parser"
69
- version = "0.4.0"
69
+ version = "0.5.0"
70
70
  dependencies = [
71
71
  "chrono",
72
72
  "crossbeam-channel",
@@ -1,16 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cc-transcript
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Classifier: Development Status :: 3 - Alpha
5
5
  Classifier: Intended Audience :: Developers
6
6
  Classifier: Operating System :: OS Independent
7
7
  Classifier: Programming Language :: Python :: 3
8
8
  Classifier: Programming Language :: Python :: 3 :: Only
9
9
  Classifier: Typing :: Typed
10
+ Requires-Dist: aiosqlite>=0.20
10
11
  Requires-Dist: anyio>=4.4
11
12
  Requires-Dist: orjson>=3.10
12
13
  Requires-Dist: pytest>=8.0 ; extra == 'dev'
13
- Requires-Dist: pyright>=1.1 ; extra == 'dev'
14
+ Requires-Dist: ty>=0.0.44 ; extra == 'dev'
14
15
  Requires-Dist: ruff>=0.8 ; extra == 'dev'
15
16
  Requires-Dist: spacy>=3.8 ; extra == 'lexicon'
16
17
  Requires-Dist: afinn>=0.1 ; extra == 'lexicon'
@@ -86,5 +87,5 @@ available — every rule is off by default, so a bare `FilterConfig()` passes ev
86
87
 
87
88
  ## Docs
88
89
 
89
- [Read the docs](https://yasyf.github.io/cc-transcript/) for the full guide and API reference.
90
+ [Read the docs](https://yasyf.github.io/cc-transcript/) for the full guides — Getting Started, Filtering events, Scoring sentiment, Rust/Python backends & parity, and Compose your own policy — plus the complete API reference.
90
91
 
@@ -55,4 +55,4 @@ available — every rule is off by default, so a bare `FilterConfig()` passes ev
55
55
 
56
56
  ## Docs
57
57
 
58
- [Read the docs](https://yasyf.github.io/cc-transcript/) for the full guide and API reference.
58
+ [Read the docs](https://yasyf.github.io/cc-transcript/) for the full guides — Getting Started, Filtering events, Scoring sentiment, Rust/Python backends & parity, and Compose your own policy — plus the complete API reference.
@@ -56,5 +56,5 @@ from cc_transcript.models import (
56
56
  TranscriptEvent,
57
57
  UserEvent,
58
58
  )
59
- from cc_transcript.parser import TranscriptParser, parse_events, parse_events_from_bytes
59
+ from cc_transcript.parser import TranscriptParser, parse_events_async, parse_events_from_bytes
60
60
  from cc_transcript.store import FileStateStore
@@ -2,6 +2,8 @@ from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
4
 
5
+ import anyio
6
+
5
7
  CLAUDE_PROJECTS_DIR = Path.home() / ".claude" / "projects"
6
8
 
7
9
 
@@ -14,26 +16,27 @@ class TranscriptDiscovery:
14
16
  """
15
17
 
16
18
  @staticmethod
17
- def find_transcripts() -> list[Path]:
19
+ async def find_transcripts() -> list[Path]:
18
20
  """Returns every transcript under the projects directory, sorted."""
19
- if not CLAUDE_PROJECTS_DIR.exists():
21
+ root = anyio.Path(CLAUDE_PROJECTS_DIR)
22
+ if not await root.exists():
20
23
  return []
21
- return sorted(CLAUDE_PROJECTS_DIR.rglob("*.jsonl"))
24
+ return sorted([Path(p) async for p in root.rglob("*.jsonl")])
22
25
 
23
26
  @staticmethod
24
- def stat_mtime(path: Path) -> float | None:
27
+ async def stat_mtime(path: Path) -> float | None:
25
28
  try:
26
- return path.stat().st_mtime
29
+ return (await anyio.Path(path).stat()).st_mtime
27
30
  except OSError:
28
31
  return None
29
32
 
30
33
  @staticmethod
31
- def transcript_mtime(path: Path) -> float:
34
+ async def transcript_mtime(path: Path) -> float:
32
35
  """Returns ``path``'s modification time, raising if it cannot be read."""
33
- return path.stat().st_mtime
36
+ return (await anyio.Path(path).stat()).st_mtime
34
37
 
35
38
  @staticmethod
36
- def find_in(
39
+ async def find_in(
37
40
  directory: Path,
38
41
  *,
39
42
  name_contains: str | None = None,
@@ -52,14 +55,18 @@ class TranscriptDiscovery:
52
55
  Returns:
53
56
  Pairs of ``(path, mtime)`` sorted by path.
54
57
  """
55
- if not directory.exists():
58
+ root = anyio.Path(directory)
59
+ if not await root.exists():
56
60
  return []
57
- found = [
58
- (p, mtime)
59
- for p in directory.rglob("*.jsonl")
60
- if not name_contains or name_contains in p.name
61
- if (mtime := TranscriptDiscovery.stat_mtime(p)) is not None
62
- if known_mtimes is None or (prev := known_mtimes.get(str(p))) is None or prev < mtime
63
- ]
61
+ found: list[tuple[Path, float]] = []
62
+ async for entry in root.rglob("*.jsonl"):
63
+ if name_contains and name_contains not in entry.name:
64
+ continue
65
+ path = Path(entry)
66
+ if (mtime := await TranscriptDiscovery.stat_mtime(path)) is None:
67
+ continue
68
+ if known_mtimes is not None and (prev := known_mtimes.get(str(path))) is not None and prev >= mtime:
69
+ continue
70
+ found.append((path, mtime))
64
71
  found.sort(key=lambda e: e[0])
65
72
  return found[:limit] if limit is not None else found
@@ -63,11 +63,28 @@ STRUCTURAL_GROUPS: tuple[tuple[str, str], ...] = (
63
63
  AGENT_INJECTION_GROUPS: tuple[tuple[str, str], ...] = (
64
64
  ("xml_tags_extra", r"<(?:teammate-message|scheduled-task)\b"),
65
65
  ("augment_agent", r"^# Augment Agent\b"),
66
+ ("role_reminder", r"^\s*\[Role Reminder\b"),
66
67
  )
67
68
 
68
69
  INTERRUPT_MARKER_GROUPS: tuple[tuple[str, str], ...] = (("interrupt", r"\[Request interrupted by user"),)
69
70
  STOP_HOOK_GROUPS: tuple[tuple[str, str], ...] = (("stop_hook", r"Stop hook feedback:"),)
70
71
 
72
+ # Approve-and-advance directives: a user telling the agent to proceed/commit/push or
73
+ # to resume killed work. They follow an assistant turn but advance it rather than
74
+ # correcting it — the opposite of pushback — so a pushback consumer drops them. The
75
+ # approve-and-advance arm is start-anchored so a mid-sentence "commit"/"push" inside
76
+ # a real correction never matches; only the resume arm searches anywhere.
77
+ CONTINUATION_GROUPS: tuple[tuple[str, str], ...] = (
78
+ (
79
+ "continuation",
80
+ r"^\s*(?:(?:yea+h?|yep|yup|sure|ok(?:ay)?|sounds good|looks good|lgtm|perfect)[\s,.!]+){0,2}"
81
+ r"(?:go ahead\b|(?:go ahead and\s+)?(?:commit|push|rebase|merge|deploy)\b"
82
+ r"|ship it\b|cut (?:a |the )?(?:new )?release\b|proceed\b)"
83
+ r"|\byou must resume\b"
84
+ r"|\b(?:resume|restart) (?:them|it|the (?:sub-?agents?|workflows?|agents?|tasks?))\b",
85
+ ),
86
+ )
87
+
71
88
  # Named junk categories a consumer composes via ``drop_junk(...)``. Interrupt and
72
89
  # stop-hook are kept separate because they carry pushback and must never be folded
73
90
  # into the structural-noise default.
@@ -76,6 +93,7 @@ JUNK_CATEGORIES: dict[str, tuple[tuple[str, str], ...]] = {
76
93
  "agent_injection": AGENT_INJECTION_GROUPS,
77
94
  "interrupt": INTERRUPT_MARKER_GROUPS,
78
95
  "stop_hook": STOP_HOOK_GROUPS,
96
+ "continuation": CONTINUATION_GROUPS,
79
97
  }
80
98
 
81
99
  # The superset of structural noise (structural ∪ agent-injection), WITHOUT
@@ -125,6 +143,7 @@ PORTABLE_GROUP_NAMES: frozenset[str] = frozenset(
125
143
  *STRUCTURAL_NOISE_GROUPS,
126
144
  *INTERRUPT_MARKER_GROUPS,
127
145
  *STOP_HOOK_GROUPS,
146
+ *CONTINUATION_GROUPS,
128
147
  *FRUSTRATION_GROUPS,
129
148
  *MILD_IMPATIENCE_GROUPS,
130
149
  )
@@ -398,6 +417,7 @@ def keep(event: TranscriptEvent, spec: FilterSpec) -> bool:
398
417
 
399
418
 
400
419
  def labels_for(event: TranscriptEvent, spec: FilterSpec) -> tuple[str, ...]:
420
+ """Returns the TAG labels ``spec`` records for ``event``, in clause order."""
401
421
  kind = event_kind(event)
402
422
  return tuple(
403
423
  clause.label
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import os
5
+ from contextlib import suppress
4
6
  from datetime import datetime
5
7
  from typing import TYPE_CHECKING, Any, ClassVar, Literal
6
8
 
@@ -144,12 +146,12 @@ def decode_line(line: bytes) -> TranscriptEvent | None:
144
146
  return build_event(data)
145
147
 
146
148
 
147
- def parse_events(path: Path) -> list[TranscriptEvent]:
148
- return parse_events_from_bytes(path.read_bytes())
149
+ async def parse_events_async(path: Path) -> list[TranscriptEvent]:
150
+ return parse_events_from_bytes(await anyio.Path(path).read_bytes())
149
151
 
150
152
 
151
153
  def parse_one(path: Path, mtime: float) -> ParsedTranscript:
152
- return ParsedTranscript(path=path, mtime=mtime, events=tuple(parse_events(path)))
154
+ return ParsedTranscript(path=path, mtime=mtime, events=tuple(parse_events_from_bytes(path.read_bytes())))
153
155
 
154
156
 
155
157
  def parse_one_filtered(path: Path, mtime: float, spec: FilterSpec | None) -> ParsedTranscript:
@@ -192,7 +194,14 @@ class PythonBackend:
192
194
 
193
195
  async def worker(path: Path, mtime: float) -> None:
194
196
  async with limiter:
195
- await send_ch.send(await anyio.to_thread.run_sync(parse_one_filtered, path, mtime, spec))
197
+ try:
198
+ parsed = await anyio.to_thread.run_sync(parse_one_filtered, path, mtime, spec)
199
+ except (OSError, ValueError, KeyError):
200
+ return
201
+ try:
202
+ await send_ch.send(parsed)
203
+ except anyio.BrokenResourceError:
204
+ return
196
205
 
197
206
  async def drive() -> None:
198
207
  try:
@@ -202,11 +211,15 @@ class PythonBackend:
202
211
  finally:
203
212
  await send_ch.aclose()
204
213
 
205
- async with anyio.create_task_group() as outer:
206
- outer.start_soon(drive)
214
+ driver = asyncio.ensure_future(drive())
215
+ try:
207
216
  async with recv_ch:
208
217
  async for parsed in recv_ch:
209
218
  yield parsed
219
+ finally:
220
+ driver.cancel()
221
+ with suppress(asyncio.CancelledError):
222
+ await driver
210
223
 
211
224
 
212
225
  class TranscriptParser:
@@ -16,6 +16,8 @@ MIN_USER_CHARS = 5
16
16
 
17
17
 
18
18
  class ConversationBucket(NamedTuple):
19
+ """A session's messages grouped into one fixed-width time window — the unit that gets scored."""
20
+
19
21
  session_id: SessionId
20
22
  bucket_index: BucketIndex
21
23
  bucket_start: datetime
@@ -23,11 +25,19 @@ class ConversationBucket(NamedTuple):
23
25
 
24
26
 
25
27
  class BucketKey(NamedTuple):
28
+ """Stable identity of a :class:`ConversationBucket`: its session and bucket index."""
29
+
26
30
  session_id: SessionId
27
31
  bucket_index: BucketIndex
28
32
 
29
33
 
30
34
  class ConversationBucketer:
35
+ """Groups transcript messages into per-session, time-aligned buckets worth scoring.
36
+
37
+ Sessions below ``MIN_USER_TURNS_PER_SESSION`` and windows lacking a substantive user turn or
38
+ any assistant turn are dropped.
39
+ """
40
+
31
41
  @staticmethod
32
42
  def align_to_bucket(ts: datetime) -> datetime:
33
43
  return ts.replace(
@@ -73,6 +83,7 @@ class ConversationBucketer:
73
83
 
74
84
 
75
85
  def extract_bucket_keys(messages: list[TranscriptMessage]) -> list[BucketKey]:
86
+ """Returns the :class:`BucketKey` of every scorable bucket in ``messages``."""
76
87
  return [
77
88
  BucketKey(session_id=b.session_id, bucket_index=b.bucket_index)
78
89
  for b in ConversationBucketer.bucket_messages(messages)
@@ -38,6 +38,13 @@ def rust_lexicon() -> ModuleType | None:
38
38
 
39
39
 
40
40
  class Lexicon:
41
+ """Token-polarity lookup: AFINN base scores layered with coding-domain overrides.
42
+
43
+ ``DOMAIN_OVERRIDES`` pins context-specific terms (``stop``, ``broken``, ``ship``) that
44
+ AFINN mis-scores, and magnitudes below ``MIN_MAGNITUDE`` collapse to neutral. Backs the
45
+ lexicon-bearing score stages through :meth:`has_hit`.
46
+ """
47
+
41
48
  DOMAIN_OVERRIDES: ClassVar[dict[str, int]] = {
42
49
  "stop": -3,
43
50
  "halt": -3,
@@ -104,6 +111,11 @@ class Lexicon:
104
111
 
105
112
  @classmethod
106
113
  def polarity(cls, lemma: str) -> int:
114
+ """The signed polarity of ``lemma``.
115
+
116
+ A domain override when present, else its AFINN score zeroed below
117
+ ``MIN_MAGNITUDE``.
118
+ """
107
119
  lower = lemma.lower()
108
120
  if (override := cls.DOMAIN_OVERRIDES.get(lower)) is not None:
109
121
  return override
@@ -129,6 +141,12 @@ class Lexicon:
129
141
 
130
142
 
131
143
  class NLP:
144
+ """Lazy loader for the spaCy ``en_core_web_sm`` model used to lemmatize text.
145
+
146
+ Loads from the user spaCy cache, downloading the model on first use; on failure it records
147
+ the diagnostic and disables itself so the lexicon path fails open.
148
+ """
149
+
132
150
  model: ClassVar[spacy.language.Language | None] = None
133
151
  failed: ClassVar[bool] = False
134
152
  last_download_output: ClassVar[str | None] = None
@@ -7,11 +7,15 @@ from cc_transcript.models import SessionId
7
7
 
8
8
 
9
9
  class ToolCall(NamedTuple):
10
+ """A single tool invocation within a message: the tool ``name`` and optional target file path."""
11
+
10
12
  name: str
11
13
  file_path: str | None = None
12
14
 
13
15
 
14
16
  class UserMessage(NamedTuple):
17
+ """A user turn distilled for bucketing: its text, tool calls, and authoring metadata."""
18
+
15
19
  content: str
16
20
  timestamp: datetime
17
21
  session_id: SessionId
@@ -23,6 +27,8 @@ class UserMessage(NamedTuple):
23
27
 
24
28
 
25
29
  class AssistantMessage(NamedTuple):
30
+ """An assistant turn distilled for bucketing: its text, tool calls, and responding model."""
31
+
26
32
  content: str
27
33
  timestamp: datetime
28
34
  session_id: SessionId
@@ -91,24 +91,29 @@ class ScoreSpec:
91
91
 
92
92
 
93
93
  def flag_frustration(*, score: int = 1) -> FrustrationShortCircuit:
94
+ """Composes the short-circuit stage that pins a frustrated message to ``score`` before inference."""
94
95
  return FrustrationShortCircuit(groups=FRUSTRATION_GROUPS, score=score)
95
96
 
96
97
 
97
98
  def clamp_positive(*, floor: int = 3, max_words: int = SHORT_MESSAGE_MAX_WORDS) -> PositiveClamp:
99
+ """Composes the post-process stage that lowers a top score on a short message lacking positive lexicon."""
98
100
  return PositiveClamp(positive_floor=floor, max_words=max_words)
99
101
 
100
102
 
101
103
  def demote_mild_irritation(*, floor: int = 3) -> MildIrritationDemote:
104
+ """Composes the post-process stage that softens a non-hostile mild-impatience message off the floor score."""
102
105
  return MildIrritationDemote(
103
106
  trigger_groups=MILD_IMPATIENCE_GROUPS, hostile_groups=FRUSTRATION_GROUPS, hostile_floor=floor
104
107
  )
105
108
 
106
109
 
107
110
  def clamp_resume() -> ResumeClamp:
111
+ """Composes the post-process stage that neutralizes a bare resume phrase to a middling score."""
108
112
  return ResumeClamp(phrases=RESUME_PHRASE_SET)
109
113
 
110
114
 
111
115
  def build_score_spec(*stages: ScoreStage) -> ScoreSpec:
116
+ """Assembles ``stages`` into a :class:`ScoreSpec` for the engine to apply around inference."""
112
117
  return ScoreSpec(stages=tuple(stages))
113
118
 
114
119
 
@@ -0,0 +1,130 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import asynccontextmanager
4
+ from typing import TYPE_CHECKING, Self
5
+
6
+ import aiosqlite
7
+ import anyio
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import AsyncIterator
11
+ from pathlib import Path
12
+ from types import TracebackType
13
+
14
+ FILE_SCHEMA = """
15
+ CREATE TABLE IF NOT EXISTS files (
16
+ path TEXT PRIMARY KEY,
17
+ mtime REAL NOT NULL
18
+ );
19
+ """
20
+
21
+
22
+ class FileStateStore:
23
+ """Tracks which transcript files have been ingested, keyed by mtime.
24
+
25
+ Backed by a single async SQLite (``aiosqlite``) database with WAL journaling
26
+ and a task lock, so it is safe to share one store across concurrent tasks.
27
+ Consumers compose their own writes alongside :meth:`record_file` inside
28
+ :meth:`transaction` to keep ingestion state and derived records atomic.
29
+
30
+ Example:
31
+ >>> store = await FileStateStore.open(Path("state.db"), extra_schema=MY_SCHEMA)
32
+ >>> async with store.transaction() as conn:
33
+ ... await conn.execute("INSERT INTO my_table VALUES (?)", (value,))
34
+ ... await store.record_file(str(path), mtime)
35
+ """
36
+
37
+ def __init__(self, conn: aiosqlite.Connection) -> None:
38
+ self.conn = conn
39
+ self.lock = anyio.Lock()
40
+ self._txn_owner: int | None = None
41
+
42
+ @classmethod
43
+ async def open(cls, path: Path, *, extra_schema: str = "") -> Self:
44
+ """Opens (creating if needed) the store at ``path``.
45
+
46
+ Args:
47
+ path: The database file path; its parent is created if absent.
48
+ extra_schema: Additional DDL to execute after the file schema,
49
+ e.g. consumer tables that reference ``files(path)``.
50
+
51
+ Returns:
52
+ The opened store.
53
+ """
54
+ path.parent.mkdir(parents=True, exist_ok=True)
55
+ conn = await aiosqlite.connect(str(path), isolation_level=None)
56
+ conn.row_factory = aiosqlite.Row
57
+ await conn.execute("PRAGMA foreign_keys = ON")
58
+ await conn.execute("PRAGMA journal_mode = WAL")
59
+ await conn.executescript(FILE_SCHEMA + extra_schema)
60
+ return cls(conn)
61
+
62
+ async def close(self) -> None:
63
+ """Closes the underlying connection."""
64
+ async with self.lock:
65
+ await self.conn.close()
66
+
67
+ async def __aenter__(self) -> Self:
68
+ return self
69
+
70
+ async def __aexit__(
71
+ self,
72
+ exc_type: type[BaseException] | None,
73
+ exc: BaseException | None,
74
+ tb: TracebackType | None,
75
+ ) -> None:
76
+ await self.close()
77
+
78
+ @asynccontextmanager
79
+ async def transaction(self) -> AsyncIterator[aiosqlite.Connection]:
80
+ """Yields the locked connection inside a single committed transaction.
81
+
82
+ Use this to compose consumer writes with :meth:`record_file` so they
83
+ commit or roll back together. :meth:`record_file` called within the
84
+ block joins this transaction instead of opening its own.
85
+
86
+ Yields:
87
+ The store's connection, held under the store lock.
88
+ """
89
+ async with self.lock:
90
+ self._txn_owner = anyio.get_current_task().id
91
+ await self.conn.execute("BEGIN IMMEDIATE")
92
+ try:
93
+ yield self.conn
94
+ except BaseException:
95
+ await self.conn.rollback()
96
+ raise
97
+ else:
98
+ await self.conn.commit()
99
+ finally:
100
+ self._txn_owner = None
101
+
102
+ async def file_mtimes(self) -> dict[str, float]:
103
+ """Returns the recorded ``path`` to ``mtime`` map."""
104
+ async with self.lock, self.conn.execute("SELECT path, mtime FROM files") as cur:
105
+ return {row["path"]: row["mtime"] async for row in cur}
106
+
107
+ async def record_file(self, path: str, mtime: float) -> None:
108
+ """Upserts the recorded mtime for ``path``.
109
+
110
+ Call inside :meth:`transaction` to commit alongside consumer writes;
111
+ called on its own it commits immediately.
112
+ """
113
+ if self._txn_owner == anyio.get_current_task().id:
114
+ await self.upsert_file(path, mtime)
115
+ return
116
+ async with self.lock:
117
+ await self.conn.execute("BEGIN IMMEDIATE")
118
+ try:
119
+ await self.upsert_file(path, mtime)
120
+ except BaseException:
121
+ await self.conn.rollback()
122
+ raise
123
+ else:
124
+ await self.conn.commit()
125
+
126
+ async def upsert_file(self, path: str, mtime: float) -> None:
127
+ await self.conn.execute(
128
+ "INSERT INTO files(path, mtime) VALUES(?, ?) ON CONFLICT(path) DO UPDATE SET mtime = excluded.mtime",
129
+ (path, mtime),
130
+ )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "cc-transcript"
3
- version = "0.4.0"
3
+ version = "0.5.0"
4
4
  description = "Typed events for Claude Code transcripts: discovery, a superset JSONL parser (Python + Rust), and ingestion-state tracking."
5
5
  readme = "README.md"
6
6
  license = "PolyForm-Noncommercial-1.0.0"
@@ -17,6 +17,7 @@ classifiers = [
17
17
  ]
18
18
  requires-python = ">=3.13"
19
19
  dependencies = [
20
+ "aiosqlite>=0.20",
20
21
  "anyio>=4.4",
21
22
  "orjson>=3.10",
22
23
  ]
@@ -24,7 +25,7 @@ dependencies = [
24
25
  [project.optional-dependencies]
25
26
  dev = [
26
27
  "pytest>=8.0",
27
- "pyright>=1.1",
28
+ "ty>=0.0.44",
28
29
  "ruff>=0.8",
29
30
  ]
30
31
  lexicon = [
@@ -57,12 +58,32 @@ markers = [
57
58
  "integration: Integration tests",
58
59
  ]
59
60
 
61
+ # ty (Astral) is the default type checker — run `uv run ty check cc_transcript`.
62
+ # It is fast, understands modern syntax, and avoids the strict-pyright false
63
+ # positives on pydantic/attrs-style dynamic defaults and PK-type overrides.
64
+ [tool.ty.rules]
65
+ # Keep cross-checker `# type: ignore` / `# pyright: ignore` comments from tripping ty.
66
+ unused-type-ignore-comment = "ignore"
67
+ unresolved-import = "ignore"
68
+
69
+ # pyright is kept as a secondary checker (editors / `uvx pyright`). Basic mode plus
70
+ # a few disables covers the noise; ty is the gate that runs in CI.
60
71
  [tool.pyright]
61
72
  pythonVersion = "3.13"
62
- typeCheckingMode = "strict"
73
+ typeCheckingMode = "basic"
63
74
  include = ["cc_transcript"]
64
75
  venvPath = "."
65
76
  venv = ".venv"
77
+ reportImplicitOverride = "none"
78
+ reportIncompatibleVariableOverride = "none"
79
+ reportUnknownVariableType = "none"
80
+ reportUnknownMemberType = "none"
81
+ reportUnknownArgumentType = "none"
82
+ reportUnknownParameterType = "none"
83
+ reportUnknownLambdaType = "none"
84
+ reportMissingTypeArgument = "none"
85
+ reportPrivateImportUsage = "none"
86
+ reportUnusedCallResult = "none"
66
87
 
67
88
  [tool.ruff]
68
89
  line-length = 120
@@ -73,7 +94,7 @@ src = [".", "tests"]
73
94
  select = ["E", "F", "I", "UP"]
74
95
 
75
96
  [tool.ruff.lint.per-file-ignores]
76
- "**/__init__.py" = ["F401"]
97
+ "__init__.py" = ["F401"]
77
98
 
78
99
  [dependency-groups]
79
100
  docs = [
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "cc_transcript_parser"
3
- version = "0.4.0"
3
+ version = "0.5.0"
4
4
  edition = "2021"
5
5
 
6
6
  [lib]
@@ -96,30 +96,48 @@ pub struct ParseStream {
96
96
 
97
97
  #[pymethods]
98
98
  impl ParseStream {
99
+ // A file whose events cannot be materialized (e.g. a malformed line missing a
100
+ // required field) is silently skipped — whole-file parity with PythonBackend.
99
101
  fn recv<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyAny>>> {
100
- match py.detach(|| self.rx.recv().ok()) {
101
- None => Ok(None),
102
- Some(pf) => Ok(Some(parsed_file_to_py(py, pf)?)),
102
+ loop {
103
+ match py.detach(|| self.rx.recv().ok()) {
104
+ None => return Ok(None),
105
+ Some(pf) => {
106
+ if let Ok(obj) = parsed_file_to_py(py, pf) {
107
+ return Ok(Some(obj));
108
+ }
109
+ }
110
+ }
103
111
  }
104
112
  }
105
113
 
106
114
  fn recv_many<'py>(&self, py: Python<'py>, max: usize) -> PyResult<Vec<Bound<'py, PyAny>>> {
107
- py.detach(|| {
108
- let mut out: Vec<ParsedFile> = Vec::new();
109
- if let Ok(pf) = self.rx.recv() {
110
- out.push(pf);
111
- while out.len() < max {
112
- match self.rx.try_recv() {
113
- Ok(pf) => out.push(pf),
114
- Err(_) => break,
115
+ let mut out: Vec<Bound<'py, PyAny>> = Vec::new();
116
+ // Block for the first materialized file; return [] only when the channel
117
+ // is genuinely closed, so an all-skipped batch never reads as "done".
118
+ loop {
119
+ match py.detach(|| self.rx.recv().ok()) {
120
+ None => return Ok(out),
121
+ Some(pf) => {
122
+ if let Ok(obj) = parsed_file_to_py(py, pf) {
123
+ out.push(obj);
124
+ break;
125
+ }
126
+ }
127
+ }
128
+ }
129
+ // Drain what is already buffered without blocking, skipping bad files.
130
+ while out.len() < max {
131
+ match py.detach(|| self.rx.try_recv().ok()) {
132
+ None => break,
133
+ Some(pf) => {
134
+ if let Ok(obj) = parsed_file_to_py(py, pf) {
135
+ out.push(obj);
115
136
  }
116
137
  }
117
138
  }
118
- out
119
- })
120
- .into_iter()
121
- .map(|pf| parsed_file_to_py(py, pf))
122
- .collect()
139
+ }
140
+ Ok(out)
123
141
  }
124
142
  }
125
143
 
@@ -1,118 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import sqlite3
4
- import threading
5
- from contextlib import contextmanager
6
- from typing import TYPE_CHECKING, Self
7
-
8
- if TYPE_CHECKING:
9
- from collections.abc import Generator
10
- from pathlib import Path
11
- from types import TracebackType
12
-
13
- FILE_SCHEMA = """
14
- CREATE TABLE IF NOT EXISTS files (
15
- path TEXT PRIMARY KEY,
16
- mtime REAL NOT NULL
17
- );
18
- """
19
-
20
-
21
- class FileStateStore:
22
- """Tracks which transcript files have been ingested, keyed by mtime.
23
-
24
- Backed by a single SQLite database with WAL journaling and a process-wide
25
- lock, so it is safe to share one store across threads. Consumers compose
26
- their own writes alongside :meth:`record_file` inside :meth:`transaction`
27
- to keep ingestion state and derived records atomic.
28
-
29
- Example:
30
- >>> store = FileStateStore.open(Path("state.db"), extra_schema=MY_SCHEMA)
31
- >>> with store.transaction() as conn:
32
- ... conn.execute("INSERT INTO my_table VALUES (?)", (value,))
33
- ... store.record_file(str(path), mtime)
34
- """
35
-
36
- def __init__(self, conn: sqlite3.Connection) -> None:
37
- self.conn = conn
38
- self.lock = threading.RLock()
39
- self._in_transaction = False
40
-
41
- @classmethod
42
- def open(cls, path: Path, *, extra_schema: str = "") -> Self:
43
- """Opens (creating if needed) the store at ``path``.
44
-
45
- Args:
46
- path: The database file path; its parent is created if absent.
47
- extra_schema: Additional DDL to execute after the file schema,
48
- e.g. consumer tables that reference ``files(path)``.
49
-
50
- Returns:
51
- The opened store.
52
- """
53
- path.parent.mkdir(parents=True, exist_ok=True)
54
- conn = sqlite3.connect(str(path), check_same_thread=False)
55
- conn.row_factory = sqlite3.Row
56
- conn.execute("PRAGMA foreign_keys = ON")
57
- conn.execute("PRAGMA journal_mode = WAL")
58
- conn.executescript(FILE_SCHEMA + extra_schema)
59
- conn.commit()
60
- return cls(conn)
61
-
62
- def close(self) -> None:
63
- """Closes the underlying connection."""
64
- with self.lock:
65
- self.conn.close()
66
-
67
- def __enter__(self) -> Self:
68
- return self
69
-
70
- def __exit__(
71
- self,
72
- exc_type: type[BaseException] | None,
73
- exc: BaseException | None,
74
- tb: TracebackType | None,
75
- ) -> None:
76
- self.close()
77
-
78
- @contextmanager
79
- def transaction(self) -> Generator[sqlite3.Connection]:
80
- """Yields the locked connection inside a single committed transaction.
81
-
82
- Use this to compose consumer writes with :meth:`record_file` so they
83
- commit or roll back together. :meth:`record_file` called within the
84
- block joins this transaction instead of opening its own.
85
-
86
- Yields:
87
- The store's connection, held under the store lock.
88
- """
89
- with self.lock, self.conn:
90
- self._in_transaction = True
91
- try:
92
- yield self.conn
93
- finally:
94
- self._in_transaction = False
95
-
96
- def file_mtimes(self) -> dict[str, float]:
97
- """Returns the recorded ``path`` to ``mtime`` map."""
98
- with self.lock:
99
- return {row["path"]: row["mtime"] for row in self.conn.execute("SELECT path, mtime FROM files")}
100
-
101
- def record_file(self, path: str, mtime: float) -> None:
102
- """Upserts the recorded mtime for ``path``.
103
-
104
- Call inside :meth:`transaction` to commit alongside consumer writes;
105
- called on its own it commits immediately.
106
- """
107
- with self.lock:
108
- if self._in_transaction:
109
- self.upsert_file(path, mtime)
110
- return
111
- with self.conn:
112
- self.upsert_file(path, mtime)
113
-
114
- def upsert_file(self, path: str, mtime: float) -> None:
115
- self.conn.execute(
116
- "INSERT INTO files(path, mtime) VALUES(?, ?) ON CONFLICT(path) DO UPDATE SET mtime = excluded.mtime",
117
- (path, mtime),
118
- )
File without changes
File without changes