cc-transcript 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/Cargo.lock +40 -1
  2. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/PKG-INFO +16 -11
  3. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/README.md +12 -10
  4. cc_transcript-0.3.0/cc_transcript/__init__.py +127 -0
  5. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/_parser_rs.pyi +8 -2
  6. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/backend.py +5 -0
  7. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/discovery.py +5 -0
  8. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/filters.py +58 -44
  9. cc_transcript-0.3.0/cc_transcript/filterspec.py +538 -0
  10. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/parser.py +16 -2
  11. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/rust.py +14 -3
  12. cc_transcript-0.3.0/cc_transcript/sentiment/__init__.py +62 -0
  13. cc_transcript-0.3.0/cc_transcript/sentiment/buckets.py +92 -0
  14. cc_transcript-0.3.0/cc_transcript/sentiment/engine.py +79 -0
  15. cc_transcript-0.3.0/cc_transcript/sentiment/lexicon.py +133 -0
  16. cc_transcript-0.3.0/cc_transcript/sentiment/messages.py +40 -0
  17. cc_transcript-0.3.0/cc_transcript/sentiment/scorefilters.py +149 -0
  18. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/pyproject.toml +15 -2
  19. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/Cargo.toml +2 -1
  20. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/src/event.rs +2 -19
  21. cc_transcript-0.3.0/rust/src/filter.rs +278 -0
  22. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/src/lib.rs +27 -7
  23. cc_transcript-0.3.0/rust/src/value.rs +44 -0
  24. cc_transcript-0.1.0/cc_transcript/__init__.py +0 -60
  25. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/Cargo.toml +0 -0
  26. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/LICENSE +0 -0
  27. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/models.py +0 -0
  28. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/py.typed +0 -0
  29. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/store.py +0 -0
  30. {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/src/model.rs +0 -0
@@ -15,6 +15,15 @@ dependencies = [
15
15
  "zerocopy",
16
16
  ]
17
17
 
18
+ [[package]]
19
+ name = "aho-corasick"
20
+ version = "1.1.4"
21
+ source = "registry+https://github.com/rust-lang/crates.io-index"
22
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
23
+ dependencies = [
24
+ "memchr",
25
+ ]
26
+
18
27
  [[package]]
19
28
  name = "autocfg"
20
29
  version = "1.5.1"
@@ -35,7 +44,7 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
35
44
 
36
45
  [[package]]
37
46
  name = "cc_transcript_parser"
38
- version = "0.1.0"
47
+ version = "0.3.0"
39
48
  dependencies = [
40
49
  "chrono",
41
50
  "crossbeam-channel",
@@ -44,6 +53,7 @@ dependencies = [
44
53
  "once_cell",
45
54
  "pyo3",
46
55
  "rayon",
56
+ "regex",
47
57
  "serde",
48
58
  "sonic-rs",
49
59
  ]
@@ -424,6 +434,35 @@ dependencies = [
424
434
  "syn",
425
435
  ]
426
436
 
437
+ [[package]]
438
+ name = "regex"
439
+ version = "1.12.3"
440
+ source = "registry+https://github.com/rust-lang/crates.io-index"
441
+ checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
442
+ dependencies = [
443
+ "aho-corasick",
444
+ "memchr",
445
+ "regex-automata",
446
+ "regex-syntax",
447
+ ]
448
+
449
+ [[package]]
450
+ name = "regex-automata"
451
+ version = "0.4.14"
452
+ source = "registry+https://github.com/rust-lang/crates.io-index"
453
+ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
454
+ dependencies = [
455
+ "aho-corasick",
456
+ "memchr",
457
+ "regex-syntax",
458
+ ]
459
+
460
+ [[package]]
461
+ name = "regex-syntax"
462
+ version = "0.8.10"
463
+ source = "registry+https://github.com/rust-lang/crates.io-index"
464
+ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
465
+
427
466
  [[package]]
428
467
  name = "rend"
429
468
  version = "0.5.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cc-transcript
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Classifier: Development Status :: 3 - Alpha
5
5
  Classifier: Intended Audience :: Developers
6
6
  Classifier: Operating System :: OS Independent
@@ -12,7 +12,10 @@ Requires-Dist: orjson>=3.10
12
12
  Requires-Dist: pytest>=8.0 ; extra == 'dev'
13
13
  Requires-Dist: pyright>=1.1 ; extra == 'dev'
14
14
  Requires-Dist: ruff>=0.8 ; extra == 'dev'
15
+ Requires-Dist: spacy>=3.8 ; extra == 'lexicon'
16
+ Requires-Dist: afinn>=0.1 ; extra == 'lexicon'
15
17
  Provides-Extra: dev
18
+ Provides-Extra: lexicon
16
19
  License-File: LICENSE
17
20
  Summary: Typed events for Claude Code transcripts: discovery, a superset JSONL parser (Python + Rust), and ingestion-state tracking.
18
21
  Keywords:
@@ -31,16 +34,16 @@ Project-URL: Repository, https://github.com/yasyf/cc-transcript
31
34
  [![PyPI](https://img.shields.io/pypi/v/cc-transcript.svg)](https://pypi.org/project/cc-transcript/)
32
35
  [![Python](https://img.shields.io/pypi/pyversions/cc-transcript.svg)](https://pypi.org/project/cc-transcript/)
33
36
  [![Docs](https://img.shields.io/github/actions/workflow/status/yasyf/cc-transcript/docs.yml?branch=main&label=docs)](https://yasyf.github.io/cc-transcript/)
34
- [![License: PolyForm-Noncommercial-1.0.0](https://img.shields.io/badge/License-PolyForm-Noncommercial-1.0.0-blue.svg)](https://github.com/yasyf/cc-transcript/blob/main/LICENSE)
37
+ [![License: PolyForm Noncommercial](https://img.shields.io/badge/License-PolyForm--Noncommercial--1.0.0-blue.svg)](https://github.com/yasyf/cc-transcript/blob/main/LICENSE)
35
38
 
36
- The shared transcript-parsing core extracted from [cc-sentiment](https://github.com/yasyf/cc-sentiment), now powering cc-pushback, cc-sentiment, and captain-hook. It parses Claude Code's on-disk JSONL transcripts into a **typed superset event model** — every entry type preserved, nothing dropped — so each consumer applies its own semantic filtering on top of one faithful representation.
39
+ `cc-transcript` parses Claude Code's on-disk JSONL transcripts into a **typed superset event model** — every entry type preserved, nothing dropped — so you build on one faithful representation and apply your own semantic filtering on top.
37
40
 
38
- The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in the consumer, not buried in the parser.
41
+ The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in your code, not buried in the parser.
39
42
 
40
43
  ## Install
41
44
 
42
45
  ```bash
43
- uv add cc-transcript
46
+ uv add cc-transcript # or: pip install cc-transcript
44
47
  ```
45
48
 
46
49
  ## Quickstart
@@ -50,8 +53,7 @@ Discover the transcripts on disk, parse one, and look at the events:
50
53
  ```python
51
54
  from cc_transcript import TranscriptDiscovery, parse_events, AssistantEvent, UserEvent
52
55
 
53
- path, _mtime = TranscriptDiscovery.find_in(TranscriptDiscovery.find_transcripts()[0].parent)[0]
54
- events = parse_events(path)
56
+ events = parse_events(TranscriptDiscovery.find_transcripts()[0])
55
57
 
56
58
  for event in events:
57
59
  match event:
@@ -61,7 +63,8 @@ for event in events:
61
63
  print(f"assistant ({model}):", text[:80])
62
64
  ```
63
65
 
64
- Apply cc-sentiment's filtering rules to drop sidechains, synthetic turns, and junk:
66
+ `SENTIMENT_FILTER` is a ready-made filter that keeps only user and assistant turns,
67
+ dropping sidechains, synthetic turns, compacted summaries, empty events, and tool/command noise:
65
68
 
66
69
  ```python
67
70
  from cc_transcript import apply_filters, SENTIMENT_FILTER
@@ -69,12 +72,14 @@ from cc_transcript import apply_filters, SENTIMENT_FILTER
69
72
  clean = list(apply_filters(events, SENTIMENT_FILTER))
70
73
  ```
71
74
 
75
+ Build your own with `FilterConfig` — every rule is off by default, so a bare `FilterConfig()` passes everything through.
76
+
72
77
  ## What problems does this solve?
73
78
 
74
- - **One faithful parse, many consumers.** Every project that reads Claude Code transcripts re-implements the same JSONL quirks (str-or-list content, tool results nested two ways, envelope-less mode markers). This is that parser, written once and typed strictly.
79
+ - **One faithful parse.** Anything reading Claude Code transcripts re-implements the same JSONL quirks (str-or-list content, tool results nested two ways, envelope-less mode markers). This is that parser, written once and typed strictly.
75
80
  - **Non-lossy by design.** The event model is a superset: sidechains, `<synthetic>` turns, thinking blocks, and unrecognized entry types all survive parsing. You decide what to drop, via `FilterConfig`.
76
- - **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and consumers compose their own writes in the same transaction.
77
- - **Pluggable backends.** A pure-Python reference parser ships today; a Rust backend behind the same `Backend` protocol reaches parity in a later release.
81
+ - **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and you compose your own writes in the same transaction.
82
+ - **Pluggable backends.** A Rust backend (PyO3 + rayon) is the default fast path, with a pure-Python reference parser behind the same `Backend` protocol as the fallback — both at full event parity.
78
83
 
79
84
  ## Docs
80
85
 
@@ -3,16 +3,16 @@
3
3
  [![PyPI](https://img.shields.io/pypi/v/cc-transcript.svg)](https://pypi.org/project/cc-transcript/)
4
4
  [![Python](https://img.shields.io/pypi/pyversions/cc-transcript.svg)](https://pypi.org/project/cc-transcript/)
5
5
  [![Docs](https://img.shields.io/github/actions/workflow/status/yasyf/cc-transcript/docs.yml?branch=main&label=docs)](https://yasyf.github.io/cc-transcript/)
6
- [![License: PolyForm-Noncommercial-1.0.0](https://img.shields.io/badge/License-PolyForm-Noncommercial-1.0.0-blue.svg)](https://github.com/yasyf/cc-transcript/blob/main/LICENSE)
6
+ [![License: PolyForm Noncommercial](https://img.shields.io/badge/License-PolyForm--Noncommercial--1.0.0-blue.svg)](https://github.com/yasyf/cc-transcript/blob/main/LICENSE)
7
7
 
8
- The shared transcript-parsing core extracted from [cc-sentiment](https://github.com/yasyf/cc-sentiment), now powering cc-pushback, cc-sentiment, and captain-hook. It parses Claude Code's on-disk JSONL transcripts into a **typed superset event model** — every entry type preserved, nothing dropped — so each consumer applies its own semantic filtering on top of one faithful representation.
8
+ `cc-transcript` parses Claude Code's on-disk JSONL transcripts into a **typed superset event model** — every entry type preserved, nothing dropped — so you build on one faithful representation and apply your own semantic filtering on top.
9
9
 
10
- The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in the consumer, not buried in the parser.
10
+ The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in your code, not buried in the parser.
11
11
 
12
12
  ## Install
13
13
 
14
14
  ```bash
15
- uv add cc-transcript
15
+ uv add cc-transcript # or: pip install cc-transcript
16
16
  ```
17
17
 
18
18
  ## Quickstart
@@ -22,8 +22,7 @@ Discover the transcripts on disk, parse one, and look at the events:
22
22
  ```python
23
23
  from cc_transcript import TranscriptDiscovery, parse_events, AssistantEvent, UserEvent
24
24
 
25
- path, _mtime = TranscriptDiscovery.find_in(TranscriptDiscovery.find_transcripts()[0].parent)[0]
26
- events = parse_events(path)
25
+ events = parse_events(TranscriptDiscovery.find_transcripts()[0])
27
26
 
28
27
  for event in events:
29
28
  match event:
@@ -33,7 +32,8 @@ for event in events:
33
32
  print(f"assistant ({model}):", text[:80])
34
33
  ```
35
34
 
36
- Apply cc-sentiment's filtering rules to drop sidechains, synthetic turns, and junk:
35
+ `SENTIMENT_FILTER` is a ready-made filter that keeps only user and assistant turns,
36
+ dropping sidechains, synthetic turns, compacted summaries, empty events, and tool/command noise:
37
37
 
38
38
  ```python
39
39
  from cc_transcript import apply_filters, SENTIMENT_FILTER
@@ -41,12 +41,14 @@ from cc_transcript import apply_filters, SENTIMENT_FILTER
41
41
  clean = list(apply_filters(events, SENTIMENT_FILTER))
42
42
  ```
43
43
 
44
+ Build your own with `FilterConfig` — every rule is off by default, so a bare `FilterConfig()` passes everything through.
45
+
44
46
  ## What problems does this solve?
45
47
 
46
- - **One faithful parse, many consumers.** Every project that reads Claude Code transcripts re-implements the same JSONL quirks (str-or-list content, tool results nested two ways, envelope-less mode markers). This is that parser, written once and typed strictly.
48
+ - **One faithful parse.** Anything reading Claude Code transcripts re-implements the same JSONL quirks (str-or-list content, tool results nested two ways, envelope-less mode markers). This is that parser, written once and typed strictly.
47
49
  - **Non-lossy by design.** The event model is a superset: sidechains, `<synthetic>` turns, thinking blocks, and unrecognized entry types all survive parsing. You decide what to drop, via `FilterConfig`.
48
- - **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and consumers compose their own writes in the same transaction.
49
- - **Pluggable backends.** A pure-Python reference parser ships today; a Rust backend behind the same `Backend` protocol reaches parity in a later release.
50
+ - **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and you compose your own writes in the same transaction.
51
+ - **Pluggable backends.** A Rust backend (PyO3 + rayon) is the default fast path, with a pure-Python reference parser behind the same `Backend` protocol as the fallback — both at full event parity.
50
52
 
51
53
  ## Docs
52
54
 
@@ -0,0 +1,127 @@
1
+ """Typed events for Claude Code transcripts.
2
+
3
+ Discovery, a superset JSONL parser (Python + Rust), and ingestion-state tracking.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from cc_transcript.backend import Backend, ParsedTranscript
9
+ from cc_transcript.discovery import CLAUDE_PROJECTS_DIR, TranscriptDiscovery
10
+ from cc_transcript.filters import JUNK_USER_MESSAGE_RE, SENTIMENT_FILTER, FilterConfig, apply_filters
11
+ from cc_transcript.filterspec import (
12
+ FRUSTRATION_GROUPS,
13
+ INTERRUPT_MARKER_GROUPS,
14
+ INTERRUPT_MARKER_RE,
15
+ MILD_IMPATIENCE_GROUPS,
16
+ PUSHBACK_SPEC,
17
+ RESUME_PHRASE_SET,
18
+ SENTIMENT_JUNK_GROUPS,
19
+ SENTIMENT_SPEC,
20
+ SENTIMENT_STRUCTURAL_GROUPS,
21
+ SHORT_MESSAGE_MAX_WORDS,
22
+ STOP_HOOK_GROUPS,
23
+ STOP_HOOK_RE,
24
+ STRUCTURAL_NOISE_GROUPS,
25
+ STRUCTURAL_NOISE_RE,
26
+ TRIVIAL_ACK_SET,
27
+ Action,
28
+ Clause,
29
+ EntrypointIn,
30
+ FilterSpec,
31
+ KindIs,
32
+ MetaFlag,
33
+ ModelIs,
34
+ TextEmpty,
35
+ TextInSet,
36
+ TextMatchesAny,
37
+ WordCountAtMost,
38
+ annotate_spec,
39
+ apply_spec,
40
+ is_portable,
41
+ keep,
42
+ labels_for,
43
+ spec_to_json,
44
+ )
45
+ from cc_transcript.models import (
46
+ AssistantEvent,
47
+ CcVersion,
48
+ ContentBlock,
49
+ EntryMeta,
50
+ EntryUuid,
51
+ ModeEvent,
52
+ OtherEvent,
53
+ SessionId,
54
+ SystemEvent,
55
+ TextBlock,
56
+ ThinkingBlock,
57
+ ToolResultBlock,
58
+ ToolUseBlock,
59
+ ToolUseId,
60
+ TranscriptEvent,
61
+ UserEvent,
62
+ )
63
+ from cc_transcript.parser import TranscriptParser, parse_events, parse_events_from_bytes
64
+ from cc_transcript.store import FileStateStore
65
+
66
+ __all__ = [
67
+ "CLAUDE_PROJECTS_DIR",
68
+ "FRUSTRATION_GROUPS",
69
+ "INTERRUPT_MARKER_GROUPS",
70
+ "INTERRUPT_MARKER_RE",
71
+ "JUNK_USER_MESSAGE_RE",
72
+ "MILD_IMPATIENCE_GROUPS",
73
+ "PUSHBACK_SPEC",
74
+ "RESUME_PHRASE_SET",
75
+ "SENTIMENT_FILTER",
76
+ "SENTIMENT_JUNK_GROUPS",
77
+ "SENTIMENT_SPEC",
78
+ "SENTIMENT_STRUCTURAL_GROUPS",
79
+ "SHORT_MESSAGE_MAX_WORDS",
80
+ "STOP_HOOK_GROUPS",
81
+ "STOP_HOOK_RE",
82
+ "STRUCTURAL_NOISE_GROUPS",
83
+ "STRUCTURAL_NOISE_RE",
84
+ "TRIVIAL_ACK_SET",
85
+ "Action",
86
+ "AssistantEvent",
87
+ "Backend",
88
+ "CcVersion",
89
+ "Clause",
90
+ "ContentBlock",
91
+ "EntryMeta",
92
+ "EntryUuid",
93
+ "EntrypointIn",
94
+ "FileStateStore",
95
+ "FilterConfig",
96
+ "FilterSpec",
97
+ "KindIs",
98
+ "MetaFlag",
99
+ "ModeEvent",
100
+ "ModelIs",
101
+ "OtherEvent",
102
+ "ParsedTranscript",
103
+ "SessionId",
104
+ "SystemEvent",
105
+ "TextBlock",
106
+ "TextEmpty",
107
+ "TextInSet",
108
+ "TextMatchesAny",
109
+ "ThinkingBlock",
110
+ "ToolResultBlock",
111
+ "ToolUseBlock",
112
+ "ToolUseId",
113
+ "TranscriptDiscovery",
114
+ "TranscriptEvent",
115
+ "TranscriptParser",
116
+ "UserEvent",
117
+ "WordCountAtMost",
118
+ "annotate_spec",
119
+ "apply_filters",
120
+ "apply_spec",
121
+ "is_portable",
122
+ "keep",
123
+ "labels_for",
124
+ "parse_events",
125
+ "parse_events_from_bytes",
126
+ "spec_to_json",
127
+ ]
@@ -11,5 +11,11 @@ class ParseStream:
11
11
  def recv_many(self, max: int, /) -> list[tuple[str, float, list[TranscriptEvent]]]:
12
12
  """Blocks for at least one parsed file, then drains up to ``max``."""
13
13
 
14
- def stream_parse(paths: list[tuple[str, float]], prefetch: int, /) -> ParseStream:
15
- """Spawns a rayon pool parsing ``paths``, buffering ``prefetch`` results."""
14
+ def stream_parse(
15
+ paths: list[tuple[str, float]], prefetch: int, spec_json: str | None = ..., /
16
+ ) -> ParseStream:
17
+ """Spawns a rayon pool parsing ``paths``, buffering ``prefetch`` results.
18
+
19
+ When ``spec_json`` is the JSON of a portable filter spec, events failing it
20
+ are dropped during parsing, before any Python object is built.
21
+ """
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
7
7
  from collections.abc import AsyncIterator, Sequence
8
8
  from pathlib import Path
9
9
 
10
+ from cc_transcript.filterspec import FilterSpec
10
11
  from cc_transcript.models import TranscriptEvent
11
12
 
12
13
 
@@ -39,12 +40,16 @@ class Backend(Protocol):
39
40
  paths: Sequence[tuple[Path, float]],
40
41
  *,
41
42
  prefetch: int,
43
+ spec: FilterSpec | None = None,
42
44
  ) -> AsyncIterator[ParsedTranscript]:
43
45
  """Parses ``paths`` concurrently, yielding results as they complete.
44
46
 
45
47
  Args:
46
48
  paths: Pairs of ``(path, mtime)`` to parse.
47
49
  prefetch: The number of files to keep in flight at once.
50
+ spec: When given, events failing the spec are dropped during
51
+ parsing; portable specs run in the Rust backend, others fall
52
+ back to the Python interpreter.
48
53
 
49
54
  Yields:
50
55
  One :class:`ParsedTranscript` per input path.
@@ -27,6 +27,11 @@ class TranscriptDiscovery:
27
27
  except OSError:
28
28
  return None
29
29
 
30
+ @staticmethod
31
+ def transcript_mtime(path: Path) -> float:
32
+ """Returns ``path``'s modification time, raising if it cannot be read."""
33
+ return path.stat().st_mtime
34
+
30
35
  @staticmethod
31
36
  def find_in(
32
37
  directory: Path,
@@ -4,37 +4,47 @@ import re
4
4
  from dataclasses import dataclass, field
5
5
  from typing import TYPE_CHECKING
6
6
 
7
- from cc_transcript.models import AssistantEvent, ModeEvent, OtherEvent, SystemEvent, ToolUseBlock, UserEvent
7
+ from cc_transcript.filterspec import (
8
+ ASSISTANTS,
9
+ SENTIMENT_JUNK_GROUPS,
10
+ USERS,
11
+ Clause,
12
+ EntrypointIn,
13
+ FilterSpec,
14
+ KindIs,
15
+ MetaFlag,
16
+ ModelIs,
17
+ TextEmpty,
18
+ TextMatchesAny,
19
+ apply_spec,
20
+ compile_groups,
21
+ )
22
+ from cc_transcript.models import AssistantEvent, ModeEvent, OtherEvent, SystemEvent, UserEvent
8
23
 
9
24
  if TYPE_CHECKING:
10
25
  from collections.abc import Iterable, Iterator
11
26
 
27
+ from cc_transcript.filterspec import EventKind
12
28
  from cc_transcript.models import TranscriptEvent
13
29
 
14
- JUNK_USER_MESSAGE_RE = re.compile(
15
- r"<(?:system[_-](?:instruction|reminder)"
16
- r"|local-command-(?:stdout|caveat)"
17
- r"|command-(?:name|message|args)"
18
- r"|task-notification"
19
- r"|persisted-output"
20
- r"|output-file)\b"
21
- r"|Caveat: The messages below were generated by the user while running local commands\."
22
- r"|\[Request interrupted by user"
23
- r"|Stop hook feedback:"
24
- r"|REMAINING_TASKS_ACKNOWLEDGED"
25
- r"|<<[a-z][a-z0-9-]*>>"
26
- r"|^Base directory for this skill:"
27
- r"|(?:###\s+[\w][\w \-]{0,30}\s+){3,}###",
28
- re.IGNORECASE,
29
- )
30
+ JUNK_USER_MESSAGE_RE = compile_groups(SENTIMENT_JUNK_GROUPS, True)
31
+
32
+ KIND_BY_TYPE: dict[type[TranscriptEvent], EventKind] = {
33
+ UserEvent: "user",
34
+ AssistantEvent: "assistant",
35
+ SystemEvent: "system",
36
+ ModeEvent: "mode",
37
+ OtherEvent: "other",
38
+ }
30
39
 
31
40
 
32
41
  @dataclass(frozen=True, slots=True)
33
42
  class FilterConfig:
34
43
  """Opt-in, consumer-side filtering of a transcript event stream.
35
44
 
36
- Every flag defaults off, so a bare ``FilterConfig()`` passes events
37
- through untouched.
45
+ A back-compatible flag-bag that lowers to a :class:`~cc_transcript.FilterSpec`
46
+ via :meth:`to_spec`. Every flag defaults off, so a bare ``FilterConfig()``
47
+ passes events through untouched.
38
48
 
39
49
  Attributes:
40
50
  keep_types: When set, drop every event not an instance of one of these
@@ -57,6 +67,34 @@ class FilterConfig:
57
67
  drop_ephemeral_entrypoints: frozenset[str] = frozenset()
58
68
  junk_pattern: re.Pattern[str] | None = field(default=None)
59
69
 
70
+ def to_spec(self) -> FilterSpec:
71
+ """Lowers this flag-bag into an equivalent ordered :class:`FilterSpec`."""
72
+ return FilterSpec(clauses=tuple(self.clauses()))
73
+
74
+ def clauses(self) -> Iterator[Clause]:
75
+ if self.keep_types is not None:
76
+ yield Clause(KindIs(frozenset(KIND_BY_TYPE[kind] for kind in self.keep_types)), negate=True)
77
+ if self.drop_synthetic:
78
+ yield Clause(ModelIs(frozenset({"<synthetic>"})), applies_to=ASSISTANTS)
79
+ if self.drop_empty:
80
+ yield Clause(TextEmpty(consider_tool_use=True), applies_to=ASSISTANTS)
81
+ yield Clause(TextEmpty(consider_tool_use=False), applies_to=USERS)
82
+ if self.junk_pattern is not None:
83
+ yield Clause(
84
+ TextMatchesAny(
85
+ (("junk", self.junk_pattern.pattern),),
86
+ ignore_case=bool(self.junk_pattern.flags & re.IGNORECASE),
87
+ ),
88
+ applies_to=USERS,
89
+ )
90
+ if self.drop_sidechain:
91
+ yield Clause(MetaFlag("is_sidechain"))
92
+ if self.drop_compacted:
93
+ yield Clause(MetaFlag("is_compact_summary"))
94
+ yield Clause(MetaFlag("is_visible_in_transcript_only"))
95
+ if self.drop_ephemeral_entrypoints:
96
+ yield Clause(EntrypointIn(self.drop_ephemeral_entrypoints))
97
+
60
98
 
61
99
  SENTIMENT_FILTER = FilterConfig(
62
100
  keep_types=(UserEvent, AssistantEvent),
@@ -69,30 +107,6 @@ SENTIMENT_FILTER = FilterConfig(
69
107
  )
70
108
 
71
109
 
72
- def keep(event: TranscriptEvent, config: FilterConfig) -> bool:
73
- if config.keep_types is not None and not isinstance(event, config.keep_types):
74
- return False
75
- match event:
76
- case OtherEvent() | ModeEvent():
77
- return True
78
- case AssistantEvent() if config.drop_synthetic and event.model == "<synthetic>":
79
- return False
80
- case AssistantEvent() if config.drop_empty and not event.text.strip() and not any(
81
- isinstance(block, ToolUseBlock) for block in event.blocks
82
- ):
83
- return False
84
- case UserEvent() if config.drop_empty and not event.text.strip():
85
- return False
86
- case UserEvent() if config.junk_pattern is not None and config.junk_pattern.search(event.text):
87
- return False
88
- case UserEvent(meta=meta) | AssistantEvent(meta=meta) | SystemEvent(meta=meta):
89
- if config.drop_sidechain and meta.is_sidechain:
90
- return False
91
- if config.drop_compacted and (meta.is_compact_summary or meta.is_visible_in_transcript_only):
92
- return False
93
- return meta.entrypoint not in config.drop_ephemeral_entrypoints
94
-
95
-
96
110
  def apply_filters(events: Iterable[TranscriptEvent], config: FilterConfig) -> Iterator[TranscriptEvent]:
97
111
  """Yields the events that survive ``config``.
98
112
 
@@ -103,7 +117,7 @@ def apply_filters(events: Iterable[TranscriptEvent], config: FilterConfig) -> It
103
117
  Yields:
104
118
  The events for which every enabled rule holds.
105
119
  """
106
- return (event for event in events if keep(event, config))
120
+ return apply_spec(events, config.to_spec())
107
121
 
108
122
 
109
123
  __all__ = ["JUNK_USER_MESSAGE_RE", "SENTIMENT_FILTER", "FilterConfig", "apply_filters"]