cc-transcript 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/Cargo.lock +40 -1
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/PKG-INFO +16 -11
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/README.md +12 -10
- cc_transcript-0.3.0/cc_transcript/__init__.py +127 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/_parser_rs.pyi +8 -2
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/backend.py +5 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/discovery.py +5 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/filters.py +58 -44
- cc_transcript-0.3.0/cc_transcript/filterspec.py +538 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/parser.py +16 -2
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/rust.py +14 -3
- cc_transcript-0.3.0/cc_transcript/sentiment/__init__.py +62 -0
- cc_transcript-0.3.0/cc_transcript/sentiment/buckets.py +92 -0
- cc_transcript-0.3.0/cc_transcript/sentiment/engine.py +79 -0
- cc_transcript-0.3.0/cc_transcript/sentiment/lexicon.py +133 -0
- cc_transcript-0.3.0/cc_transcript/sentiment/messages.py +40 -0
- cc_transcript-0.3.0/cc_transcript/sentiment/scorefilters.py +149 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/pyproject.toml +15 -2
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/Cargo.toml +2 -1
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/src/event.rs +2 -19
- cc_transcript-0.3.0/rust/src/filter.rs +278 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/src/lib.rs +27 -7
- cc_transcript-0.3.0/rust/src/value.rs +44 -0
- cc_transcript-0.1.0/cc_transcript/__init__.py +0 -60
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/Cargo.toml +0 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/LICENSE +0 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/models.py +0 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/py.typed +0 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/cc_transcript/store.py +0 -0
- {cc_transcript-0.1.0 → cc_transcript-0.3.0}/rust/src/model.rs +0 -0
|
@@ -15,6 +15,15 @@ dependencies = [
|
|
|
15
15
|
"zerocopy",
|
|
16
16
|
]
|
|
17
17
|
|
|
18
|
+
[[package]]
|
|
19
|
+
name = "aho-corasick"
|
|
20
|
+
version = "1.1.4"
|
|
21
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
22
|
+
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
|
|
23
|
+
dependencies = [
|
|
24
|
+
"memchr",
|
|
25
|
+
]
|
|
26
|
+
|
|
18
27
|
[[package]]
|
|
19
28
|
name = "autocfg"
|
|
20
29
|
version = "1.5.1"
|
|
@@ -35,7 +44,7 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
|
|
|
35
44
|
|
|
36
45
|
[[package]]
|
|
37
46
|
name = "cc_transcript_parser"
|
|
38
|
-
version = "0.1.0"
|
|
47
|
+
version = "0.3.0"
|
|
39
48
|
dependencies = [
|
|
40
49
|
"chrono",
|
|
41
50
|
"crossbeam-channel",
|
|
@@ -44,6 +53,7 @@ dependencies = [
|
|
|
44
53
|
"once_cell",
|
|
45
54
|
"pyo3",
|
|
46
55
|
"rayon",
|
|
56
|
+
"regex",
|
|
47
57
|
"serde",
|
|
48
58
|
"sonic-rs",
|
|
49
59
|
]
|
|
@@ -424,6 +434,35 @@ dependencies = [
|
|
|
424
434
|
"syn",
|
|
425
435
|
]
|
|
426
436
|
|
|
437
|
+
[[package]]
|
|
438
|
+
name = "regex"
|
|
439
|
+
version = "1.12.3"
|
|
440
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
441
|
+
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
|
|
442
|
+
dependencies = [
|
|
443
|
+
"aho-corasick",
|
|
444
|
+
"memchr",
|
|
445
|
+
"regex-automata",
|
|
446
|
+
"regex-syntax",
|
|
447
|
+
]
|
|
448
|
+
|
|
449
|
+
[[package]]
|
|
450
|
+
name = "regex-automata"
|
|
451
|
+
version = "0.4.14"
|
|
452
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
453
|
+
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
|
|
454
|
+
dependencies = [
|
|
455
|
+
"aho-corasick",
|
|
456
|
+
"memchr",
|
|
457
|
+
"regex-syntax",
|
|
458
|
+
]
|
|
459
|
+
|
|
460
|
+
[[package]]
|
|
461
|
+
name = "regex-syntax"
|
|
462
|
+
version = "0.8.10"
|
|
463
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
464
|
+
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
|
465
|
+
|
|
427
466
|
[[package]]
|
|
428
467
|
name = "rend"
|
|
429
468
|
version = "0.5.3"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cc-transcript
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Classifier: Development Status :: 3 - Alpha
|
|
5
5
|
Classifier: Intended Audience :: Developers
|
|
6
6
|
Classifier: Operating System :: OS Independent
|
|
@@ -12,7 +12,10 @@ Requires-Dist: orjson>=3.10
|
|
|
12
12
|
Requires-Dist: pytest>=8.0 ; extra == 'dev'
|
|
13
13
|
Requires-Dist: pyright>=1.1 ; extra == 'dev'
|
|
14
14
|
Requires-Dist: ruff>=0.8 ; extra == 'dev'
|
|
15
|
+
Requires-Dist: spacy>=3.8 ; extra == 'lexicon'
|
|
16
|
+
Requires-Dist: afinn>=0.1 ; extra == 'lexicon'
|
|
15
17
|
Provides-Extra: dev
|
|
18
|
+
Provides-Extra: lexicon
|
|
16
19
|
License-File: LICENSE
|
|
17
20
|
Summary: Typed events for Claude Code transcripts: discovery, a superset JSONL parser (Python + Rust), and ingestion-state tracking.
|
|
18
21
|
Keywords:
|
|
@@ -31,16 +34,16 @@ Project-URL: Repository, https://github.com/yasyf/cc-transcript
|
|
|
31
34
|
[](https://pypi.org/project/cc-transcript/)
|
|
32
35
|
[](https://pypi.org/project/cc-transcript/)
|
|
33
36
|
[](https://yasyf.github.io/cc-transcript/)
|
|
34
|
-
[](https://github.com/yasyf/cc-transcript/blob/main/LICENSE)
|
|
35
38
|
|
|
36
|
-
|
|
39
|
+
`cc-transcript` parses Claude Code's on-disk JSONL transcripts into a **typed superset event model** — every entry type preserved, nothing dropped — so you build on one faithful representation and apply your own semantic filtering on top.
|
|
37
40
|
|
|
38
|
-
The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in
|
|
41
|
+
The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in your code, not buried in the parser.
|
|
39
42
|
|
|
40
43
|
## Install
|
|
41
44
|
|
|
42
45
|
```bash
|
|
43
|
-
uv add cc-transcript
|
|
46
|
+
uv add cc-transcript # or: pip install cc-transcript
|
|
44
47
|
```
|
|
45
48
|
|
|
46
49
|
## Quickstart
|
|
@@ -50,8 +53,7 @@ Discover the transcripts on disk, parse one, and look at the events:
|
|
|
50
53
|
```python
|
|
51
54
|
from cc_transcript import TranscriptDiscovery, parse_events, AssistantEvent, UserEvent
|
|
52
55
|
|
|
53
|
-
|
|
54
|
-
events = parse_events(path)
|
|
56
|
+
events = parse_events(TranscriptDiscovery.find_transcripts()[0])
|
|
55
57
|
|
|
56
58
|
for event in events:
|
|
57
59
|
match event:
|
|
@@ -61,7 +63,8 @@ for event in events:
|
|
|
61
63
|
print(f"assistant ({model}):", text[:80])
|
|
62
64
|
```
|
|
63
65
|
|
|
64
|
-
|
|
66
|
+
`SENTIMENT_FILTER` is a ready-made filter that keeps only user and assistant turns,
|
|
67
|
+
dropping sidechains, synthetic turns, compacted summaries, empty events, and tool/command noise:
|
|
65
68
|
|
|
66
69
|
```python
|
|
67
70
|
from cc_transcript import apply_filters, SENTIMENT_FILTER
|
|
@@ -69,12 +72,14 @@ from cc_transcript import apply_filters, SENTIMENT_FILTER
|
|
|
69
72
|
clean = list(apply_filters(events, SENTIMENT_FILTER))
|
|
70
73
|
```
|
|
71
74
|
|
|
75
|
+
Build your own with `FilterConfig` — every rule is off by default, so a bare `FilterConfig()` passes everything through.
|
|
76
|
+
|
|
72
77
|
## What problems does this solve?
|
|
73
78
|
|
|
74
|
-
- **One faithful parse
|
|
79
|
+
- **One faithful parse.** Anything reading Claude Code transcripts re-implements the same JSONL quirks (str-or-list content, tool results nested two ways, envelope-less mode markers). This is that parser, written once and typed strictly.
|
|
75
80
|
- **Non-lossy by design.** The event model is a superset: sidechains, `<synthetic>` turns, thinking blocks, and unrecognized entry types all survive parsing. You decide what to drop, via `FilterConfig`.
|
|
76
|
-
- **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and
|
|
77
|
-
- **Pluggable backends.** A
|
|
81
|
+
- **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and you compose your own writes in the same transaction.
|
|
82
|
+
- **Pluggable backends.** A Rust backend (PyO3 + rayon) is the default fast path, with a pure-Python reference parser behind the same `Backend` protocol as the fallback — both at full event parity.
|
|
78
83
|
|
|
79
84
|
## Docs
|
|
80
85
|
|
|
@@ -3,16 +3,16 @@
|
|
|
3
3
|
[](https://pypi.org/project/cc-transcript/)
|
|
4
4
|
[](https://pypi.org/project/cc-transcript/)
|
|
5
5
|
[](https://yasyf.github.io/cc-transcript/)
|
|
6
|
-
[](https://github.com/yasyf/cc-transcript/blob/main/LICENSE)
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
`cc-transcript` parses Claude Code's on-disk JSONL transcripts into a **typed superset event model** — every entry type preserved, nothing dropped — so you build on one faithful representation and apply your own semantic filtering on top.
|
|
9
9
|
|
|
10
|
-
The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in
|
|
10
|
+
The one property that makes it worth using: the parser is non-lossy. It never silently discards sidechains, synthetic turns, tool results, or unrecognized entry types; filtering is opt-in and lives in your code, not buried in the parser.
|
|
11
11
|
|
|
12
12
|
## Install
|
|
13
13
|
|
|
14
14
|
```bash
|
|
15
|
-
uv add cc-transcript
|
|
15
|
+
uv add cc-transcript # or: pip install cc-transcript
|
|
16
16
|
```
|
|
17
17
|
|
|
18
18
|
## Quickstart
|
|
@@ -22,8 +22,7 @@ Discover the transcripts on disk, parse one, and look at the events:
|
|
|
22
22
|
```python
|
|
23
23
|
from cc_transcript import TranscriptDiscovery, parse_events, AssistantEvent, UserEvent
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
events = parse_events(path)
|
|
25
|
+
events = parse_events(TranscriptDiscovery.find_transcripts()[0])
|
|
27
26
|
|
|
28
27
|
for event in events:
|
|
29
28
|
match event:
|
|
@@ -33,7 +32,8 @@ for event in events:
|
|
|
33
32
|
print(f"assistant ({model}):", text[:80])
|
|
34
33
|
```
|
|
35
34
|
|
|
36
|
-
|
|
35
|
+
`SENTIMENT_FILTER` is a ready-made filter that keeps only user and assistant turns,
|
|
36
|
+
dropping sidechains, synthetic turns, compacted summaries, empty events, and tool/command noise:
|
|
37
37
|
|
|
38
38
|
```python
|
|
39
39
|
from cc_transcript import apply_filters, SENTIMENT_FILTER
|
|
@@ -41,12 +41,14 @@ from cc_transcript import apply_filters, SENTIMENT_FILTER
|
|
|
41
41
|
clean = list(apply_filters(events, SENTIMENT_FILTER))
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
+
Build your own with `FilterConfig` — every rule is off by default, so a bare `FilterConfig()` passes everything through.
|
|
45
|
+
|
|
44
46
|
## What problems does this solve?
|
|
45
47
|
|
|
46
|
-
- **One faithful parse
|
|
48
|
+
- **One faithful parse.** Anything reading Claude Code transcripts re-implements the same JSONL quirks (str-or-list content, tool results nested two ways, envelope-less mode markers). This is that parser, written once and typed strictly.
|
|
47
49
|
- **Non-lossy by design.** The event model is a superset: sidechains, `<synthetic>` turns, thinking blocks, and unrecognized entry types all survive parsing. You decide what to drop, via `FilterConfig`.
|
|
48
|
-
- **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and
|
|
49
|
-
- **Pluggable backends.** A
|
|
50
|
+
- **Incremental ingestion.** `FileStateStore` tracks per-file mtimes in SQLite (WAL, thread-safe) so re-runs only reparse changed files, and you compose your own writes in the same transaction.
|
|
51
|
+
- **Pluggable backends.** A Rust backend (PyO3 + rayon) is the default fast path, with a pure-Python reference parser behind the same `Backend` protocol as the fallback — both at full event parity.
|
|
50
52
|
|
|
51
53
|
## Docs
|
|
52
54
|
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Typed events for Claude Code transcripts.
|
|
2
|
+
|
|
3
|
+
Discovery, a superset JSONL parser (Python + Rust), and ingestion-state tracking.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from cc_transcript.backend import Backend, ParsedTranscript
|
|
9
|
+
from cc_transcript.discovery import CLAUDE_PROJECTS_DIR, TranscriptDiscovery
|
|
10
|
+
from cc_transcript.filters import JUNK_USER_MESSAGE_RE, SENTIMENT_FILTER, FilterConfig, apply_filters
|
|
11
|
+
from cc_transcript.filterspec import (
|
|
12
|
+
FRUSTRATION_GROUPS,
|
|
13
|
+
INTERRUPT_MARKER_GROUPS,
|
|
14
|
+
INTERRUPT_MARKER_RE,
|
|
15
|
+
MILD_IMPATIENCE_GROUPS,
|
|
16
|
+
PUSHBACK_SPEC,
|
|
17
|
+
RESUME_PHRASE_SET,
|
|
18
|
+
SENTIMENT_JUNK_GROUPS,
|
|
19
|
+
SENTIMENT_SPEC,
|
|
20
|
+
SENTIMENT_STRUCTURAL_GROUPS,
|
|
21
|
+
SHORT_MESSAGE_MAX_WORDS,
|
|
22
|
+
STOP_HOOK_GROUPS,
|
|
23
|
+
STOP_HOOK_RE,
|
|
24
|
+
STRUCTURAL_NOISE_GROUPS,
|
|
25
|
+
STRUCTURAL_NOISE_RE,
|
|
26
|
+
TRIVIAL_ACK_SET,
|
|
27
|
+
Action,
|
|
28
|
+
Clause,
|
|
29
|
+
EntrypointIn,
|
|
30
|
+
FilterSpec,
|
|
31
|
+
KindIs,
|
|
32
|
+
MetaFlag,
|
|
33
|
+
ModelIs,
|
|
34
|
+
TextEmpty,
|
|
35
|
+
TextInSet,
|
|
36
|
+
TextMatchesAny,
|
|
37
|
+
WordCountAtMost,
|
|
38
|
+
annotate_spec,
|
|
39
|
+
apply_spec,
|
|
40
|
+
is_portable,
|
|
41
|
+
keep,
|
|
42
|
+
labels_for,
|
|
43
|
+
spec_to_json,
|
|
44
|
+
)
|
|
45
|
+
from cc_transcript.models import (
|
|
46
|
+
AssistantEvent,
|
|
47
|
+
CcVersion,
|
|
48
|
+
ContentBlock,
|
|
49
|
+
EntryMeta,
|
|
50
|
+
EntryUuid,
|
|
51
|
+
ModeEvent,
|
|
52
|
+
OtherEvent,
|
|
53
|
+
SessionId,
|
|
54
|
+
SystemEvent,
|
|
55
|
+
TextBlock,
|
|
56
|
+
ThinkingBlock,
|
|
57
|
+
ToolResultBlock,
|
|
58
|
+
ToolUseBlock,
|
|
59
|
+
ToolUseId,
|
|
60
|
+
TranscriptEvent,
|
|
61
|
+
UserEvent,
|
|
62
|
+
)
|
|
63
|
+
from cc_transcript.parser import TranscriptParser, parse_events, parse_events_from_bytes
|
|
64
|
+
from cc_transcript.store import FileStateStore
|
|
65
|
+
|
|
66
|
+
__all__ = [
|
|
67
|
+
"CLAUDE_PROJECTS_DIR",
|
|
68
|
+
"FRUSTRATION_GROUPS",
|
|
69
|
+
"INTERRUPT_MARKER_GROUPS",
|
|
70
|
+
"INTERRUPT_MARKER_RE",
|
|
71
|
+
"JUNK_USER_MESSAGE_RE",
|
|
72
|
+
"MILD_IMPATIENCE_GROUPS",
|
|
73
|
+
"PUSHBACK_SPEC",
|
|
74
|
+
"RESUME_PHRASE_SET",
|
|
75
|
+
"SENTIMENT_FILTER",
|
|
76
|
+
"SENTIMENT_JUNK_GROUPS",
|
|
77
|
+
"SENTIMENT_SPEC",
|
|
78
|
+
"SENTIMENT_STRUCTURAL_GROUPS",
|
|
79
|
+
"SHORT_MESSAGE_MAX_WORDS",
|
|
80
|
+
"STOP_HOOK_GROUPS",
|
|
81
|
+
"STOP_HOOK_RE",
|
|
82
|
+
"STRUCTURAL_NOISE_GROUPS",
|
|
83
|
+
"STRUCTURAL_NOISE_RE",
|
|
84
|
+
"TRIVIAL_ACK_SET",
|
|
85
|
+
"Action",
|
|
86
|
+
"AssistantEvent",
|
|
87
|
+
"Backend",
|
|
88
|
+
"CcVersion",
|
|
89
|
+
"Clause",
|
|
90
|
+
"ContentBlock",
|
|
91
|
+
"EntryMeta",
|
|
92
|
+
"EntryUuid",
|
|
93
|
+
"EntrypointIn",
|
|
94
|
+
"FileStateStore",
|
|
95
|
+
"FilterConfig",
|
|
96
|
+
"FilterSpec",
|
|
97
|
+
"KindIs",
|
|
98
|
+
"MetaFlag",
|
|
99
|
+
"ModeEvent",
|
|
100
|
+
"ModelIs",
|
|
101
|
+
"OtherEvent",
|
|
102
|
+
"ParsedTranscript",
|
|
103
|
+
"SessionId",
|
|
104
|
+
"SystemEvent",
|
|
105
|
+
"TextBlock",
|
|
106
|
+
"TextEmpty",
|
|
107
|
+
"TextInSet",
|
|
108
|
+
"TextMatchesAny",
|
|
109
|
+
"ThinkingBlock",
|
|
110
|
+
"ToolResultBlock",
|
|
111
|
+
"ToolUseBlock",
|
|
112
|
+
"ToolUseId",
|
|
113
|
+
"TranscriptDiscovery",
|
|
114
|
+
"TranscriptEvent",
|
|
115
|
+
"TranscriptParser",
|
|
116
|
+
"UserEvent",
|
|
117
|
+
"WordCountAtMost",
|
|
118
|
+
"annotate_spec",
|
|
119
|
+
"apply_filters",
|
|
120
|
+
"apply_spec",
|
|
121
|
+
"is_portable",
|
|
122
|
+
"keep",
|
|
123
|
+
"labels_for",
|
|
124
|
+
"parse_events",
|
|
125
|
+
"parse_events_from_bytes",
|
|
126
|
+
"spec_to_json",
|
|
127
|
+
]
|
|
@@ -11,5 +11,11 @@ class ParseStream:
|
|
|
11
11
|
def recv_many(self, max: int, /) -> list[tuple[str, float, list[TranscriptEvent]]]:
|
|
12
12
|
"""Blocks for at least one parsed file, then drains up to ``max``."""
|
|
13
13
|
|
|
14
|
-
def stream_parse(
|
|
15
|
-
|
|
14
|
+
def stream_parse(
|
|
15
|
+
paths: list[tuple[str, float]], prefetch: int, spec_json: str | None = ..., /
|
|
16
|
+
) -> ParseStream:
|
|
17
|
+
"""Spawns a rayon pool parsing ``paths``, buffering ``prefetch`` results.
|
|
18
|
+
|
|
19
|
+
When ``spec_json`` is the JSON of a portable filter spec, events failing it
|
|
20
|
+
are dropped during parsing, before any Python object is built.
|
|
21
|
+
"""
|
|
@@ -7,6 +7,7 @@ if TYPE_CHECKING:
|
|
|
7
7
|
from collections.abc import AsyncIterator, Sequence
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
|
|
10
|
+
from cc_transcript.filterspec import FilterSpec
|
|
10
11
|
from cc_transcript.models import TranscriptEvent
|
|
11
12
|
|
|
12
13
|
|
|
@@ -39,12 +40,16 @@ class Backend(Protocol):
|
|
|
39
40
|
paths: Sequence[tuple[Path, float]],
|
|
40
41
|
*,
|
|
41
42
|
prefetch: int,
|
|
43
|
+
spec: FilterSpec | None = None,
|
|
42
44
|
) -> AsyncIterator[ParsedTranscript]:
|
|
43
45
|
"""Parses ``paths`` concurrently, yielding results as they complete.
|
|
44
46
|
|
|
45
47
|
Args:
|
|
46
48
|
paths: Pairs of ``(path, mtime)`` to parse.
|
|
47
49
|
prefetch: The number of files to keep in flight at once.
|
|
50
|
+
spec: When given, events failing the spec are dropped during
|
|
51
|
+
parsing; portable specs run in the Rust backend, others fall
|
|
52
|
+
back to the Python interpreter.
|
|
48
53
|
|
|
49
54
|
Yields:
|
|
50
55
|
One :class:`ParsedTranscript` per input path.
|
|
@@ -27,6 +27,11 @@ class TranscriptDiscovery:
|
|
|
27
27
|
except OSError:
|
|
28
28
|
return None
|
|
29
29
|
|
|
30
|
+
@staticmethod
|
|
31
|
+
def transcript_mtime(path: Path) -> float:
|
|
32
|
+
"""Returns ``path``'s modification time, raising if it cannot be read."""
|
|
33
|
+
return path.stat().st_mtime
|
|
34
|
+
|
|
30
35
|
@staticmethod
|
|
31
36
|
def find_in(
|
|
32
37
|
directory: Path,
|
|
@@ -4,37 +4,47 @@ import re
|
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
5
|
from typing import TYPE_CHECKING
|
|
6
6
|
|
|
7
|
-
from cc_transcript.
|
|
7
|
+
from cc_transcript.filterspec import (
|
|
8
|
+
ASSISTANTS,
|
|
9
|
+
SENTIMENT_JUNK_GROUPS,
|
|
10
|
+
USERS,
|
|
11
|
+
Clause,
|
|
12
|
+
EntrypointIn,
|
|
13
|
+
FilterSpec,
|
|
14
|
+
KindIs,
|
|
15
|
+
MetaFlag,
|
|
16
|
+
ModelIs,
|
|
17
|
+
TextEmpty,
|
|
18
|
+
TextMatchesAny,
|
|
19
|
+
apply_spec,
|
|
20
|
+
compile_groups,
|
|
21
|
+
)
|
|
22
|
+
from cc_transcript.models import AssistantEvent, ModeEvent, OtherEvent, SystemEvent, UserEvent
|
|
8
23
|
|
|
9
24
|
if TYPE_CHECKING:
|
|
10
25
|
from collections.abc import Iterable, Iterator
|
|
11
26
|
|
|
27
|
+
from cc_transcript.filterspec import EventKind
|
|
12
28
|
from cc_transcript.models import TranscriptEvent
|
|
13
29
|
|
|
14
|
-
JUNK_USER_MESSAGE_RE =
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
r"|Stop hook feedback:"
|
|
24
|
-
r"|REMAINING_TASKS_ACKNOWLEDGED"
|
|
25
|
-
r"|<<[a-z][a-z0-9-]*>>"
|
|
26
|
-
r"|^Base directory for this skill:"
|
|
27
|
-
r"|(?:###\s+[\w][\w \-]{0,30}\s+){3,}###",
|
|
28
|
-
re.IGNORECASE,
|
|
29
|
-
)
|
|
30
|
+
JUNK_USER_MESSAGE_RE = compile_groups(SENTIMENT_JUNK_GROUPS, True)
|
|
31
|
+
|
|
32
|
+
KIND_BY_TYPE: dict[type[TranscriptEvent], EventKind] = {
|
|
33
|
+
UserEvent: "user",
|
|
34
|
+
AssistantEvent: "assistant",
|
|
35
|
+
SystemEvent: "system",
|
|
36
|
+
ModeEvent: "mode",
|
|
37
|
+
OtherEvent: "other",
|
|
38
|
+
}
|
|
30
39
|
|
|
31
40
|
|
|
32
41
|
@dataclass(frozen=True, slots=True)
|
|
33
42
|
class FilterConfig:
|
|
34
43
|
"""Opt-in, consumer-side filtering of a transcript event stream.
|
|
35
44
|
|
|
36
|
-
|
|
37
|
-
|
|
45
|
+
A back-compatible flag-bag that lowers to a :class:`~cc_transcript.FilterSpec`
|
|
46
|
+
via :meth:`to_spec`. Every flag defaults off, so a bare ``FilterConfig()``
|
|
47
|
+
passes events through untouched.
|
|
38
48
|
|
|
39
49
|
Attributes:
|
|
40
50
|
keep_types: When set, drop every event not an instance of one of these
|
|
@@ -57,6 +67,34 @@ class FilterConfig:
|
|
|
57
67
|
drop_ephemeral_entrypoints: frozenset[str] = frozenset()
|
|
58
68
|
junk_pattern: re.Pattern[str] | None = field(default=None)
|
|
59
69
|
|
|
70
|
+
def to_spec(self) -> FilterSpec:
|
|
71
|
+
"""Lowers this flag-bag into an equivalent ordered :class:`FilterSpec`."""
|
|
72
|
+
return FilterSpec(clauses=tuple(self.clauses()))
|
|
73
|
+
|
|
74
|
+
def clauses(self) -> Iterator[Clause]:
|
|
75
|
+
if self.keep_types is not None:
|
|
76
|
+
yield Clause(KindIs(frozenset(KIND_BY_TYPE[kind] for kind in self.keep_types)), negate=True)
|
|
77
|
+
if self.drop_synthetic:
|
|
78
|
+
yield Clause(ModelIs(frozenset({"<synthetic>"})), applies_to=ASSISTANTS)
|
|
79
|
+
if self.drop_empty:
|
|
80
|
+
yield Clause(TextEmpty(consider_tool_use=True), applies_to=ASSISTANTS)
|
|
81
|
+
yield Clause(TextEmpty(consider_tool_use=False), applies_to=USERS)
|
|
82
|
+
if self.junk_pattern is not None:
|
|
83
|
+
yield Clause(
|
|
84
|
+
TextMatchesAny(
|
|
85
|
+
(("junk", self.junk_pattern.pattern),),
|
|
86
|
+
ignore_case=bool(self.junk_pattern.flags & re.IGNORECASE),
|
|
87
|
+
),
|
|
88
|
+
applies_to=USERS,
|
|
89
|
+
)
|
|
90
|
+
if self.drop_sidechain:
|
|
91
|
+
yield Clause(MetaFlag("is_sidechain"))
|
|
92
|
+
if self.drop_compacted:
|
|
93
|
+
yield Clause(MetaFlag("is_compact_summary"))
|
|
94
|
+
yield Clause(MetaFlag("is_visible_in_transcript_only"))
|
|
95
|
+
if self.drop_ephemeral_entrypoints:
|
|
96
|
+
yield Clause(EntrypointIn(self.drop_ephemeral_entrypoints))
|
|
97
|
+
|
|
60
98
|
|
|
61
99
|
SENTIMENT_FILTER = FilterConfig(
|
|
62
100
|
keep_types=(UserEvent, AssistantEvent),
|
|
@@ -69,30 +107,6 @@ SENTIMENT_FILTER = FilterConfig(
|
|
|
69
107
|
)
|
|
70
108
|
|
|
71
109
|
|
|
72
|
-
def keep(event: TranscriptEvent, config: FilterConfig) -> bool:
|
|
73
|
-
if config.keep_types is not None and not isinstance(event, config.keep_types):
|
|
74
|
-
return False
|
|
75
|
-
match event:
|
|
76
|
-
case OtherEvent() | ModeEvent():
|
|
77
|
-
return True
|
|
78
|
-
case AssistantEvent() if config.drop_synthetic and event.model == "<synthetic>":
|
|
79
|
-
return False
|
|
80
|
-
case AssistantEvent() if config.drop_empty and not event.text.strip() and not any(
|
|
81
|
-
isinstance(block, ToolUseBlock) for block in event.blocks
|
|
82
|
-
):
|
|
83
|
-
return False
|
|
84
|
-
case UserEvent() if config.drop_empty and not event.text.strip():
|
|
85
|
-
return False
|
|
86
|
-
case UserEvent() if config.junk_pattern is not None and config.junk_pattern.search(event.text):
|
|
87
|
-
return False
|
|
88
|
-
case UserEvent(meta=meta) | AssistantEvent(meta=meta) | SystemEvent(meta=meta):
|
|
89
|
-
if config.drop_sidechain and meta.is_sidechain:
|
|
90
|
-
return False
|
|
91
|
-
if config.drop_compacted and (meta.is_compact_summary or meta.is_visible_in_transcript_only):
|
|
92
|
-
return False
|
|
93
|
-
return meta.entrypoint not in config.drop_ephemeral_entrypoints
|
|
94
|
-
|
|
95
|
-
|
|
96
110
|
def apply_filters(events: Iterable[TranscriptEvent], config: FilterConfig) -> Iterator[TranscriptEvent]:
|
|
97
111
|
"""Yields the events that survive ``config``.
|
|
98
112
|
|
|
@@ -103,7 +117,7 @@ def apply_filters(events: Iterable[TranscriptEvent], config: FilterConfig) -> It
|
|
|
103
117
|
Yields:
|
|
104
118
|
The events for which every enabled rule holds.
|
|
105
119
|
"""
|
|
106
|
-
return (
|
|
120
|
+
return apply_spec(events, config.to_spec())
|
|
107
121
|
|
|
108
122
|
|
|
109
123
|
__all__ = ["JUNK_USER_MESSAGE_RE", "SENTIMENT_FILTER", "FilterConfig", "apply_filters"]
|