cc-transcript 0.6.0__tar.gz → 0.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/Cargo.lock +1 -1
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/PKG-INFO +5 -3
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/__init__.py +8 -1
- cc_transcript-0.7.1/cc_transcript/domains/__init__.py +9 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/__init__.py +79 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/candidates.py +70 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/confidence.py +74 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/context.py +188 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/formats.py +64 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/markers.py +11 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/nav.py +100 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/signals.py +247 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/sourcekind.py +17 -0
- cc_transcript-0.7.1/cc_transcript/domains/mining/store.py +206 -0
- {cc_transcript-0.6.0/cc_transcript → cc_transcript-0.7.1/cc_transcript/domains}/sentiment/__init__.py +8 -8
- {cc_transcript-0.6.0/cc_transcript → cc_transcript-0.7.1/cc_transcript/domains}/sentiment/buckets.py +1 -1
- {cc_transcript-0.6.0/cc_transcript → cc_transcript-0.7.1/cc_transcript/domains}/sentiment/engine.py +3 -3
- {cc_transcript-0.6.0/cc_transcript → cc_transcript-0.7.1/cc_transcript/domains}/sentiment/lexicon.py +2 -2
- {cc_transcript-0.6.0/cc_transcript → cc_transcript-0.7.1/cc_transcript/domains}/sentiment/scorespec.py +2 -2
- {cc_transcript-0.6.0/cc_transcript/sentiment → cc_transcript-0.7.1/cc_transcript}/messages.py +2 -2
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/models.py +3 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/parser.py +15 -6
- cc_transcript-0.7.1/cc_transcript/sentiment/__init__.py +39 -0
- cc_transcript-0.7.1/cc_transcript/sentiment/buckets.py +17 -0
- cc_transcript-0.7.1/cc_transcript/sentiment/lexicon.py +7 -0
- cc_transcript-0.7.1/cc_transcript/sentiment/messages.py +13 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/pyproject.toml +4 -2
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/Cargo.toml +1 -1
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/event.rs +6 -1
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/Cargo.toml +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/LICENSE +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/README.md +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/_parser_rs.pyi +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/backend.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/builders.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/discovery.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/filters.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/filterspec.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/py.typed +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/rust.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/cc_transcript/store.py +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/data/afinn-en-165.tsv +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/data/domain_overrides.tsv +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/filter.rs +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/lexicon.rs +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/lib.rs +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/model.rs +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/score.rs +0 -0
- {cc_transcript-0.6.0 → cc_transcript-0.7.1}/rust/src/value.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cc-transcript
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.1
|
|
4
4
|
Classifier: Development Status :: 3 - Alpha
|
|
5
5
|
Classifier: Intended Audience :: Developers
|
|
6
6
|
Classifier: Operating System :: OS Independent
|
|
@@ -13,10 +13,12 @@ Requires-Dist: orjson>=3.10
|
|
|
13
13
|
Requires-Dist: pytest>=8.0 ; extra == 'dev'
|
|
14
14
|
Requires-Dist: ty>=0.0.44 ; extra == 'dev'
|
|
15
15
|
Requires-Dist: ruff>=0.8 ; extra == 'dev'
|
|
16
|
-
Requires-Dist:
|
|
17
|
-
Requires-Dist:
|
|
16
|
+
Requires-Dist: cc-transcript[sentiment] ; extra == 'lexicon'
|
|
17
|
+
Requires-Dist: spacy>=3.8 ; extra == 'sentiment'
|
|
18
|
+
Requires-Dist: afinn>=0.1 ; extra == 'sentiment'
|
|
18
19
|
Provides-Extra: dev
|
|
19
20
|
Provides-Extra: lexicon
|
|
21
|
+
Provides-Extra: sentiment
|
|
20
22
|
License-File: LICENSE
|
|
21
23
|
Summary: Typed events for Claude Code transcripts: discovery, a superset JSONL parser (Python + Rust), and ingestion-state tracking.
|
|
22
24
|
Keywords:
|
|
@@ -38,6 +38,13 @@ from cc_transcript.filterspec import (
|
|
|
38
38
|
keep,
|
|
39
39
|
labels_for,
|
|
40
40
|
)
|
|
41
|
+
from cc_transcript.messages import (
|
|
42
|
+
AssistantMessage,
|
|
43
|
+
BaseMessage,
|
|
44
|
+
ToolCall,
|
|
45
|
+
TranscriptMessage,
|
|
46
|
+
UserMessage,
|
|
47
|
+
)
|
|
41
48
|
from cc_transcript.models import (
|
|
42
49
|
AssistantEvent,
|
|
43
50
|
CcVersion,
|
|
@@ -56,5 +63,5 @@ from cc_transcript.models import (
|
|
|
56
63
|
TranscriptEvent,
|
|
57
64
|
UserEvent,
|
|
58
65
|
)
|
|
59
|
-
from cc_transcript.parser import TranscriptParser, parse_events_async, parse_events_from_bytes
|
|
66
|
+
from cc_transcript.parser import TranscriptParser, parse_event, parse_events_async, parse_events_from_bytes
|
|
60
67
|
from cc_transcript.store import FileStateStore
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Built-in domains layered on the cc-transcript core.
|
|
2
|
+
|
|
3
|
+
Each domain builds on the core transcript model and depends only on core — never on
|
|
4
|
+
another domain, and never the reverse. Heavy dependencies sit behind a per-domain
|
|
5
|
+
extra. Today: :mod:`cc_transcript.domains.sentiment` (scoring) and
|
|
6
|
+
:mod:`cc_transcript.domains.mining` (correction/feedback extraction).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Re-exports establish the domain's public surface; pyright sees them as unused.
|
|
2
|
+
# pyright: reportUnusedImport=false
|
|
3
|
+
"""The correction/feedback mining mechanism.
|
|
4
|
+
|
|
5
|
+
Neutral fact-detectors over Claude Code transcripts: each iterator recognizes a
|
|
6
|
+
transcript shape and yields a :class:`MiningSignal` — a neutral fact carrying a
|
|
7
|
+
candidate trigger, confidence, and evidence, but no policy. Apps map signals to
|
|
8
|
+
their own candidate records with policy injected (their filter spec, their
|
|
9
|
+
disqualification rules, their review formats), and persist them through
|
|
10
|
+
:class:`FeedbackStore`.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from cc_transcript.domains.mining.candidates import DedupKey, FeedbackCandidate, dedup_key
|
|
16
|
+
from cc_transcript.domains.mining.confidence import (
|
|
17
|
+
HIGH,
|
|
18
|
+
LOW,
|
|
19
|
+
MEDIUM,
|
|
20
|
+
NOISE_FLOOR,
|
|
21
|
+
NONE,
|
|
22
|
+
VERY_HIGH,
|
|
23
|
+
CandidateSignal,
|
|
24
|
+
Confidence,
|
|
25
|
+
effective_confidence,
|
|
26
|
+
firm,
|
|
27
|
+
noise,
|
|
28
|
+
strong,
|
|
29
|
+
weak,
|
|
30
|
+
)
|
|
31
|
+
from cc_transcript.domains.mining.context import (
|
|
32
|
+
TOOL_INPUT_LIMIT,
|
|
33
|
+
ContextSnapshot,
|
|
34
|
+
ContextTurn,
|
|
35
|
+
build_snapshot,
|
|
36
|
+
summarize_tool_input,
|
|
37
|
+
trigger_for,
|
|
38
|
+
turn_for,
|
|
39
|
+
)
|
|
40
|
+
from cc_transcript.domains.mining.formats import ReviewComment, ReviewFormat, extract_all
|
|
41
|
+
from cc_transcript.domains.mining.markers import (
|
|
42
|
+
DENIAL_PREFIX,
|
|
43
|
+
EDIT_TOOLS,
|
|
44
|
+
INTERRUPT_MARKER_RE,
|
|
45
|
+
REENTRY_LOOKBACK,
|
|
46
|
+
USER_SAID_MARKER,
|
|
47
|
+
USER_SAID_TRAILER,
|
|
48
|
+
)
|
|
49
|
+
from cc_transcript.domains.mining.nav import (
|
|
50
|
+
denial_results,
|
|
51
|
+
denied_tool_payload,
|
|
52
|
+
embedded_user_text,
|
|
53
|
+
interrupt_marker,
|
|
54
|
+
is_bare_interrupt_marker,
|
|
55
|
+
last_edit_index,
|
|
56
|
+
marker_in,
|
|
57
|
+
next_user_message,
|
|
58
|
+
tool_uses,
|
|
59
|
+
)
|
|
60
|
+
from cc_transcript.domains.mining.signals import (
|
|
61
|
+
DEFAULT_DETECTORS,
|
|
62
|
+
MiningSignal,
|
|
63
|
+
correction_text,
|
|
64
|
+
iter_interrupt_marker_signals,
|
|
65
|
+
iter_plan_reentry_signals,
|
|
66
|
+
iter_plan_rejection_signals,
|
|
67
|
+
iter_review_comment_signals,
|
|
68
|
+
iter_tool_denial_signals,
|
|
69
|
+
iter_user_message_signals,
|
|
70
|
+
nearest_assistant_index,
|
|
71
|
+
)
|
|
72
|
+
from cc_transcript.domains.mining.sourcekind import (
|
|
73
|
+
INTERRUPT_REJECTION,
|
|
74
|
+
PLAN_REVIEW,
|
|
75
|
+
REVIEW_COMMENT,
|
|
76
|
+
TRANSCRIPT_MESSAGE,
|
|
77
|
+
SourceKind,
|
|
78
|
+
)
|
|
79
|
+
from cc_transcript.domains.mining.store import FEEDBACK_DDL, FeedbackStore, Stats, event_row
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""The feedback candidate model and the dedup key that makes ingestion idempotent."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import TYPE_CHECKING, NewType
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Mapping
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from cc_transcript.models import SessionId
|
|
16
|
+
|
|
17
|
+
from cc_transcript.domains.mining.confidence import CandidateSignal
|
|
18
|
+
from cc_transcript.domains.mining.context import ContextSnapshot
|
|
19
|
+
from cc_transcript.domains.mining.sourcekind import SourceKind
|
|
20
|
+
|
|
21
|
+
DedupKey = NewType("DedupKey", str)
|
|
22
|
+
"""A content-derived SHA-256 key that makes feedback ingestion idempotent."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True, slots=True)
|
|
26
|
+
class FeedbackCandidate:
|
|
27
|
+
"""A single piece of developer pushback extracted from a transcript.
|
|
28
|
+
|
|
29
|
+
Attributes:
|
|
30
|
+
dedup_key: The content-derived key that makes ingestion idempotent.
|
|
31
|
+
source_kind: Which detector produced the candidate.
|
|
32
|
+
occurred_at: When the feedback was given.
|
|
33
|
+
text: The verbatim pushback text.
|
|
34
|
+
context: The conversational window around the feedback.
|
|
35
|
+
session_id: The transcript session the feedback came from.
|
|
36
|
+
origin_path: The file the candidate was extracted from.
|
|
37
|
+
origin_uuid: The originating transcript entry's uuid.
|
|
38
|
+
cc_version: The Claude Code version recorded for the origin.
|
|
39
|
+
payload: Detector-specific metadata preserved verbatim.
|
|
40
|
+
signal: The de-noising confidence signal, when computed.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
dedup_key: DedupKey
|
|
44
|
+
source_kind: SourceKind
|
|
45
|
+
occurred_at: datetime
|
|
46
|
+
text: str
|
|
47
|
+
context: ContextSnapshot
|
|
48
|
+
session_id: SessionId | None = None
|
|
49
|
+
origin_path: Path | None = None
|
|
50
|
+
origin_uuid: str | None = None
|
|
51
|
+
cc_version: str | None = None
|
|
52
|
+
payload: Mapping[str, Any] | None = None
|
|
53
|
+
signal: CandidateSignal | None = None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def dedup_key(*parts: str) -> DedupKey:
|
|
57
|
+
"""Returns the stable dedup key for ``parts``.
|
|
58
|
+
|
|
59
|
+
Detectors key on session, kind, and the feedback content (plus its code
|
|
60
|
+
location for review comments) rather than the transcript entry's uuid or the
|
|
61
|
+
absolute file path, so the same pushback recorded under two transcript entries
|
|
62
|
+
collapses to one row, and the database stays portable and idempotent across moves.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
parts: The content fragments that uniquely identify a candidate.
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
The SHA-256 hex digest of the parts joined by a null byte.
|
|
69
|
+
"""
|
|
70
|
+
return DedupKey(hashlib.sha256("\x00".join(parts).encode()).hexdigest())
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""The de-noising confidence primitive carried alongside mined feedback facts."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import TYPE_CHECKING, NewType
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Mapping
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
Confidence = NewType("Confidence", float)
|
|
13
|
+
"""A de-noising score in the closed interval [0, 1]; higher is more trustworthy."""
|
|
14
|
+
|
|
15
|
+
NONE = Confidence(0.0)
|
|
16
|
+
LOW = Confidence(0.25)
|
|
17
|
+
MEDIUM = Confidence(0.5)
|
|
18
|
+
HIGH = Confidence(0.75)
|
|
19
|
+
VERY_HIGH = Confidence(0.95)
|
|
20
|
+
NOISE_FLOOR = LOW
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True, slots=True)
|
|
24
|
+
class CandidateSignal:
|
|
25
|
+
"""A confidence verdict on a mined fact, with the reasons that produced it.
|
|
26
|
+
|
|
27
|
+
Attributes:
|
|
28
|
+
confidence: The de-noising score in [0, 1].
|
|
29
|
+
reasons: The short reason codes that justify the score.
|
|
30
|
+
durable: Whether the signal should persist across re-derivation.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
confidence: Confidence
|
|
34
|
+
reasons: tuple[str, ...] = ()
|
|
35
|
+
durable: bool = True
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def strong(*reasons: str, durable: bool = True) -> CandidateSignal:
|
|
39
|
+
"""Returns a :data:`HIGH`-confidence signal carrying ``reasons``."""
|
|
40
|
+
return CandidateSignal(HIGH, reasons, durable)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def firm(*reasons: str, durable: bool = True) -> CandidateSignal:
|
|
44
|
+
"""Returns a :data:`MEDIUM`-confidence signal carrying ``reasons``."""
|
|
45
|
+
return CandidateSignal(MEDIUM, reasons, durable)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def weak(*reasons: str, durable: bool = True) -> CandidateSignal:
|
|
49
|
+
"""Returns a :data:`LOW`-confidence signal carrying ``reasons``."""
|
|
50
|
+
return CandidateSignal(LOW, reasons, durable)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def noise(*reasons: str, durable: bool = True) -> CandidateSignal:
|
|
54
|
+
"""Returns a :data:`NONE`-confidence signal carrying ``reasons``."""
|
|
55
|
+
return CandidateSignal(NONE, reasons, durable)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def effective_confidence(signal: CandidateSignal | None) -> Confidence:
|
|
59
|
+
"""Returns ``signal``'s confidence, or :data:`MEDIUM` when no signal is set."""
|
|
60
|
+
return signal.confidence if signal else MEDIUM
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def to_payload(signal: CandidateSignal) -> dict[str, Any]:
|
|
64
|
+
return {"confidence": signal.confidence, "reasons": list(signal.reasons), "durable": signal.durable}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def from_payload(data: Mapping[str, Any] | None) -> CandidateSignal | None:
|
|
68
|
+
if data is None:
|
|
69
|
+
return None
|
|
70
|
+
return CandidateSignal(
|
|
71
|
+
confidence=Confidence(data["confidence"]),
|
|
72
|
+
reasons=tuple(data["reasons"]),
|
|
73
|
+
durable=data["durable"],
|
|
74
|
+
)
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""The conversational-window primitive captured around each piece of feedback."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
8
|
+
|
|
9
|
+
from cc_transcript.models import AssistantEvent, ToolUseBlock, UserEvent
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Mapping, Sequence
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from cc_transcript.models import TranscriptEvent
|
|
16
|
+
|
|
17
|
+
ASSISTANT_TEXT_LIMIT = 2000
|
|
18
|
+
TOOL_INPUT_LIMIT = 1500
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True, slots=True)
|
|
22
|
+
class ContextTurn:
|
|
23
|
+
"""One conversational turn surrounding a piece of feedback.
|
|
24
|
+
|
|
25
|
+
Attributes:
|
|
26
|
+
role: Whether the turn came from the user, the assistant, or a tool.
|
|
27
|
+
text: The turn's text content.
|
|
28
|
+
tool_calls: The names of the tools the turn invoked, in order.
|
|
29
|
+
tool_inputs: One input summary per tool call, in the same order.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
role: Literal["user", "assistant", "tool"]
|
|
33
|
+
text: str
|
|
34
|
+
tool_calls: tuple[str, ...] = ()
|
|
35
|
+
tool_inputs: tuple[str, ...] = ()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True, slots=True)
|
|
39
|
+
class ContextSnapshot:
|
|
40
|
+
"""The conversational window around a piece of feedback.
|
|
41
|
+
|
|
42
|
+
Attributes:
|
|
43
|
+
before: The turns leading up to the trigger.
|
|
44
|
+
trigger: The assistant action the feedback responds to, when known.
|
|
45
|
+
after: The turns following the trigger.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
before: tuple[ContextTurn, ...]
|
|
49
|
+
trigger: ContextTurn | None
|
|
50
|
+
after: tuple[ContextTurn, ...]
|
|
51
|
+
|
|
52
|
+
def to_json(self) -> str:
|
|
53
|
+
"""Serializes the snapshot to the JSON stored in ``context_json``."""
|
|
54
|
+
return json.dumps(
|
|
55
|
+
{
|
|
56
|
+
"before": [turn_to_dict(turn) for turn in self.before],
|
|
57
|
+
"trigger": turn_to_dict(self.trigger) if self.trigger else None,
|
|
58
|
+
"after": [turn_to_dict(turn) for turn in self.after],
|
|
59
|
+
}
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
@classmethod
|
|
63
|
+
def from_json(cls, raw: str) -> ContextSnapshot:
|
|
64
|
+
"""Deserializes a snapshot from a ``context_json`` string."""
|
|
65
|
+
data = json.loads(raw)
|
|
66
|
+
return cls(
|
|
67
|
+
before=tuple(turn_from_dict(turn) for turn in data["before"]),
|
|
68
|
+
trigger=turn_from_dict(data["trigger"]) if data["trigger"] else None,
|
|
69
|
+
after=tuple(turn_from_dict(turn) for turn in data["after"]),
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def turn_to_dict(turn: ContextTurn) -> dict[str, Any]:
|
|
74
|
+
return {
|
|
75
|
+
"role": turn.role,
|
|
76
|
+
"text": turn.text,
|
|
77
|
+
"tool_calls": list(turn.tool_calls),
|
|
78
|
+
"tool_inputs": list(turn.tool_inputs),
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def turn_from_dict(data: Mapping[str, Any]) -> ContextTurn:
|
|
83
|
+
return ContextTurn(
|
|
84
|
+
role=data["role"],
|
|
85
|
+
text=data["text"],
|
|
86
|
+
tool_calls=tuple(data["tool_calls"]),
|
|
87
|
+
tool_inputs=tuple(data.get("tool_inputs", ())),
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def summarize_tool_input(name: str, input: Mapping[str, Any]) -> str:
|
|
92
|
+
"""Summarizes one tool call's input for context snapshots.
|
|
93
|
+
|
|
94
|
+
Extracts the field that captures what the tool actually did — the Bash
|
|
95
|
+
command, the Edit diff, the plan body — falling back to the raw JSON for
|
|
96
|
+
unrecognized tools, truncated to :data:`TOOL_INPUT_LIMIT`.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
name: The tool's name as recorded in the transcript.
|
|
100
|
+
input: The tool call's input mapping, preserved verbatim by the parser.
|
|
101
|
+
|
|
102
|
+
Returns:
|
|
103
|
+
The bounded one-string summary of the call.
|
|
104
|
+
"""
|
|
105
|
+
match name:
|
|
106
|
+
case "Bash":
|
|
107
|
+
summary = str(input.get("command", ""))
|
|
108
|
+
case "Edit":
|
|
109
|
+
summary = f"{input.get('file_path', '')}\n- {input.get('old_string', '')}\n+ {input.get('new_string', '')}"
|
|
110
|
+
case "MultiEdit":
|
|
111
|
+
first: Mapping[str, Any] = next(iter(input.get("edits") or ()), {})
|
|
112
|
+
summary = f"{input.get('file_path', '')}\n- {first.get('old_string', '')}\n+ {first.get('new_string', '')}"
|
|
113
|
+
case "Write":
|
|
114
|
+
summary = f"{input.get('file_path', '')}\n{input.get('content', '')}"
|
|
115
|
+
case "ExitPlanMode":
|
|
116
|
+
summary = str(input.get("plan", ""))
|
|
117
|
+
case "Task" | "Agent":
|
|
118
|
+
summary = str(input.get("prompt", ""))
|
|
119
|
+
case _:
|
|
120
|
+
summary = json.dumps(dict(input))
|
|
121
|
+
return summary[:TOOL_INPUT_LIMIT]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def turn_for(event: UserEvent | AssistantEvent) -> ContextTurn:
|
|
125
|
+
match event:
|
|
126
|
+
case UserEvent():
|
|
127
|
+
return ContextTurn(role="user", text=event.text)
|
|
128
|
+
case AssistantEvent():
|
|
129
|
+
uses = tuple(block for block in event.blocks if isinstance(block, ToolUseBlock))
|
|
130
|
+
return ContextTurn(
|
|
131
|
+
role="assistant",
|
|
132
|
+
text=event.text[:ASSISTANT_TEXT_LIMIT],
|
|
133
|
+
tool_calls=tuple(use.name for use in uses),
|
|
134
|
+
tool_inputs=tuple(summarize_tool_input(use.name, use.input) for use in uses),
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def trigger_for(events: Sequence[TranscriptEvent], index: int, lower: int) -> ContextTurn | None:
|
|
139
|
+
return next(
|
|
140
|
+
(
|
|
141
|
+
turn_for(event)
|
|
142
|
+
for i in range(index - 1, lower - 1, -1)
|
|
143
|
+
if isinstance(event := events[i], AssistantEvent)
|
|
144
|
+
),
|
|
145
|
+
None,
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def build_snapshot(
|
|
150
|
+
events: Sequence[TranscriptEvent],
|
|
151
|
+
index: int,
|
|
152
|
+
*,
|
|
153
|
+
before: int = 6,
|
|
154
|
+
after: int = 2,
|
|
155
|
+
lower_bound: int | None = None,
|
|
156
|
+
) -> ContextSnapshot:
|
|
157
|
+
"""Builds the conversational window around the event at ``index``.
|
|
158
|
+
|
|
159
|
+
A turn is a :class:`UserEvent` or :class:`AssistantEvent`; system, mode, and
|
|
160
|
+
other events are skipped. The trigger is the nearest preceding assistant
|
|
161
|
+
turn — the action the feedback responds to.
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
events: The full ordered event stream for one transcript.
|
|
165
|
+
index: The index of the event the feedback was attached to.
|
|
166
|
+
before: The maximum number of turns to capture before the trigger.
|
|
167
|
+
after: The maximum number of turns to capture after the index.
|
|
168
|
+
lower_bound: When set, an event index the ``before`` window and trigger
|
|
169
|
+
search may not reach back past — used to anchor plan-review context
|
|
170
|
+
to the triggering edit cycle.
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
The assembled :class:`ContextSnapshot`.
|
|
174
|
+
"""
|
|
175
|
+
lower = lower_bound if lower_bound is not None else 0
|
|
176
|
+
return ContextSnapshot(
|
|
177
|
+
before=tuple(
|
|
178
|
+
turn_for(event)
|
|
179
|
+
for i in range(index - 1, lower - 1, -1)
|
|
180
|
+
if isinstance(event := events[i], UserEvent | AssistantEvent)
|
|
181
|
+
)[:before][::-1],
|
|
182
|
+
trigger=trigger_for(events, index, lower),
|
|
183
|
+
after=tuple(
|
|
184
|
+
turn_for(event)
|
|
185
|
+
for i in range(index + 1, len(events))
|
|
186
|
+
if isinstance(event := events[i], UserEvent | AssistantEvent)
|
|
187
|
+
)[:after],
|
|
188
|
+
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Generic infrastructure for parsing structured code-review messages.
|
|
2
|
+
|
|
3
|
+
The concrete review formats are app policy; an app injects its own
|
|
4
|
+
:class:`ReviewFormat` sequence into :func:`extract_all`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import re
|
|
14
|
+
from collections.abc import Callable, Iterator, Sequence
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True, slots=True)
|
|
18
|
+
class ReviewComment:
|
|
19
|
+
"""A single inline review comment parsed from a code-review message.
|
|
20
|
+
|
|
21
|
+
Attributes:
|
|
22
|
+
file: The file the comment targets, when cited.
|
|
23
|
+
line_start: The first line the comment targets, when cited.
|
|
24
|
+
line_end: The last line the comment targets, when a range is cited.
|
|
25
|
+
comment: The comment's text.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
file: str | None
|
|
29
|
+
line_start: int | None
|
|
30
|
+
line_end: int | None
|
|
31
|
+
comment: str
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True, slots=True)
|
|
35
|
+
class ReviewFormat:
|
|
36
|
+
"""A named code-review text format with a detector and extractor.
|
|
37
|
+
|
|
38
|
+
Attributes:
|
|
39
|
+
name: The format's identifier.
|
|
40
|
+
pattern: A pattern that matches when the format is present in a text.
|
|
41
|
+
extract: Parses a matching text into its review comments.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
name: str
|
|
45
|
+
pattern: re.Pattern[str]
|
|
46
|
+
extract: Callable[[str], tuple[ReviewComment, ...]]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def extract_all(text: str, formats: Sequence[ReviewFormat]) -> Iterator[tuple[ReviewFormat, ReviewComment]]:
|
|
50
|
+
"""Yields every ``(format, comment)`` extracted by any matching format.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
text: The raw review message text.
|
|
54
|
+
formats: The review formats to try, in order.
|
|
55
|
+
|
|
56
|
+
Yields:
|
|
57
|
+
One pair per extracted comment, across all formats whose pattern matches.
|
|
58
|
+
"""
|
|
59
|
+
return (
|
|
60
|
+
(fmt, comment)
|
|
61
|
+
for fmt in formats
|
|
62
|
+
if fmt.pattern.search(text)
|
|
63
|
+
for comment in fmt.extract(text)
|
|
64
|
+
)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Claude Code transcript marker constants the mining fact-detectors recognize."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from cc_transcript import INTERRUPT_MARKER_RE as INTERRUPT_MARKER_RE
|
|
6
|
+
|
|
7
|
+
DENIAL_PREFIX = "The user doesn't want to proceed with this tool use. The tool use was rejected"
|
|
8
|
+
USER_SAID_MARKER = "To tell you how to proceed, the user said:\n"
|
|
9
|
+
USER_SAID_TRAILER = "Note: The user's next message"
|
|
10
|
+
EDIT_TOOLS = frozenset({"Edit", "Write", "MultiEdit", "NotebookEdit"})
|
|
11
|
+
REENTRY_LOOKBACK = 40
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Pure navigation helpers over a transcript's ordered events."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from cc_transcript.models import AssistantEvent, ToolResultBlock, ToolUseBlock, UserEvent
|
|
8
|
+
|
|
9
|
+
from cc_transcript.domains.mining.markers import (
|
|
10
|
+
DENIAL_PREFIX,
|
|
11
|
+
EDIT_TOOLS,
|
|
12
|
+
INTERRUPT_MARKER_RE,
|
|
13
|
+
REENTRY_LOOKBACK,
|
|
14
|
+
USER_SAID_MARKER,
|
|
15
|
+
USER_SAID_TRAILER,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Iterator, Sequence
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from cc_transcript.models import ToolUseId, TranscriptEvent
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def tool_uses(events: Sequence[TranscriptEvent]) -> dict[ToolUseId, ToolUseBlock]:
|
|
26
|
+
return {
|
|
27
|
+
block.id: block
|
|
28
|
+
for event in events
|
|
29
|
+
if isinstance(event, AssistantEvent)
|
|
30
|
+
for block in event.blocks
|
|
31
|
+
if isinstance(block, ToolUseBlock)
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def denial_results(event: UserEvent) -> Iterator[ToolResultBlock]:
|
|
36
|
+
return (
|
|
37
|
+
block
|
|
38
|
+
for block in event.blocks
|
|
39
|
+
if isinstance(block, ToolResultBlock)
|
|
40
|
+
if block.is_error
|
|
41
|
+
if block.content.startswith(DENIAL_PREFIX)
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def embedded_user_text(content: str) -> str | None:
|
|
46
|
+
if (start := content.find(USER_SAID_MARKER)) == -1:
|
|
47
|
+
return None
|
|
48
|
+
return content[start + len(USER_SAID_MARKER) :].split(USER_SAID_TRAILER, 1)[0].strip()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def last_edit_index(events: Sequence[TranscriptEvent], index: int) -> int | None:
|
|
52
|
+
return next(
|
|
53
|
+
(
|
|
54
|
+
i
|
|
55
|
+
for i in range(index - 1, max(index - REENTRY_LOOKBACK, 0) - 1, -1)
|
|
56
|
+
if isinstance(event := events[i], AssistantEvent)
|
|
57
|
+
if any(isinstance(b, ToolUseBlock) and b.name in EDIT_TOOLS for b in event.blocks)
|
|
58
|
+
),
|
|
59
|
+
None,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def next_user_message(events: Sequence[TranscriptEvent], index: int) -> tuple[int, UserEvent] | None:
|
|
64
|
+
return next(
|
|
65
|
+
(
|
|
66
|
+
(i, event)
|
|
67
|
+
for i in range(index, len(events))
|
|
68
|
+
if isinstance(event := events[i], UserEvent)
|
|
69
|
+
if event.text.strip()
|
|
70
|
+
),
|
|
71
|
+
None,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def denied_tool_payload(use: ToolUseBlock) -> dict[str, Any]:
|
|
76
|
+
return {"tool": use.name, "file_path": use.input.get("file_path")}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def interrupt_marker(content: str) -> str | None:
|
|
80
|
+
stripped = content.lstrip()
|
|
81
|
+
if (match := INTERRUPT_MARKER_RE.match(stripped)) is None:
|
|
82
|
+
return None
|
|
83
|
+
end = stripped.find("]")
|
|
84
|
+
return stripped[: end + 1] if end != -1 else match.group(0)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def is_bare_interrupt_marker(text: str) -> bool:
|
|
88
|
+
return (marker := interrupt_marker(text)) is not None and not text.strip()[len(marker.strip()) :].strip()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def marker_in(event: UserEvent) -> str | None:
|
|
92
|
+
return next(
|
|
93
|
+
(
|
|
94
|
+
marker
|
|
95
|
+
for block in event.blocks
|
|
96
|
+
if isinstance(block, ToolResultBlock)
|
|
97
|
+
if (marker := interrupt_marker(block.content)) is not None
|
|
98
|
+
),
|
|
99
|
+
None,
|
|
100
|
+
)
|