contextbase-plugin-claude-local 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.3
2
+ Name: contextbase-plugin-claude-local
3
+ Version: 0.2.9
4
+ Summary: Claude local plugin for ContextBase
5
+ Author: Alizain Feerasta
6
+ Author-email: Alizain Feerasta <alizain.feerasta@gmail.com>
7
+ Requires-Dist: contextbase-shared-plugins==0.2.9
8
+ Requires-Dist: dagster==1.12.14
9
+ Requires-Dist: dagster-dlt==0.28.14
10
+ Requires-Dist: dlt>=1.26.0
11
+ Requires-Dist: pydantic>=2.12.0
12
+ Requires-Python: >=3.14, <3.15
@@ -0,0 +1,17 @@
1
+ plugin_claude_local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ plugin_claude_local/binding_config.py,sha256=a4cpuKyuUPQVqIqIMCMFy2meKLkk-sZjRcb3b6BOV7A,337
3
+ plugin_claude_local/component.py,sha256=9m-42ISkOrwsx0Jtxa8zVL-dDOSJfaAT4P1DVQN-7Ug,4120
4
+ plugin_claude_local/defs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ plugin_claude_local/defs/defs.yaml,sha256=eBgWLlMQShBV205vMyw-mhA6rS9-A4pu8PnBvPsdNbw,61
6
+ plugin_claude_local/models/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
7
+ plugin_claude_local/models/ctx.py,sha256=mDOde7jA0IQzD0BMHoPA0CG4ijd3C4MDC1nHhrUrD4M,2605
8
+ plugin_claude_local/models/ingress.py,sha256=9zG4L5wSS3v8I-oeXKskZkuzrw27yQtGAD0RHKwVENs,3843
9
+ plugin_claude_local/models/translators.py,sha256=i9HBM0wBZBWfUkk4HMoa8ad7FwYNOa-87EWMCObgJQo,7506
10
+ plugin_claude_local/plugin.json,sha256=j_wDEBQ05D_7oMz5wXWUZz-BHhjDxMazubGTzATbx6A,85
11
+ plugin_claude_local/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ plugin_claude_local/sources/snapshot.py,sha256=1MjTA7b3LfjAo2ldvX3DXXBvnBSqMsz8mFHZjX36bfE,3729
13
+ plugin_claude_local/utils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
14
+ plugin_claude_local/utils/parse.py,sha256=d6XtBzUksY-q1thsirmSQhJOUFad2GwRu-S7XMgmTss,12848
15
+ contextbase_plugin_claude_local-0.2.9.dist-info/WHEEL,sha256=i9aSRDivn5iP9LaR1BLQX2GNAuriQWPsFwbbWygTX2k,81
16
+ contextbase_plugin_claude_local-0.2.9.dist-info/METADATA,sha256=00_zPCVDl70-NkhIIHjB5mnbcWSSKmmHGwpVx5J-49U,412
17
+ contextbase_plugin_claude_local-0.2.9.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.15
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
File without changes
@@ -0,0 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from pydantic import Field
6
+
7
+ from shared_plugins.bindings import BaseBindingConfigModel, ResolvedPath
8
+
9
+
10
def _default_projects_dir() -> Path:
    # Default location: ".claude/projects" under the current user's home.
    return Path.home().joinpath(".claude", "projects")


class ClaudeLocalBindingConfig(BaseBindingConfigModel):
    # Per-binding settings for the Claude local plugin.
    # projects_dir falls back to the default factory above when the binding
    # does not provide a value.
    projects_dir: ResolvedPath = Field(default_factory=_default_projects_dir)
@@ -0,0 +1,122 @@
1
+ import dagster as dg
2
+ from dagster import AssetExecutionContext
3
+ from dagster_dlt import DagsterDltResource
4
+ from shared_plugins.automation import non_overlapping_automation_condition
5
+ from shared_plugins.bindings import parse_binding_config
6
+ from shared_plugins.control_plane import ControlPlaneClient
7
+ from shared_plugins.dlt import resolve_partition_binding, run_dlt_pipeline
8
+ from shared_plugins.naming import (
9
+ dagster_asset_group_name,
10
+ dagster_asset_tags,
11
+ dagster_dlt_asset_key,
12
+ dagster_partition_def_name,
13
+ dagster_pool_name,
14
+ dlt_source_name,
15
+ plugin_id_from_module,
16
+ )
17
+ from shared_plugins.resources import DLT_RESOURCE
18
+
19
+ from .binding_config import ClaudeLocalBindingConfig
20
+ from .sources.snapshot import claude_local_snapshot_source
21
+
22
# Plugin identifier, derived by the shared naming helper from this file's path.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name used when naming the dlt source and when running the pipeline.
SNAPSHOT_JOB = "snapshot"
# dlt source name for the snapshot job; asset keys are built from it.
SNAPSHOT_SOURCE_NAME = dlt_source_name(PLUGIN_ID, SNAPSHOT_JOB)
25
+
26
+
27
def _build_snapshot_specs(
    partitions_def: dg.PartitionsDefinition,
    automation_condition: dg.AutomationCondition,
) -> list[dg.AssetSpec]:
    """Build the asset specs for the snapshot job.

    The four specs form a linear chain: project -> conversation -> line ->
    turn. Every spec shares the same group, tags, automation condition and
    partitions definition; only the first one has no upstream dependency.
    """
    common_kwargs = {
        "group_name": dagster_asset_group_name(PLUGIN_ID),
        "tags": dagster_asset_tags(PLUGIN_ID),
        "automation_condition": automation_condition,
        "partitions_def": partitions_def,
    }

    specs: list[dg.AssetSpec] = []
    upstream_key = None
    for resource in ("project", "conversation", "line", "turn"):
        asset_key = dagster_dlt_asset_key(SNAPSHOT_SOURCE_NAME, resource)
        if upstream_key is None:
            # Head of the chain: no deps argument at all.
            specs.append(dg.AssetSpec(key=asset_key, **common_kwargs))
        else:
            specs.append(
                dg.AssetSpec(key=asset_key, deps=[upstream_key], **common_kwargs)
            )
        upstream_key = asset_key

    return specs
64
+
65
+
66
class ClaudeLocalSyncComponent(dg.Component):
    # Dagster component for the Claude local plugin: it defines one dynamically
    # partitioned multi-asset for the snapshot job and the automation sensor
    # that targets it.
    def build_defs(self, context: dg.ComponentLoadContext) -> dg.Definitions:
        # Materialize when missing or on the 15-minute cron, wrapped by the
        # shared non-overlapping helper.
        refresh_condition = non_overlapping_automation_condition(
            dg.AutomationCondition.on_missing()
            | dg.AutomationCondition.on_cron("*/15 * * * *")
        )
        binding_partitions = dg.DynamicPartitionsDefinition(
            name=dagster_partition_def_name(PLUGIN_ID)
        )
        asset_specs = _build_snapshot_specs(
            partitions_def=binding_partitions,
            automation_condition=refresh_condition,
        )

        @dg.multi_asset(
            specs=asset_specs,
            can_subset=True,
            name="claude_local_snapshot",
            pool=dagster_pool_name(PLUGIN_ID),
        )
        def claude_local_snapshot_assets(
            context: AssetExecutionContext,
            dlt_resource: DagsterDltResource,
            control_plane: dg.ResourceParam[ControlPlaneClient],
        ):
            # Look up the binding for this run via the control plane, then
            # parse its config into the plugin's config model.
            resolved = resolve_partition_binding(
                context=context,
                control_plane=control_plane,
                plugin_id=PLUGIN_ID,
            )
            binding_key = str(resolved.binding_id)
            binding_cfg = parse_binding_config(resolved, ClaudeLocalBindingConfig)

            yield from run_dlt_pipeline(
                context=context,
                dlt_resource=dlt_resource,
                source=claude_local_snapshot_source(binding_key, binding_cfg),
                plugin_id=PLUGIN_ID,
                binding_id=binding_key,
                job_name=SNAPSHOT_JOB,
            )

        # Sensor that evaluates the automation condition for the assets above;
        # it is running by default.
        sensor = dg.AutomationConditionSensorDefinition(
            name="claude_local_automation_sensor",
            target=dg.AssetSelection.assets(claude_local_snapshot_assets),
            default_status=dg.DefaultSensorStatus.RUNNING,
            minimum_interval_seconds=30,
        )

        return dg.Definitions(
            assets=[claude_local_snapshot_assets],
            sensors=[sensor],
            resources={"dlt_resource": DLT_RESOURCE},
        )
File without changes
@@ -0,0 +1 @@
1
+ type: plugin_claude_local.component.ClaudeLocalSyncComponent
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,89 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import AwareDatetime
6
+ from shared_plugins.models import CtxModel, IdStr, NonNegativeInt
7
+
8
+
9
class ProjectRow(CtxModel):
    # Row model for the "project" resource.
    # encoded_dir identifies the project within a binding.
    encoded_dir: IdStr
11
+
12
+
13
class ConversationRow(CtxModel):
    # Row model for the "conversation" resource: one row per JSONL file.
    jsonl_stem: IdStr
    project_dir: IdStr
    # Defaults to the empty string (not None) when no parent directory is set.
    jsonl_parent_dir: str = ""
    jsonl_path: IdStr
    # Optional; None when no modification time is available.
    jsonl_mtime: AwareDatetime | None = None
    user_message_count: NonNegativeInt
    assistant_message_count: NonNegativeInt
21
+
22
+
23
class LineRow(CtxModel):
    # Row model for the "line" resource: one row per captured JSONL line.

    # -- file location --
    project_dir: IdStr
    jsonl_stem: IdStr
    jsonl_parent_dir: str = ""
    line_index: NonNegativeInt

    # -- optional fields lifted from the parsed line --
    type: str | None = None
    subtype: str | None = None
    uuid: str | None = None
    parent_uuid: str | None = None
    timestamp: AwareDatetime | None = None
    session_id: str | None = None
    version: str | None = None
    cwd: str | None = None

    # The JSONL line text in canonical form: the parser has already removed
    # trailing CR/LF.
    raw_line: str
    # The parsed JSON value, kept for querying. Before persistence, embedded
    # null bytes may be removed from strings in nested structures, because
    # Postgres rejects \u0000 in text/jsonb.
    payload: Any = None
    parse_error: str | None = None
    turn_projection_error: str | None = None
44
+
45
+
46
class TurnRow(CtxModel):
    # Row model for the "turn" resource: the curated projection of user and
    # assistant transcript lines.

    # -- where the turn came from --
    project_dir: IdStr
    jsonl_stem: IdStr
    jsonl_parent_dir: str = ""
    line_index: NonNegativeInt

    # -- event envelope: fields shared by user and assistant events --
    type: str
    uuid: IdStr
    parent_uuid: str | None = None
    timestamp: AwareDatetime
    is_sidechain: bool
    user_type: str
    cwd: str
    session_id: str
    version: str
    git_branch: str
    slug: str | None = None
    agent_id: str | None = None
    entrypoint: str | None = None

    # -- populated for user turns only --
    prompt_id: str | None = None
    is_meta: bool | None = None
    tool_use_result: Any = None  # jsonb: dict | list | str | None
    source_tool_use_id: str | None = None
    source_tool_assistant_uuid: str | None = None
    thinking_metadata: Any = None  # jsonb: dict | None
    permission_mode: str | None = None
    todos: Any = None  # jsonb: list | None
    is_visible_in_transcript_only: bool | None = None
    is_compact_summary: bool | None = None
    plan_content: str | None = None

    # -- populated for assistant turns only --
    request_id: str | None = None
    error: str | None = None
    is_api_error_message: bool | None = None
    api_error: str | None = None

    # -- message body and non-promoted top-level keys (jsonb) --
    message: dict[str, Any]
    turn_extra: dict[str, Any] | None = None
@@ -0,0 +1,110 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Literal
4
+
5
+ from pydantic import AwareDatetime, Field
6
+ from shared_plugins.models import IngressModel
7
+
8
# The three key sets below mark the boundary of the curated transcript
# projection. parse.py splits each raw user/assistant record with them:
#   * keys listed here become first-class turn columns;
#   * every other top-level key is collected under turn_extra.
# Only add a key when claude_local.turn should gain a durable query column.
COMMON_TRANSCRIPT_SOURCE_KEYS = frozenset(
    (
        "type",
        "uuid",
        "parentUuid",
        "timestamp",
        "message",
        "isSidechain",
        "userType",
        "cwd",
        "sessionId",
        "version",
        "gitBranch",
        "slug",
        "agentId",
        "entrypoint",
    )
)

# Common keys plus the keys promoted for user records only.
USER_TRANSCRIPT_SOURCE_KEYS = COMMON_TRANSCRIPT_SOURCE_KEYS.union(
    (
        "promptId",
        "isMeta",
        "toolUseResult",
        "sourceToolUseID",
        "sourceToolAssistantUUID",
        "thinkingMetadata",
        "permissionMode",
        "todos",
        "isVisibleInTranscriptOnly",
        "isCompactSummary",
        "planContent",
    )
)

# Common keys plus the keys promoted for assistant records only.
ASSISTANT_TRANSCRIPT_SOURCE_KEYS = COMMON_TRANSCRIPT_SOURCE_KEYS.union(
    (
        "requestId",
        "error",
        "isApiErrorMessage",
        "apiError",
    )
)
56
+
57
+
58
class ClaudeTranscriptTurnIngress(IngressModel):
    # Deliberately narrower than the full schema of Claude's local JSONL log:
    # this validates only the transcript projection that claude_local.turn
    # promises, not every nested field the raw source may carry.
    uuid: str
    # No default: the "parentUuid" key is required, although its value may be null.
    parent_uuid: str | None = Field(alias="parentUuid")
    timestamp: AwareDatetime
    # The message envelope stays raw JSON, so new nested fields remain
    # queryable without changing this model for each one.
    message: dict[str, Any]
    is_sidechain: bool = Field(alias="isSidechain")
    user_type: str = Field(alias="userType")
    cwd: str
    session_id: str = Field(alias="sessionId")
    version: str
    git_branch: str = Field(alias="gitBranch")
    slug: str | None = None
    agent_id: str | None = Field(default=None, alias="agentId")
    entrypoint: str | None = None
    # Holds the top-level keys of the raw record that are not promoted to
    # first-class columns, so claude_local.turn does not lose them.
    turn_extra: dict[str, Any] | None = None
80
+
81
+
82
class ClaudeUserTurnIngress(ClaudeTranscriptTurnIngress):
    # Projection of a record whose "type" is "user"; every user-only field is
    # optional and defaults to None.
    type: Literal["user"]
    prompt_id: str | None = Field(default=None, alias="promptId")
    is_meta: bool | None = Field(default=None, alias="isMeta")
    # Accepted as an object, an array, or a plain string.
    tool_use_result: dict[str, Any] | list[Any] | str | None = Field(
        default=None,
        alias="toolUseResult",
    )
    source_tool_use_id: str | None = Field(default=None, alias="sourceToolUseID")
    source_tool_assistant_uuid: str | None = Field(
        default=None,
        alias="sourceToolAssistantUUID",
    )
    thinking_metadata: dict[str, Any] | None = Field(
        default=None,
        alias="thinkingMetadata",
    )
    permission_mode: str | None = Field(default=None, alias="permissionMode")
    todos: list[Any] | None = None
    is_visible_in_transcript_only: bool | None = Field(
        default=None,
        alias="isVisibleInTranscriptOnly",
    )
    is_compact_summary: bool | None = Field(default=None, alias="isCompactSummary")
    plan_content: str | None = Field(default=None, alias="planContent")
103
+
104
+
105
class ClaudeAssistantTurnIngress(ClaudeTranscriptTurnIngress):
    # Projection of a record whose "type" is "assistant"; every assistant-only
    # field is optional and defaults to None.
    type: Literal["assistant"]
    request_id: str | None = Field(default=None, alias="requestId")
    error: str | None = None
    is_api_error_message: bool | None = Field(
        default=None,
        alias="isApiErrorMessage",
    )
    api_error: str | None = Field(default=None, alias="apiError")
@@ -0,0 +1,208 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable, Iterator
4
+ from typing import Any
5
+
6
+ from ..utils.parse import ParsedConversationFile
7
+ from .ctx import ConversationRow, LineRow, ProjectRow, TurnRow
8
+ from .ingress import ClaudeAssistantTurnIngress, ClaudeUserTurnIngress
9
+
10
+
11
+ def _strip_null_bytes(value: Any) -> Any:
12
+ """Recursively strip \\x00 null bytes from strings in a structure.
13
+
14
+ Postgres rejects \\u0000 in both text and jsonb columns.
15
+
16
+ This is intentionally applied only to structured values that must be written
17
+ to Postgres text/jsonb columns. It does not change LineRow.raw_line, which
18
+ remains the canonical decoded JSONL line text for fidelity/debugging.
19
+ """
20
+ if isinstance(value, str):
21
+ return value.replace("\x00", "")
22
+ if isinstance(value, dict):
23
+ return {k: _strip_null_bytes(v) for k, v in value.items()}
24
+ if isinstance(value, list):
25
+ return [_strip_null_bytes(item) for item in value]
26
+ return value
27
+
28
+
29
def projects_to_ctx_models(
    *,
    binding_id: str,
    project_dirs: Iterable[str],
) -> Iterator[ProjectRow]:
    """Yield one ProjectRow per distinct project directory.

    Duplicates are skipped; the first occurrence decides the output order.
    """
    emitted: set[str] = set()

    for directory in project_dirs:
        if directory not in emitted:
            emitted.add(directory)
            yield ProjectRow(
                ctx_binding_id=binding_id,
                ctx_source_updated_at=None,
                encoded_dir=directory,
            )
46
+
47
+
48
def conversations_to_ctx_models(
    *,
    binding_id: str,
    conversations: Iterable[ParsedConversationFile],
) -> Iterator[ConversationRow]:
    """Yield one ConversationRow per parsed conversation file.

    The file mtime is used both as the row's jsonl_mtime and as its source
    update time; a missing parent conversation id becomes the empty string.
    """
    yield from (
        ConversationRow(
            ctx_binding_id=binding_id,
            ctx_source_updated_at=parsed.file_mtime,
            jsonl_stem=parsed.conversation_id,
            project_dir=parsed.project,
            jsonl_parent_dir=parsed.parent_conversation_id or "",
            jsonl_path=parsed.file_path,
            jsonl_mtime=parsed.file_mtime,
            user_message_count=parsed.user_message_count,
            assistant_message_count=parsed.assistant_message_count,
        )
        for parsed in conversations
    )
65
+
66
+
67
def lines_to_ctx_models(
    *,
    binding_id: str,
    conversations: Iterable[ParsedConversationFile],
) -> Iterator[LineRow]:
    """Yield one LineRow for every parsed line of every conversation."""
    for conversation in conversations:
        # File-level values are the same for every line of the conversation.
        project_dir = conversation.project
        stem = conversation.conversation_id
        parent_dir = conversation.parent_conversation_id or ""

        for parsed in conversation.lines:
            yield LineRow(
                ctx_binding_id=binding_id,
                ctx_source_updated_at=parsed.timestamp,
                project_dir=project_dir,
                jsonl_stem=stem,
                jsonl_parent_dir=parent_dir,
                line_index=parsed.line_index,
                type=parsed.type,
                subtype=parsed.subtype,
                uuid=parsed.uuid,
                parent_uuid=parsed.parent_uuid,
                timestamp=parsed.timestamp,
                session_id=parsed.session_id,
                version=parsed.version,
                cwd=parsed.cwd,
                raw_line=parsed.raw_line,
                # payload is the parsed, queryable form of the line rather than
                # a byte-perfect copy; raw_line is what to consult when the
                # exact decoded text matters (malformed JSON, embedded \\u0000).
                # A None payload passes through the helper unchanged.
                payload=_strip_null_bytes(parsed.payload),
                parse_error=parsed.parse_error,
                turn_projection_error=parsed.turn_projection_error,
            )
101
+
102
+
103
def _user_turn_to_ctx_model(
    *,
    binding_id: str,
    conversation: ParsedConversationFile,
    turn: ClaudeUserTurnIngress,
    line_index: int,
) -> TurnRow:
    """Build the TurnRow for a validated user turn.

    claude_local.turn is a projection of the source, not a rewrite of it:
    message, tool_use_result and turn_extra keep their shape and only have
    null bytes stripped.
    """
    # Scalar fields copied from the ingress model under the same name.
    copied_names = (
        "type",
        "uuid",
        "parent_uuid",
        "timestamp",
        "is_sidechain",
        "user_type",
        "cwd",
        "session_id",
        "version",
        "git_branch",
        "slug",
        "agent_id",
        "entrypoint",
        "prompt_id",
        "is_meta",
        "source_tool_use_id",
        "source_tool_assistant_uuid",
        "permission_mode",
        "is_visible_in_transcript_only",
        "is_compact_summary",
        "plan_content",
    )
    copied = {name: getattr(turn, name) for name in copied_names}

    # JSON-valued fields that are cleaned of null bytes before persistence.
    cleaned_names = (
        "tool_use_result",
        "thinking_metadata",
        "todos",
        "message",
        "turn_extra",
    )
    cleaned = {name: _strip_null_bytes(getattr(turn, name)) for name in cleaned_names}

    return TurnRow(
        ctx_binding_id=binding_id,
        ctx_source_updated_at=turn.timestamp,
        project_dir=conversation.project,
        jsonl_stem=conversation.conversation_id,
        jsonl_parent_dir=conversation.parent_conversation_id or "",
        line_index=line_index,
        **copied,
        **cleaned,
    )
146
+
147
+
148
def _assistant_turn_to_ctx_model(
    *,
    binding_id: str,
    conversation: ParsedConversationFile,
    turn: ClaudeAssistantTurnIngress,
    line_index: int,
) -> TurnRow:
    """Build the TurnRow for a validated assistant turn.

    The message and turn_extra structures stay raw (apart from null-byte
    stripping), so newly added nested fields reach the curated table without
    dedicated translator code.
    """
    # Scalar fields copied from the ingress model under the same name.
    copied_names = (
        "type",
        "uuid",
        "parent_uuid",
        "timestamp",
        "is_sidechain",
        "user_type",
        "cwd",
        "session_id",
        "version",
        "git_branch",
        "slug",
        "agent_id",
        "entrypoint",
        "request_id",
        "error",
        "is_api_error_message",
        "api_error",
    )
    copied = {name: getattr(turn, name) for name in copied_names}

    return TurnRow(
        ctx_binding_id=binding_id,
        ctx_source_updated_at=turn.timestamp,
        project_dir=conversation.project,
        jsonl_stem=conversation.conversation_id,
        jsonl_parent_dir=conversation.parent_conversation_id or "",
        line_index=line_index,
        **copied,
        message=_strip_null_bytes(turn.message),
        turn_extra=_strip_null_bytes(turn.turn_extra),
    )
184
+
185
+
186
+ def turns_to_ctx_models(
187
+ *,
188
+ binding_id: str,
189
+ conversations: Iterable[ParsedConversationFile],
190
+ ) -> Iterator[TurnRow]:
191
+ for conversation in conversations:
192
+ for validated in conversation.turns:
193
+ ingress = validated.ingress
194
+
195
+ if isinstance(ingress, ClaudeUserTurnIngress):
196
+ yield _user_turn_to_ctx_model(
197
+ binding_id=binding_id,
198
+ conversation=conversation,
199
+ turn=ingress,
200
+ line_index=validated.line_index,
201
+ )
202
+ elif isinstance(ingress, ClaudeAssistantTurnIngress):
203
+ yield _assistant_turn_to_ctx_model(
204
+ binding_id=binding_id,
205
+ conversation=conversation,
206
+ turn=ingress,
207
+ line_index=validated.line_index,
208
+ )
@@ -0,0 +1,7 @@
1
+ {
2
+ "auth": {
3
+ "type": "none"
4
+ },
5
+ "mode": "dagster",
6
+ "plugin_id": "claude_local"
7
+ }
File without changes
@@ -0,0 +1,119 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterator
4
+ from typing import Any
5
+
6
+ import dlt
7
+ from shared_plugins.naming import (
8
+ dlt_resource_name,
9
+ dlt_source_name,
10
+ plugin_id_from_module,
11
+ )
12
+ from shared_plugins.resources import ctx_dlt_resource
13
+
14
+ from ..binding_config import ClaudeLocalBindingConfig
15
+ from ..models.ctx import ConversationRow, LineRow, ProjectRow, TurnRow
16
+ from ..models.translators import (
17
+ conversations_to_ctx_models,
18
+ lines_to_ctx_models,
19
+ projects_to_ctx_models,
20
+ turns_to_ctx_models,
21
+ )
22
+ from ..utils.parse import scan_claude_snapshot
23
+
24
# Plugin identifier, derived by the shared naming helper from this file's path.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job name used to build the dlt source name.
JOB = "snapshot"
# Write disposition shared by every resource in this module: merge with the
# upsert strategy.
MERGE_WRITE_DISPOSITION = {"disposition": "merge", "strategy": "upsert"}
27
+
28
+
29
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def claude_local_snapshot_source(
    binding_id: str,
    cfg: ClaudeLocalBindingConfig,
) -> tuple[Any, ...]:
    # dlt source for the snapshot job. The projects directory is scanned once,
    # when the source is built, and the four resources (project, conversation,
    # line, turn) all read from that one scan result.
    snapshot = scan_claude_snapshot(cfg.projects_dir)

    # Key identifying one JSONL file; line and turn rows add the line index.
    conversation_pk = (
        "_ctx_binding_id",
        "project_dir",
        "jsonl_parent_dir",
        "jsonl_stem",
    )
    line_pk = (*conversation_pk, "line_index")

    @ctx_dlt_resource(
        name=dlt_resource_name("project"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=("_ctx_binding_id", "encoded_dir"),
    )
    def project_resource() -> Iterator[ProjectRow]:
        yield from projects_to_ctx_models(
            binding_id=binding_id,
            project_dirs=snapshot.project_dirs,
        )

    @ctx_dlt_resource(
        name=dlt_resource_name("conversation"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=conversation_pk,
    )
    def conversation_resource() -> Iterator[ConversationRow]:
        yield from conversations_to_ctx_models(
            binding_id=binding_id,
            conversations=snapshot.conversations,
        )

    @ctx_dlt_resource(
        name=dlt_resource_name("line"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=line_pk,
        columns={
            "payload": {"data_type": "json"},
            # Both error columns are correctness-critical, yet on a clean
            # corpus they can be entirely null. They are declared explicitly
            # so dlt keeps them even when the current load package gives it no
            # value to infer a type from.
            "parse_error": {"data_type": "text"},
            "turn_projection_error": {"data_type": "text"},
        },
    )
    def line_resource() -> Iterator[LineRow]:
        yield from lines_to_ctx_models(
            binding_id=binding_id,
            conversations=snapshot.conversations,
        )

    @ctx_dlt_resource(
        name=dlt_resource_name("turn"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=line_pk,
        # Transcript substructures are stored as JSON, so nested changes in
        # Claude's output stay queryable without new relational columns.
        columns={
            column: {"data_type": "json"}
            for column in (
                "message",
                "tool_use_result",
                "thinking_metadata",
                "todos",
                "turn_extra",
            )
        },
    )
    def turn_resource() -> Iterator[TurnRow]:
        yield from turns_to_ctx_models(
            binding_id=binding_id,
            conversations=snapshot.conversations,
        )

    return (
        project_resource,
        conversation_resource,
        line_resource,
        turn_resource,
    )
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,420 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ from dataclasses import dataclass
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from pydantic import ValidationError
12
+ from shared_plugins.models import format_validation_error
13
+ from shared_plugins.values import as_mapping, as_string, parse_utc_datetime_from_str
14
+
15
+ from ..models.ingress import (
16
+ ASSISTANT_TRANSCRIPT_SOURCE_KEYS,
17
+ USER_TRANSCRIPT_SOURCE_KEYS,
18
+ ClaudeAssistantTurnIngress,
19
+ ClaudeUserTurnIngress,
20
+ )
21
+
22
+ LOGGER = logging.getLogger(__name__)
23
+
24
+
25
@dataclass(frozen=True)
class ValidatedTurn:
    """A transcript line that passed ingress validation."""

    # Index of the line the ingress model was validated from.
    line_index: int
    # The validated model: either the user or the assistant variant.
    ingress: ClaudeUserTurnIngress | ClaudeAssistantTurnIngress
29
+
30
+
31
@dataclass(frozen=True)
class ParsedLine:
    """One non-empty JSONL line together with everything derived from it."""

    line_index: int
    # Text of the original non-empty JSONL line, minus trailing line terminators.
    raw_line: str
    # Decoded JSON value and, when decoding failed, the error description.
    payload: Any = None
    parse_error: str | None = None
    # Top-level fields lifted out of the payload; None when not available.
    type: str | None = None
    subtype: str | None = None
    uuid: str | None = None
    parent_uuid: str | None = None
    timestamp: datetime | None = None
    session_id: str | None = None
    version: str | None = None
    cwd: str | None = None
    # Outcome of projecting the line into a transcript turn.
    turn_projection_error: str | None = None
    turn: ValidatedTurn | None = None
48
+
49
+
50
+ @dataclass(frozen=True)
51
+ class ConversationFileRef:
52
+ file_path: str
53
+ project: str
54
+ conversation_id: str
55
+ parent_conversation_id: str | None
56
+
57
+
58
@dataclass(frozen=True)
class ParsedConversationFile:
    """A conversation file after parsing: its identity, lines and turns."""

    # -- identity --
    file_path: str
    project: str
    conversation_id: str
    parent_conversation_id: str | None
    # None when no modification time is available.
    file_mtime: datetime | None
    # -- parsed content --
    lines: list[ParsedLine]
    turns: list[ValidatedTurn]
    # -- message counts --
    user_message_count: int
    assistant_message_count: int
69
+
70
+
71
@dataclass(frozen=True)
class ParsedClaudeSnapshot:
    """Result of one scan: project directory names and parsed conversations."""

    project_dirs: list[str]
    conversations: list[ParsedConversationFile]
75
+
76
+
77
+ def parse_json_line(line: str) -> tuple[Any | None, str | None]:
78
+ try:
79
+ return json.loads(line), None
80
+ except json.JSONDecodeError as exc:
81
+ return None, f"invalid_json: {exc.msg} (column {exc.colno})"
82
+
83
+
84
+ def get_file_mtime(file_path: str) -> datetime | None:
85
+ try:
86
+ stat_result = Path(file_path).stat()
87
+ except OSError:
88
+ return None
89
+ return datetime.fromtimestamp(stat_result.st_mtime, tz=timezone.utc)
90
+
91
+
92
+ def _read_jsonl_lines(file_path: str) -> list[tuple[int, str]] | None:
93
+ """Read non-empty JSONL lines as text.
94
+
95
+ This preserves the original decoded line contents, but strips trailing CR/LF
96
+ terminators and skips blank lines. It is therefore a canonical JSONL-line
97
+ capture, not a byte-for-byte file mirror.
98
+ """
99
+ lines: list[tuple[int, str]] = []
100
+
101
+ try:
102
+ with open(file_path, "r", encoding="utf-8") as handle:
103
+ for line_index, raw_line in enumerate(handle):
104
+ line = raw_line.rstrip("\n").rstrip("\r")
105
+ if not line.strip():
106
+ continue
107
+ lines.append((line_index, line))
108
+ except (OSError, UnicodeError):
109
+ return None
110
+
111
+ return lines
112
+
113
+
114
def _extract_timestamp(value: object) -> datetime | None:
    """Parse a timestamp from a raw payload value.

    Returns None when the value does not come back from ``as_string`` as a
    string, or when parsing raises TypeError or ValueError.
    """
    text = as_string(value)
    if text is not None:
        try:
            return parse_utc_datetime_from_str(text)
        except (TypeError, ValueError):
            pass
    return None
123
+
124
+
125
+ def _build_projection_payload(
126
+ raw: dict[str, Any],
127
+ *,
128
+ source_keys: frozenset[str],
129
+ ) -> dict[str, Any]:
130
+ # Split the raw transcript line into promoted turn columns and durable
131
+ # top-level drift. This is the seam that lets ingress stay strict without
132
+ # pretending Claude's full top-level shape is stable.
133
+ prepared = {key: value for key, value in raw.items() if key in source_keys}
134
+ turn_extra = {key: value for key, value in raw.items() if key not in source_keys}
135
+ if turn_extra:
136
+ prepared["turn_extra"] = turn_extra
137
+ return prepared
138
+
139
+
140
def _validate_projection_model(
    raw: dict[str, Any],
    *,
    file_path: str,
    line_index: int,
    source_keys: frozenset[str],
    model_type: type[ClaudeUserTurnIngress] | type[ClaudeAssistantTurnIngress],
) -> tuple[ValidatedTurn | None, str | None]:
    """Validate a raw record against an ingress model.

    Returns:
        ``(turn, None)`` when validation succeeds, otherwise ``(None, error)``
        with the formatted validation error. Failures are also logged at
        WARNING level, with the raw record serialized and cut to 2000
        characters.
    """
    candidate = _build_projection_payload(raw, source_keys=source_keys)

    try:
        validated = model_type.model_validate(candidate)
    except ValidationError as exc:
        reason = format_validation_error(exc)
        raw_preview = json.dumps(raw, default=str, ensure_ascii=False)[:2000]
        LOGGER.warning(
            "claude_local.turn_projection_failed type=%s file=%s line=%d error=%s raw=%s",
            raw.get("type"),
            file_path,
            line_index,
            reason,
            raw_preview,
        )
        return None, reason
    else:
        return ValidatedTurn(line_index=line_index, ingress=validated), None
165
+
166
+
167
def _project_transcript_turn(
    raw: dict[str, Any],
    *,
    file_path: str,
    line_index: int,
) -> tuple[ValidatedTurn | None, str | None]:
    """Project a raw line into a transcript turn when its type calls for it.

    Only "user" and "assistant" lines belong to the curated transcript
    surface. Any other line type returns ``(None, None)`` and is still kept
    in claude_local.line.
    """
    # Line type -> (promoted key set, ingress model).
    projections = {
        "user": (USER_TRANSCRIPT_SOURCE_KEYS, ClaudeUserTurnIngress),
        "assistant": (ASSISTANT_TRANSCRIPT_SOURCE_KEYS, ClaudeAssistantTurnIngress),
    }

    selected = projections.get(as_string(raw.get("type")))
    if selected is None:
        return None, None

    promoted_keys, ingress_model = selected
    return _validate_projection_model(
        raw,
        file_path=file_path,
        line_index=line_index,
        source_keys=promoted_keys,
        model_type=ingress_model,
    )
196
+
197
+
198
def _build_line(
    *,
    raw_line: str,
    payload: Any,
    parse_error: str | None,
    file_path: str,
    line_index: int,
) -> ParsedLine:
    """Assemble the ParsedLine for one JSONL line.

    Top-level fields are lifted out of the payload when it is a non-empty
    mapping. A transcript projection is attempted whenever the payload is a
    mapping at all.
    """
    mapping = as_mapping(payload)

    def text_field(key: str) -> str | None:
        # A missing or empty mapping gives None without consulting as_string.
        return as_string(mapping.get(key)) if mapping else None

    line_type = text_field("type")

    projected_turn: ValidatedTurn | None = None
    projection_error: str | None = None
    if mapping is not None:
        projected_turn, projection_error = _project_transcript_turn(
            dict(mapping),
            file_path=file_path,
            line_index=line_index,
        )

    return ParsedLine(
        line_index=line_index,
        raw_line=raw_line,
        payload=payload,
        parse_error=parse_error,
        type=line_type,
        subtype=text_field("subtype"),
        uuid=text_field("uuid"),
        parent_uuid=text_field("parentUuid"),
        timestamp=_extract_timestamp(mapping.get("timestamp")) if mapping else None,
        session_id=text_field("sessionId"),
        version=text_field("version"),
        cwd=text_field("cwd"),
        turn_projection_error=projection_error,
        turn=projected_turn,
    )
244
+
245
+
246
def list_project_dirs(projects_dir: Path) -> list[str]:
    """Return the sorted names of the immediate subdirectories of *projects_dir*.

    This is a best-effort listing and never raises ``OSError``:

    * if *projects_dir* is missing, is not a directory, or cannot be read,
      an empty list is returned;
    * entries whose type check fails are skipped;
    * symlinks are not followed, so a symlink pointing at a directory is
      not reported.

    Args:
        projects_dir: Directory whose child directories are listed.

    Returns:
        Child directory names (not full paths), sorted ascending.
    """
    try:
        if not projects_dir.is_dir():
            return []
        # The context manager closes the directory handle deterministically,
        # even when iteration fails part-way through.
        with os.scandir(projects_dir) as scanner:
            entries = list(scanner)
    except OSError:
        return []

    project_dirs: list[str] = []
    for entry in entries:
        try:
            if entry.is_dir(follow_symlinks=False):
                project_dirs.append(entry.name)
        except OSError:
            # Entry type could not be determined; skip it.
            continue

    project_dirs.sort()
    return project_dirs
268
+
269
+
270
def _jsonl_conversation_id(entry: "os.DirEntry[str]") -> str | None:
    """Return the conversation id for a regular ``.jsonl`` file entry.

    The id is the file name without its final suffix. ``None`` is returned
    when the entry is not a regular file (symlinks are not followed), when
    its name does not end in ``.jsonl`` (case-insensitive), or when the
    resulting stem is empty.

    Raises:
        OSError: If the entry's file type cannot be determined.
    """
    if not entry.is_file(follow_symlinks=False):
        return None
    if not entry.name.lower().endswith(".jsonl"):
        return None
    return Path(entry.name).stem or None


def _list_project_conversation_file_refs(
    projects_path: Path,
    project: str,
) -> list[ConversationFileRef]:
    """List the conversation transcript files of one project directory.

    Two locations are scanned, neither recursively:

    * ``<projects_path>/<project>/*.jsonl`` -- recorded with
      ``parent_conversation_id=None``;
    * ``<projects_path>/<project>/<dir>/subagents/*.jsonl`` for every child
      directory ``<dir>`` -- recorded with ``parent_conversation_id=<dir>``.

    Unreadable directories are logged as warnings and skipped; entries whose
    type check fails are skipped silently.

    Args:
        projects_path: Root directory that contains the project directories.
        project: Name of the project directory below *projects_path*.

    Returns:
        The discovered file references, sorted by ``file_path``.
    """
    project_dir = projects_path / project

    try:
        # Context manager: release the directory handle even if listing fails.
        with os.scandir(project_dir) as scanner:
            entries = list(scanner)
    except OSError:
        LOGGER.warning(
            "claude_local.skipped_project reason=%s project_dir=%s",
            "unreadable_project_dir",
            project_dir,
        )
        return []

    refs: list[ConversationFileRef] = []
    parent_candidates: list[str] = []

    for entry in entries:
        try:
            conversation_id = _jsonl_conversation_id(entry)
            # Only non-transcript entries are considered as possible parents.
            is_parent_candidate = conversation_id is None and entry.is_dir(
                follow_symlinks=False
            )
        except OSError:
            continue

        if conversation_id is not None:
            refs.append(
                ConversationFileRef(
                    file_path=entry.path,
                    project=project,
                    conversation_id=conversation_id,
                    parent_conversation_id=None,
                )
            )
        elif is_parent_candidate:
            parent_candidates.append(entry.name)

    for parent_conversation_id in sorted(parent_candidates):
        subagents_dir = project_dir / parent_conversation_id / "subagents"
        try:
            # Most child directories have no "subagents" folder; that is not
            # worth a warning, so it is skipped quietly.
            if not subagents_dir.is_dir():
                continue
        except OSError:
            continue

        try:
            with os.scandir(subagents_dir) as scanner:
                subagent_entries = list(scanner)
        except OSError:
            LOGGER.warning(
                "claude_local.skipped_subagents_dir reason=%s subagents_dir=%s",
                "unreadable_subagents_dir",
                subagents_dir,
            )
            continue

        for subagent_entry in subagent_entries:
            try:
                conversation_id = _jsonl_conversation_id(subagent_entry)
            except OSError:
                continue
            if conversation_id is None:
                continue

            refs.append(
                ConversationFileRef(
                    file_path=subagent_entry.path,
                    project=project,
                    conversation_id=conversation_id,
                    parent_conversation_id=parent_conversation_id,
                )
            )

    refs.sort(key=lambda ref: ref.file_path)
    return refs
352
+
353
+
354
def list_conversation_file_refs(
    projects_dir: Path,
    project_dirs: list[str],
) -> list[ConversationFileRef]:
    """Collect the conversation file references of every listed project.

    Each name in *project_dirs* is scanned below *projects_dir* via
    ``_list_project_conversation_file_refs``; the combined result is returned
    as one list ordered by ``file_path``.
    """
    return sorted(
        (
            found
            for project_name in project_dirs
            for found in _list_project_conversation_file_refs(
                projects_dir, project_name
            )
        ),
        key=lambda found: found.file_path,
    )
363
+
364
+
365
def scan_claude_snapshot(projects_dir: Path) -> ParsedClaudeSnapshot:
    """Scan *projects_dir* and parse every conversation transcript found.

    Project directories are listed first, then their conversation files.
    Each file is read line by line; every line is JSON-decoded and turned
    into a ``ParsedLine``. A file for which ``_read_jsonl_lines`` returns
    ``None`` is logged as unreadable and left out of the snapshot.

    Per file, the result records all parsed lines, the turns that were
    projected successfully, and how many lines had type ``"user"`` and
    ``"assistant"`` respectively.
    """
    project_dirs = list_project_dirs(projects_dir)
    conversations: list[ParsedConversationFile] = []

    for ref in list_conversation_file_refs(projects_dir, project_dirs):
        indexed_lines = _read_jsonl_lines(ref.file_path)
        if indexed_lines is None:
            LOGGER.warning(
                "claude_local.skipped_file reason=%s file_path=%s project=%s conversation_id=%s",
                "unreadable_file",
                ref.file_path,
                ref.project,
                ref.conversation_id,
            )
            continue

        parsed_lines: list[ParsedLine] = []
        for index, text in indexed_lines:
            decoded, decode_error = parse_json_line(text)
            parsed_lines.append(
                _build_line(
                    raw_line=text,
                    payload=decoded,
                    parse_error=decode_error,
                    file_path=ref.file_path,
                    line_index=index,
                )
            )

        # Derived views over the parsed lines, in file order.
        projected_turns: list[ValidatedTurn] = [
            parsed.turn for parsed in parsed_lines if parsed.turn is not None
        ]
        user_total = sum(1 for parsed in parsed_lines if parsed.type == "user")
        assistant_total = sum(
            1 for parsed in parsed_lines if parsed.type == "assistant"
        )

        conversations.append(
            ParsedConversationFile(
                file_path=ref.file_path,
                project=ref.project,
                conversation_id=ref.conversation_id,
                parent_conversation_id=ref.parent_conversation_id,
                file_mtime=get_file_mtime(ref.file_path),
                lines=parsed_lines,
                turns=projected_turns,
                user_message_count=user_total,
                assistant_message_count=assistant_total,
            )
        )

    return ParsedClaudeSnapshot(project_dirs=project_dirs, conversations=conversations)