contextbase-plugin-claude-local 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextbase_plugin_claude_local-0.2.9.dist-info/METADATA +12 -0
- contextbase_plugin_claude_local-0.2.9.dist-info/RECORD +17 -0
- contextbase_plugin_claude_local-0.2.9.dist-info/WHEEL +4 -0
- plugin_claude_local/__init__.py +0 -0
- plugin_claude_local/binding_config.py +13 -0
- plugin_claude_local/component.py +122 -0
- plugin_claude_local/defs/__init__.py +0 -0
- plugin_claude_local/defs/defs.yaml +1 -0
- plugin_claude_local/models/__init__.py +1 -0
- plugin_claude_local/models/ctx.py +89 -0
- plugin_claude_local/models/ingress.py +110 -0
- plugin_claude_local/models/translators.py +208 -0
- plugin_claude_local/plugin.json +7 -0
- plugin_claude_local/sources/__init__.py +0 -0
- plugin_claude_local/sources/snapshot.py +119 -0
- plugin_claude_local/utils/__init__.py +1 -0
- plugin_claude_local/utils/parse.py +420 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: contextbase-plugin-claude-local
|
|
3
|
+
Version: 0.2.9
|
|
4
|
+
Summary: Claude local plugin for ContextBase
|
|
5
|
+
Author: Alizain Feerasta
|
|
6
|
+
Author-email: Alizain Feerasta <alizain.feerasta@gmail.com>
|
|
7
|
+
Requires-Dist: contextbase-shared-plugins==0.2.9
|
|
8
|
+
Requires-Dist: dagster==1.12.14
|
|
9
|
+
Requires-Dist: dagster-dlt==0.28.14
|
|
10
|
+
Requires-Dist: dlt>=1.26.0
|
|
11
|
+
Requires-Dist: pydantic>=2.12.0
|
|
12
|
+
Requires-Python: >=3.14, <3.15
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
plugin_claude_local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
plugin_claude_local/binding_config.py,sha256=a4cpuKyuUPQVqIqIMCMFy2meKLkk-sZjRcb3b6BOV7A,337
|
|
3
|
+
plugin_claude_local/component.py,sha256=9m-42ISkOrwsx0Jtxa8zVL-dDOSJfaAT4P1DVQN-7Ug,4120
|
|
4
|
+
plugin_claude_local/defs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
plugin_claude_local/defs/defs.yaml,sha256=eBgWLlMQShBV205vMyw-mhA6rS9-A4pu8PnBvPsdNbw,61
|
|
6
|
+
plugin_claude_local/models/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
7
|
+
plugin_claude_local/models/ctx.py,sha256=mDOde7jA0IQzD0BMHoPA0CG4ijd3C4MDC1nHhrUrD4M,2605
|
|
8
|
+
plugin_claude_local/models/ingress.py,sha256=9zG4L5wSS3v8I-oeXKskZkuzrw27yQtGAD0RHKwVENs,3843
|
|
9
|
+
plugin_claude_local/models/translators.py,sha256=i9HBM0wBZBWfUkk4HMoa8ad7FwYNOa-87EWMCObgJQo,7506
|
|
10
|
+
plugin_claude_local/plugin.json,sha256=j_wDEBQ05D_7oMz5wXWUZz-BHhjDxMazubGTzATbx6A,85
|
|
11
|
+
plugin_claude_local/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
plugin_claude_local/sources/snapshot.py,sha256=1MjTA7b3LfjAo2ldvX3DXXBvnBSqMsz8mFHZjX36bfE,3729
|
|
13
|
+
plugin_claude_local/utils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
14
|
+
plugin_claude_local/utils/parse.py,sha256=d6XtBzUksY-q1thsirmSQhJOUFad2GwRu-S7XMgmTss,12848
|
|
15
|
+
contextbase_plugin_claude_local-0.2.9.dist-info/WHEEL,sha256=i9aSRDivn5iP9LaR1BLQX2GNAuriQWPsFwbbWygTX2k,81
|
|
16
|
+
contextbase_plugin_claude_local-0.2.9.dist-info/METADATA,sha256=00_zPCVDl70-NkhIIHjB5mnbcWSSKmmHGwpVx5J-49U,412
|
|
17
|
+
contextbase_plugin_claude_local-0.2.9.dist-info/RECORD,,
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import Field
|
|
6
|
+
|
|
7
|
+
from shared_plugins.bindings import BaseBindingConfigModel, ResolvedPath
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _default_projects_dir() -> Path:
    """Return the ``.claude/projects`` directory under the user's home."""
    return Path.home() / ".claude" / "projects"


# Binding configuration for the Claude local plugin.
class ClaudeLocalBindingConfig(BaseBindingConfigModel):
    # Directory handed to the snapshot scanner. The default is computed when
    # the model is instantiated (not at import time), so it reflects the home
    # directory of the process that actually loads the binding.
    projects_dir: ResolvedPath = Field(default_factory=_default_projects_dir)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import dagster as dg
|
|
2
|
+
from dagster import AssetExecutionContext
|
|
3
|
+
from dagster_dlt import DagsterDltResource
|
|
4
|
+
from shared_plugins.automation import non_overlapping_automation_condition
|
|
5
|
+
from shared_plugins.bindings import parse_binding_config
|
|
6
|
+
from shared_plugins.control_plane import ControlPlaneClient
|
|
7
|
+
from shared_plugins.dlt import resolve_partition_binding, run_dlt_pipeline
|
|
8
|
+
from shared_plugins.naming import (
|
|
9
|
+
dagster_asset_group_name,
|
|
10
|
+
dagster_asset_tags,
|
|
11
|
+
dagster_dlt_asset_key,
|
|
12
|
+
dagster_partition_def_name,
|
|
13
|
+
dagster_pool_name,
|
|
14
|
+
dlt_source_name,
|
|
15
|
+
plugin_id_from_module,
|
|
16
|
+
)
|
|
17
|
+
from shared_plugins.resources import DLT_RESOURCE
|
|
18
|
+
|
|
19
|
+
from .binding_config import ClaudeLocalBindingConfig
|
|
20
|
+
from .sources.snapshot import claude_local_snapshot_source
|
|
21
|
+
|
|
22
|
+
# Plugin identifier computed by the shared helper from this module's __file__.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job label for the snapshot pipeline; forwarded to run_dlt_pipeline as job_name.
SNAPSHOT_JOB = "snapshot"
# dlt source name for the snapshot job; used below to build Dagster asset keys.
SNAPSHOT_SOURCE_NAME = dlt_source_name(PLUGIN_ID, SNAPSHOT_JOB)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _build_snapshot_specs(
    partitions_def: dg.PartitionsDefinition,
    automation_condition: dg.AutomationCondition,
) -> list[dg.AssetSpec]:
    """Build the asset specs for the snapshot job's four dlt resources.

    The specs form a linear dependency chain
    (project -> conversation -> line -> turn) and all share the same group,
    tags, automation condition, and partitions definition.
    """
    common = {
        "group_name": dagster_asset_group_name(PLUGIN_ID),
        "tags": dagster_asset_tags(PLUGIN_ID),
        "automation_condition": automation_condition,
        "partitions_def": partitions_def,
    }

    # Resolve every asset key first, in chain order.
    chain = [
        dagster_dlt_asset_key(SNAPSHOT_SOURCE_NAME, resource)
        for resource in ("project", "conversation", "line", "turn")
    ]

    # The head of the chain has no upstream; every later asset depends on the
    # key immediately before it.
    head = dg.AssetSpec(key=chain[0], **common)
    downstream = [
        dg.AssetSpec(key=key, deps=[upstream], **common)
        for upstream, key in zip(chain, chain[1:])
    ]
    return [head, *downstream]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Dagster component that wires the Claude local snapshot job: one partitioned
# multi-asset, the sensor that evaluates its automation conditions, and the
# dlt resource it needs.
class ClaudeLocalSyncComponent(dg.Component):
    def build_defs(self, context: dg.ComponentLoadContext) -> dg.Definitions:
        """Return the Dagster definitions for the Claude local snapshot job."""
        # Dynamic partitions; the asset body resolves the current partition to
        # a binding via resolve_partition_binding.
        partitions_def = dg.DynamicPartitionsDefinition(
            name=dagster_partition_def_name(PLUGIN_ID)
        )

        # Materialize when a partition is missing, or on the */15 cron
        # (every 15 minutes). NOTE(review): non_overlapping_automation_condition
        # is a shared helper; presumably it suppresses requests while a run is
        # already in flight -- confirm against shared_plugins.automation.
        snapshot_specs = _build_snapshot_specs(
            partitions_def=partitions_def,
            automation_condition=non_overlapping_automation_condition(
                dg.AutomationCondition.on_missing()
                | dg.AutomationCondition.on_cron("*/15 * * * *")
            ),
        )

        @dg.multi_asset(
            specs=snapshot_specs,
            can_subset=True,
            name="claude_local_snapshot",
            pool=dagster_pool_name(PLUGIN_ID),
        )
        def claude_local_snapshot_assets(
            context: AssetExecutionContext,
            dlt_resource: DagsterDltResource,
            control_plane: dg.ResourceParam[ControlPlaneClient],
        ):
            # Parameter names double as Dagster resource keys; do not rename.
            # Look up the binding for the partition being materialized.
            binding = resolve_partition_binding(
                context=context,
                control_plane=control_plane,
                plugin_id=PLUGIN_ID,
            )
            binding_id = str(binding.binding_id)
            cfg = parse_binding_config(binding, ClaudeLocalBindingConfig)

            # Build the dlt source for this binding and stream the pipeline's
            # results back to Dagster.
            source = claude_local_snapshot_source(binding_id, cfg)
            yield from run_dlt_pipeline(
                context=context,
                dlt_resource=dlt_resource,
                source=source,
                plugin_id=PLUGIN_ID,
                binding_id=binding_id,
                job_name=SNAPSHOT_JOB,
            )

        # Sensor that evaluates the automation conditions for the snapshot
        # assets; enabled by default with a 30 second minimum interval.
        automation_sensor = dg.AutomationConditionSensorDefinition(
            name="claude_local_automation_sensor",
            target=dg.AssetSelection.assets(claude_local_snapshot_assets),
            default_status=dg.DefaultSensorStatus.RUNNING,
            minimum_interval_seconds=30,
        )

        # NOTE(review): only "dlt_resource" is registered here; the
        # "control_plane" resource required by the asset is presumably
        # supplied by the hosting deployment -- confirm.
        return dg.Definitions(
            assets=[claude_local_snapshot_assets],
            sensors=[automation_sensor],
            resources={
                "dlt_resource": DLT_RESOURCE,
            },
        )
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
type: plugin_claude_local.component.ClaudeLocalSyncComponent
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import AwareDatetime
|
|
6
|
+
from shared_plugins.models import CtxModel, IdStr, NonNegativeInt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Row model for the "project" resource: one row per distinct project
# directory name found by the snapshot scan.
class ProjectRow(CtxModel):
    # Directory name as reported in the snapshot's project_dirs; together with
    # the binding id it forms the merge primary key in sources/snapshot.py.
    encoded_dir: IdStr
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Row model for the "conversation" resource: one row per parsed JSONL file.
class ConversationRow(CtxModel):
    # Populated from ParsedConversationFile.conversation_id.
    jsonl_stem: IdStr
    # Populated from ParsedConversationFile.project.
    project_dir: IdStr
    # Parent conversation id, or "" when there is none. Kept as an empty
    # string rather than None because it is part of the merge primary key.
    jsonl_parent_dir: str = ""
    jsonl_path: IdStr
    # File modification time; None when it could not be read.
    jsonl_mtime: AwareDatetime | None = None
    user_message_count: NonNegativeInt
    assistant_message_count: NonNegativeInt
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Row model for the "line" resource: one row per non-empty JSONL line,
# whether or not it parsed as JSON.
class LineRow(CtxModel):
    # File identity plus line_index form the merge primary key (with the
    # binding id) declared in sources/snapshot.py.
    project_dir: IdStr
    jsonl_stem: IdStr
    # Parent conversation id, or "" when there is none.
    jsonl_parent_dir: str = ""
    line_index: NonNegativeInt
    # Envelope fields copied from the parsed line; all optional because a line
    # may be malformed or may not carry them.
    type: str | None = None
    subtype: str | None = None
    uuid: str | None = None
    parent_uuid: str | None = None
    timestamp: AwareDatetime | None = None
    session_id: str | None = None
    version: str | None = None
    cwd: str | None = None
    # Canonical JSONL line text, with trailing CR/LF removed by the parser.
    raw_line: str
    # Parsed JSON value for querying. Strings in nested structures may have
    # embedded null bytes removed before persistence because Postgres rejects
    # \u0000 in text/jsonb.
    payload: Any = None
    # Diagnostic text copied from the parsed line; None when nothing was
    # recorded for that stage.
    parse_error: str | None = None
    turn_projection_error: str | None = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Row model for the "turn" resource: the curated projection of a validated
# user or assistant transcript record. User-only and assistant-only columns
# are optional so a single table can hold both record types.
class TurnRow(CtxModel):
    # -- file metadata --
    project_dir: IdStr
    jsonl_stem: IdStr
    # Parent conversation id, or "" when there is none.
    jsonl_parent_dir: str = ""
    line_index: NonNegativeInt

    # -- event envelope (union of user + assistant event fields) --
    type: str
    uuid: IdStr
    parent_uuid: str | None = None
    timestamp: AwareDatetime
    is_sidechain: bool
    user_type: str
    cwd: str
    session_id: str
    version: str
    git_branch: str
    slug: str | None = None
    agent_id: str | None = None
    entrypoint: str | None = None

    # -- user-only fields --
    prompt_id: str | None = None
    is_meta: bool | None = None
    tool_use_result: Any = None  # jsonb: dict | list | str | None
    source_tool_use_id: str | None = None
    source_tool_assistant_uuid: str | None = None
    thinking_metadata: Any = None  # jsonb: dict | None
    permission_mode: str | None = None
    todos: Any = None  # jsonb: list | None
    is_visible_in_transcript_only: bool | None = None
    is_compact_summary: bool | None = None
    plan_content: str | None = None

    # -- assistant-only fields --
    request_id: str | None = None
    error: str | None = None
    is_api_error_message: bool | None = None
    api_error: str | None = None

    # -- message content / top-level drift (jsonb) --
    # message is the raw message envelope; turn_extra holds top-level source
    # keys that were not promoted to a first-class column.
    message: dict[str, Any]
    turn_extra: dict[str, Any] | None = None
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import AwareDatetime, Field
|
|
6
|
+
from shared_plugins.models import IngressModel
|
|
7
|
+
|
|
8
|
+
# These key sets define the curated transcript projection boundary.
|
|
9
|
+
# parse.py uses them to split a raw user/assistant record into:
|
|
10
|
+
# 1. first-class turn columns we intentionally promote, and
|
|
11
|
+
# 2. turn_extra for every other top-level key.
|
|
12
|
+
# Add a key here only when we want a durable query column in claude_local.turn.
|
|
13
|
+
# Envelope keys shared by user and assistant records.
COMMON_TRANSCRIPT_SOURCE_KEYS = frozenset(
    (
        "type",
        "uuid",
        "parentUuid",
        "timestamp",
        "message",
        "isSidechain",
        "userType",
        "cwd",
        "sessionId",
        "version",
        "gitBranch",
        "slug",
        "agentId",
        "entrypoint",
    )
)

# User records: the common envelope plus the user-only promoted keys.
USER_TRANSCRIPT_SOURCE_KEYS = COMMON_TRANSCRIPT_SOURCE_KEYS.union(
    (
        "promptId",
        "isMeta",
        "toolUseResult",
        "sourceToolUseID",
        "sourceToolAssistantUUID",
        "thinkingMetadata",
        "permissionMode",
        "todos",
        "isVisibleInTranscriptOnly",
        "isCompactSummary",
        "planContent",
    )
)

# Assistant records: the common envelope plus the assistant-only promoted keys.
ASSISTANT_TRANSCRIPT_SOURCE_KEYS = COMMON_TRANSCRIPT_SOURCE_KEYS.union(
    (
        "requestId",
        "error",
        "isApiErrorMessage",
        "apiError",
    )
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Shared envelope for user and assistant transcript records. Field aliases
# map the camelCase source keys onto snake_case attributes.
class ClaudeTranscriptTurnIngress(IngressModel):
    # This model is intentionally narrower than Claude's full local JSONL schema.
    # It validates the transcript projection that claude_local.turn promises, not
    # every nested field Claude may emit in the raw source log.
    uuid: str
    # No default: the key must be present in the record, though its value may
    # be null.
    parent_uuid: str | None = Field(alias="parentUuid")
    timestamp: AwareDatetime
    # Keep the message envelope as raw JSON so future nested drift stays queryable
    # without forcing ingress model churn for every new Claude field.
    message: dict[str, Any]
    is_sidechain: bool = Field(alias="isSidechain")
    user_type: str = Field(alias="userType")
    cwd: str
    session_id: str = Field(alias="sessionId")
    version: str
    git_branch: str = Field(alias="gitBranch")
    slug: str | None = None
    agent_id: str | None = Field(default=None, alias="agentId")
    entrypoint: str | None = None
    # Top-level transcript keys that exist in the raw record but are not promoted
    # to first-class columns land here so claude_local.turn does not drop them.
    turn_extra: dict[str, Any] | None = None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Ingress model for records whose "type" is "user". Every user-only field is
# optional and defaults to None.
class ClaudeUserTurnIngress(ClaudeTranscriptTurnIngress):
    # Discriminator: only the literal "user" validates.
    type: Literal["user"]
    prompt_id: str | None = Field(default=None, alias="promptId")
    is_meta: bool | None = Field(default=None, alias="isMeta")
    # Accepted as an object, an array, or a plain string.
    tool_use_result: dict[str, Any] | list[Any] | str | None = Field(
        default=None, alias="toolUseResult"
    )
    source_tool_use_id: str | None = Field(default=None, alias="sourceToolUseID")
    source_tool_assistant_uuid: str | None = Field(
        default=None, alias="sourceToolAssistantUUID"
    )
    thinking_metadata: dict[str, Any] | None = Field(
        default=None, alias="thinkingMetadata"
    )
    permission_mode: str | None = Field(default=None, alias="permissionMode")
    todos: list[Any] | None = None
    is_visible_in_transcript_only: bool | None = Field(
        default=None, alias="isVisibleInTranscriptOnly"
    )
    is_compact_summary: bool | None = Field(default=None, alias="isCompactSummary")
    plan_content: str | None = Field(default=None, alias="planContent")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Ingress model for records whose "type" is "assistant". Every
# assistant-only field is optional and defaults to None.
class ClaudeAssistantTurnIngress(ClaudeTranscriptTurnIngress):
    # Discriminator: only the literal "assistant" validates.
    type: Literal["assistant"]
    request_id: str | None = Field(default=None, alias="requestId")
    error: str | None = None
    is_api_error_message: bool | None = Field(default=None, alias="isApiErrorMessage")
    api_error: str | None = Field(default=None, alias="apiError")
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Iterator
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..utils.parse import ParsedConversationFile
|
|
7
|
+
from .ctx import ConversationRow, LineRow, ProjectRow, TurnRow
|
|
8
|
+
from .ingress import ClaudeAssistantTurnIngress, ClaudeUserTurnIngress
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _strip_null_bytes(value: Any) -> Any:
|
|
12
|
+
"""Recursively strip \\x00 null bytes from strings in a structure.
|
|
13
|
+
|
|
14
|
+
Postgres rejects \\u0000 in both text and jsonb columns.
|
|
15
|
+
|
|
16
|
+
This is intentionally applied only to structured values that must be written
|
|
17
|
+
to Postgres text/jsonb columns. It does not change LineRow.raw_line, which
|
|
18
|
+
remains the canonical decoded JSONL line text for fidelity/debugging.
|
|
19
|
+
"""
|
|
20
|
+
if isinstance(value, str):
|
|
21
|
+
return value.replace("\x00", "")
|
|
22
|
+
if isinstance(value, dict):
|
|
23
|
+
return {k: _strip_null_bytes(v) for k, v in value.items()}
|
|
24
|
+
if isinstance(value, list):
|
|
25
|
+
return [_strip_null_bytes(item) for item in value]
|
|
26
|
+
return value
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def projects_to_ctx_models(
    *,
    binding_id: str,
    project_dirs: Iterable[str],
) -> Iterator[ProjectRow]:
    """Yield one ``ProjectRow`` per distinct project directory.

    Duplicates are dropped; the first occurrence of each directory is kept and
    input order is preserved. Rows are produced lazily as the input is
    consumed.
    """
    emitted: set[str] = set()

    for directory in project_dirs:
        if directory not in emitted:
            emitted.add(directory)
            # Project rows carry no source timestamp.
            yield ProjectRow(
                ctx_binding_id=binding_id,
                ctx_source_updated_at=None,
                encoded_dir=directory,
            )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def conversations_to_ctx_models(
    *,
    binding_id: str,
    conversations: Iterable[ParsedConversationFile],
) -> Iterator[ConversationRow]:
    """Yield one ``ConversationRow`` per parsed conversation file.

    The file's modification time is used both as the row's source-updated
    timestamp and as ``jsonl_mtime``. A missing parent conversation id is
    stored as an empty string.
    """
    for parsed_file in conversations:
        modified_at = parsed_file.file_mtime
        parent_dir = parsed_file.parent_conversation_id or ""
        yield ConversationRow(
            ctx_binding_id=binding_id,
            ctx_source_updated_at=modified_at,
            jsonl_stem=parsed_file.conversation_id,
            project_dir=parsed_file.project,
            jsonl_parent_dir=parent_dir,
            jsonl_path=parsed_file.file_path,
            jsonl_mtime=modified_at,
            user_message_count=parsed_file.user_message_count,
            assistant_message_count=parsed_file.assistant_message_count,
        )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def lines_to_ctx_models(
    *,
    binding_id: str,
    conversations: Iterable[ParsedConversationFile],
) -> Iterator[LineRow]:
    """Yield one ``LineRow`` for every parsed line of every conversation.

    ``raw_line`` is passed through untouched; ``payload`` is the parsed JSON
    with null bytes stripped so it can be stored as jsonb.
    """
    for parsed_file in conversations:
        # File-level identity is the same for every line of this conversation.
        project_dir = parsed_file.project
        stem = parsed_file.conversation_id
        parent_dir = parsed_file.parent_conversation_id or ""

        for entry in parsed_file.lines:
            # payload is the queryable parsed JSON form of the source line, not
            # a byte-perfect mirror. Keep raw_line when exact decoded line text
            # matters, especially for malformed JSON or embedded \u0000 cases.
            if entry.payload is None:
                queryable_payload = None
            else:
                queryable_payload = _strip_null_bytes(entry.payload)

            yield LineRow(
                ctx_binding_id=binding_id,
                ctx_source_updated_at=entry.timestamp,
                project_dir=project_dir,
                jsonl_stem=stem,
                jsonl_parent_dir=parent_dir,
                line_index=entry.line_index,
                type=entry.type,
                subtype=entry.subtype,
                uuid=entry.uuid,
                parent_uuid=entry.parent_uuid,
                timestamp=entry.timestamp,
                session_id=entry.session_id,
                version=entry.version,
                cwd=entry.cwd,
                raw_line=entry.raw_line,
                payload=queryable_payload,
                parse_error=entry.parse_error,
                turn_projection_error=entry.turn_projection_error,
            )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _user_turn_to_ctx_model(
    *,
    binding_id: str,
    conversation: ParsedConversationFile,
    turn: ClaudeUserTurnIngress,
    line_index: int,
) -> TurnRow:
    """Project a validated user record onto a ``TurnRow``.

    claude_local.turn is a projection, not a source rewrite: message,
    tool_use_result, and turn_extra keep their shapes as-is apart from
    null-byte stripping.
    """
    # Where the record came from: binding, file identity, line position.
    provenance = {
        "ctx_binding_id": binding_id,
        "ctx_source_updated_at": turn.timestamp,
        "project_dir": conversation.project,
        "jsonl_stem": conversation.conversation_id,
        "jsonl_parent_dir": conversation.parent_conversation_id or "",
        "line_index": line_index,
    }
    # Envelope fields shared with assistant records.
    envelope = {
        "type": turn.type,
        "uuid": turn.uuid,
        "parent_uuid": turn.parent_uuid,
        "timestamp": turn.timestamp,
        "is_sidechain": turn.is_sidechain,
        "user_type": turn.user_type,
        "cwd": turn.cwd,
        "session_id": turn.session_id,
        "version": turn.version,
        "git_branch": turn.git_branch,
        "slug": turn.slug,
        "agent_id": turn.agent_id,
        "entrypoint": turn.entrypoint,
    }
    # Fields that only user records carry; structured ones are sanitized.
    user_only = {
        "prompt_id": turn.prompt_id,
        "is_meta": turn.is_meta,
        "tool_use_result": _strip_null_bytes(turn.tool_use_result),
        "source_tool_use_id": turn.source_tool_use_id,
        "source_tool_assistant_uuid": turn.source_tool_assistant_uuid,
        "thinking_metadata": _strip_null_bytes(turn.thinking_metadata),
        "permission_mode": turn.permission_mode,
        "todos": _strip_null_bytes(turn.todos),
        "is_visible_in_transcript_only": turn.is_visible_in_transcript_only,
        "is_compact_summary": turn.is_compact_summary,
        "plan_content": turn.plan_content,
    }
    return TurnRow(
        **provenance,
        **envelope,
        **user_only,
        message=_strip_null_bytes(turn.message),
        turn_extra=_strip_null_bytes(turn.turn_extra),
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _assistant_turn_to_ctx_model(
    *,
    binding_id: str,
    conversation: ParsedConversationFile,
    turn: ClaudeAssistantTurnIngress,
    line_index: int,
) -> TurnRow:
    """Project a validated assistant record onto a ``TurnRow``.

    The message and turn_extra structures stay raw (apart from null-byte
    stripping) so new nested Claude fields survive in the curated transcript
    table without bespoke translator logic.
    """
    # Where the record came from: binding, file identity, line position.
    provenance = {
        "ctx_binding_id": binding_id,
        "ctx_source_updated_at": turn.timestamp,
        "project_dir": conversation.project,
        "jsonl_stem": conversation.conversation_id,
        "jsonl_parent_dir": conversation.parent_conversation_id or "",
        "line_index": line_index,
    }
    # Envelope fields shared with user records.
    envelope = {
        "type": turn.type,
        "uuid": turn.uuid,
        "parent_uuid": turn.parent_uuid,
        "timestamp": turn.timestamp,
        "is_sidechain": turn.is_sidechain,
        "user_type": turn.user_type,
        "cwd": turn.cwd,
        "session_id": turn.session_id,
        "version": turn.version,
        "git_branch": turn.git_branch,
        "slug": turn.slug,
        "agent_id": turn.agent_id,
        "entrypoint": turn.entrypoint,
    }
    # Fields that only assistant records carry.
    assistant_only = {
        "request_id": turn.request_id,
        "error": turn.error,
        "is_api_error_message": turn.is_api_error_message,
        "api_error": turn.api_error,
    }
    return TurnRow(
        **provenance,
        **envelope,
        **assistant_only,
        message=_strip_null_bytes(turn.message),
        turn_extra=_strip_null_bytes(turn.turn_extra),
    )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def turns_to_ctx_models(
|
|
187
|
+
*,
|
|
188
|
+
binding_id: str,
|
|
189
|
+
conversations: Iterable[ParsedConversationFile],
|
|
190
|
+
) -> Iterator[TurnRow]:
|
|
191
|
+
for conversation in conversations:
|
|
192
|
+
for validated in conversation.turns:
|
|
193
|
+
ingress = validated.ingress
|
|
194
|
+
|
|
195
|
+
if isinstance(ingress, ClaudeUserTurnIngress):
|
|
196
|
+
yield _user_turn_to_ctx_model(
|
|
197
|
+
binding_id=binding_id,
|
|
198
|
+
conversation=conversation,
|
|
199
|
+
turn=ingress,
|
|
200
|
+
line_index=validated.line_index,
|
|
201
|
+
)
|
|
202
|
+
elif isinstance(ingress, ClaudeAssistantTurnIngress):
|
|
203
|
+
yield _assistant_turn_to_ctx_model(
|
|
204
|
+
binding_id=binding_id,
|
|
205
|
+
conversation=conversation,
|
|
206
|
+
turn=ingress,
|
|
207
|
+
line_index=validated.line_index,
|
|
208
|
+
)
|
|
File without changes
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import dlt
|
|
7
|
+
from shared_plugins.naming import (
|
|
8
|
+
dlt_resource_name,
|
|
9
|
+
dlt_source_name,
|
|
10
|
+
plugin_id_from_module,
|
|
11
|
+
)
|
|
12
|
+
from shared_plugins.resources import ctx_dlt_resource
|
|
13
|
+
|
|
14
|
+
from ..binding_config import ClaudeLocalBindingConfig
|
|
15
|
+
from ..models.ctx import ConversationRow, LineRow, ProjectRow, TurnRow
|
|
16
|
+
from ..models.translators import (
|
|
17
|
+
conversations_to_ctx_models,
|
|
18
|
+
lines_to_ctx_models,
|
|
19
|
+
projects_to_ctx_models,
|
|
20
|
+
turns_to_ctx_models,
|
|
21
|
+
)
|
|
22
|
+
from ..utils.parse import scan_claude_snapshot
|
|
23
|
+
|
|
24
|
+
# Plugin identifier computed by the shared helper from this module's __file__.
PLUGIN_ID = plugin_id_from_module(__file__)
# Job label used to derive the dlt source name below.
JOB = "snapshot"
# Every resource in this source is loaded with merge/upsert on its primary key.
MERGE_WRITE_DISPOSITION = {"disposition": "merge", "strategy": "upsert"}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# dlt source for one binding. It returns four resources -- project,
# conversation, line, turn -- that all read from a single snapshot scan.
@dlt.source(name=dlt_source_name(PLUGIN_ID, JOB))
def claude_local_snapshot_source(
    binding_id: str,
    cfg: ClaudeLocalBindingConfig,
) -> tuple[Any, ...]:
    # The scan runs once, when the source is constructed; the resources below
    # close over its result rather than rescanning.
    snapshot = scan_claude_snapshot(cfg.projects_dir)

    @ctx_dlt_resource(
        name=dlt_resource_name("project"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=("_ctx_binding_id", "encoded_dir"),
    )
    def project_resource() -> Iterator[ProjectRow]:
        yield from projects_to_ctx_models(
            binding_id=binding_id,
            project_dirs=snapshot.project_dirs,
        )

    # A conversation is identified by binding, project, parent dir, and stem.
    @ctx_dlt_resource(
        name=dlt_resource_name("conversation"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=(
            "_ctx_binding_id",
            "project_dir",
            "jsonl_parent_dir",
            "jsonl_stem",
        ),
    )
    def conversation_resource() -> Iterator[ConversationRow]:
        yield from conversations_to_ctx_models(
            binding_id=binding_id,
            conversations=snapshot.conversations,
        )

    # A line is identified by its conversation key plus line_index.
    @ctx_dlt_resource(
        name=dlt_resource_name("line"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=(
            "_ctx_binding_id",
            "project_dir",
            "jsonl_parent_dir",
            "jsonl_stem",
            "line_index",
        ),
        columns={
            "payload": {"data_type": "json"},
            # These columns are correctness-critical but may be all-null on a clean
            # corpus. Declare them explicitly so dlt does not omit them when it
            # cannot infer a type from the current load package.
            "parse_error": {"data_type": "text"},
            "turn_projection_error": {"data_type": "text"},
        },
    )
    def line_resource() -> Iterator[LineRow]:
        yield from lines_to_ctx_models(
            binding_id=binding_id,
            conversations=snapshot.conversations,
        )

    # Turns share the line primary key: a turn is the projection of one line.
    @ctx_dlt_resource(
        name=dlt_resource_name("turn"),
        write_disposition=MERGE_WRITE_DISPOSITION,
        primary_key=(
            "_ctx_binding_id",
            "project_dir",
            "jsonl_parent_dir",
            "jsonl_stem",
            "line_index",
        ),
        columns={
            # Preserve transcript substructures as JSON so nested Claude drift stays
            # queryable without creating new relational columns every release.
            "message": {"data_type": "json"},
            "tool_use_result": {"data_type": "json"},
            "thinking_metadata": {"data_type": "json"},
            "todos": {"data_type": "json"},
            "turn_extra": {"data_type": "json"},
        },
    )
    def turn_resource() -> Iterator[TurnRow]:
        yield from turns_to_ctx_models(
            binding_id=binding_id,
            conversations=snapshot.conversations,
        )

    # Listed parent-first, matching the dependency chain declared for the
    # Dagster asset specs in component.py.
    return (
        project_resource,
        conversation_resource,
        line_resource,
        turn_resource,
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from pydantic import ValidationError
|
|
12
|
+
from shared_plugins.models import format_validation_error
|
|
13
|
+
from shared_plugins.values import as_mapping, as_string, parse_utc_datetime_from_str
|
|
14
|
+
|
|
15
|
+
from ..models.ingress import (
|
|
16
|
+
ASSISTANT_TRANSCRIPT_SOURCE_KEYS,
|
|
17
|
+
USER_TRANSCRIPT_SOURCE_KEYS,
|
|
18
|
+
ClaudeAssistantTurnIngress,
|
|
19
|
+
ClaudeUserTurnIngress,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class ValidatedTurn:
    """A transcript record that passed ingress validation, with its position."""

    # Index of the source line within its JSONL file.
    line_index: int
    # The validated model; its class (user vs assistant) selects the translator.
    ingress: ClaudeUserTurnIngress | ClaudeAssistantTurnIngress
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True)
class ParsedLine:
    """One non-empty JSONL line plus whatever could be extracted from it."""

    line_index: int
    # The original non-empty JSONL line text with trailing line terminators removed.
    raw_line: str
    # Parsed JSON value; None when no payload was produced.
    payload: Any = None
    # Diagnostic text for a JSON parsing failure, if any.
    parse_error: str | None = None
    # Envelope fields lifted out of the payload; each stays None when absent.
    type: str | None = None
    subtype: str | None = None
    uuid: str | None = None
    parent_uuid: str | None = None
    timestamp: datetime | None = None
    session_id: str | None = None
    version: str | None = None
    cwd: str | None = None
    # Diagnostic text for a failed turn projection, if any.
    turn_projection_error: str | None = None
    # NOTE(review): presumably set only when the line validated as a
    # user/assistant turn -- confirm against the parsing code.
    turn: ValidatedTurn | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class ConversationFileRef:
    """Identity of a conversation JSONL file: path, project, and ids.

    NOTE(review): presumably produced by directory discovery before the file
    is read -- confirm against the scanning code.
    """

    file_path: str
    project: str
    conversation_id: str
    # None when the file has no parent conversation.
    parent_conversation_id: str | None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(frozen=True)
class ParsedConversationFile:
    """A conversation JSONL file after parsing: identity, lines, and turns."""

    file_path: str
    # Becomes project_dir on the emitted rows.
    project: str
    # Becomes jsonl_stem on the emitted rows.
    conversation_id: str
    # None when there is no parent; the translators store "" in that case.
    parent_conversation_id: str | None
    # File modification time; None when it could not be determined.
    file_mtime: datetime | None
    # Every parsed line, including ones that failed to parse.
    lines: list[ParsedLine]
    # Only the lines that validated as user/assistant turns.
    turns: list[ValidatedTurn]
    user_message_count: int
    assistant_message_count: int
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass(frozen=True)
class ParsedClaudeSnapshot:
    """Result of one scan of the projects directory."""

    # Project directory names; the project translator de-duplicates them.
    project_dirs: list[str]
    # One entry per parsed conversation file.
    conversations: list[ParsedConversationFile]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def parse_json_line(line: str) -> tuple[Any | None, str | None]:
|
|
78
|
+
try:
|
|
79
|
+
return json.loads(line), None
|
|
80
|
+
except json.JSONDecodeError as exc:
|
|
81
|
+
return None, f"invalid_json: {exc.msg} (column {exc.colno})"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_file_mtime(file_path: str) -> datetime | None:
|
|
85
|
+
try:
|
|
86
|
+
stat_result = Path(file_path).stat()
|
|
87
|
+
except OSError:
|
|
88
|
+
return None
|
|
89
|
+
return datetime.fromtimestamp(stat_result.st_mtime, tz=timezone.utc)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _read_jsonl_lines(file_path: str) -> list[tuple[int, str]] | None:
|
|
93
|
+
"""Read non-empty JSONL lines as text.
|
|
94
|
+
|
|
95
|
+
This preserves the original decoded line contents, but strips trailing CR/LF
|
|
96
|
+
terminators and skips blank lines. It is therefore a canonical JSONL-line
|
|
97
|
+
capture, not a byte-for-byte file mirror.
|
|
98
|
+
"""
|
|
99
|
+
lines: list[tuple[int, str]] = []
|
|
100
|
+
|
|
101
|
+
try:
|
|
102
|
+
with open(file_path, "r", encoding="utf-8") as handle:
|
|
103
|
+
for line_index, raw_line in enumerate(handle):
|
|
104
|
+
line = raw_line.rstrip("\n").rstrip("\r")
|
|
105
|
+
if not line.strip():
|
|
106
|
+
continue
|
|
107
|
+
lines.append((line_index, line))
|
|
108
|
+
except (OSError, UnicodeError):
|
|
109
|
+
return None
|
|
110
|
+
|
|
111
|
+
return lines
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _extract_timestamp(value: object) -> datetime | None:
    """Turn a raw payload timestamp into a datetime, or None if unusable.

    The value is first coerced with ``as_string``. A ``None`` result, or a
    ``TypeError``/``ValueError`` from ``parse_utc_datetime_from_str``, both
    yield ``None``.
    """
    if (text := as_string(value)) is None:
        return None

    try:
        parsed = parse_utc_datetime_from_str(text)
    except (TypeError, ValueError):
        parsed = None
    return parsed
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _build_projection_payload(
    raw: dict[str, Any],
    *,
    source_keys: frozenset[str],
) -> dict[str, Any]:
    """Partition a raw transcript line into promoted keys and leftover drift.

    Keys listed in ``source_keys`` are copied to the top level of the result.
    All remaining keys are gathered under ``"turn_extra"``, which is only
    added when there is at least one such key. ``raw`` is not modified.
    """
    # Keeping unknown top-level keys in a side bucket lets ingress validation
    # stay strict without assuming Claude's full top-level shape is stable.
    promoted: dict[str, Any] = {}
    drift: dict[str, Any] = {}
    for name, item in raw.items():
        bucket = promoted if name in source_keys else drift
        bucket[name] = item

    if drift:
        promoted["turn_extra"] = drift
    return promoted
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _validate_projection_model(
    raw: dict[str, Any],
    *,
    file_path: str,
    line_index: int,
    source_keys: frozenset[str],
    model_type: type[ClaudeUserTurnIngress] | type[ClaudeAssistantTurnIngress],
) -> tuple[ValidatedTurn | None, str | None]:
    """Validate one raw transcript line against an ingress model.

    Args:
        raw: Decoded top-level mapping of the line.
        file_path: Source file, used only in the warning log.
        line_index: Index of the line in its file; logged and stored on the turn.
        source_keys: Top-level keys promoted into the model payload.
        model_type: Ingress model class to validate with.

    Returns:
        ``(turn, None)`` on success, or ``(None, error_text)`` when validation
        raises ``ValidationError``. The failure is also logged as a warning.
    """
    candidate = _build_projection_payload(raw, source_keys=source_keys)

    try:
        validated = model_type.model_validate(candidate)
    except ValidationError as exc:
        message = format_validation_error(exc)
        # Only the first 2000 characters of the raw line are logged, so a
        # large payload cannot flood the log.
        raw_excerpt = json.dumps(raw, default=str, ensure_ascii=False)[:2000]
        LOGGER.warning(
            "claude_local.turn_projection_failed type=%s file=%s line=%d error=%s raw=%s",
            raw.get("type"),
            file_path,
            line_index,
            message,
            raw_excerpt,
        )
        return None, message
    else:
        return ValidatedTurn(line_index=line_index, ingress=validated), None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _project_transcript_turn(
|
|
168
|
+
raw: dict[str, Any],
|
|
169
|
+
*,
|
|
170
|
+
file_path: str,
|
|
171
|
+
line_index: int,
|
|
172
|
+
) -> tuple[ValidatedTurn | None, str | None]:
|
|
173
|
+
# Only user/assistant lines participate in the curated transcript surface.
|
|
174
|
+
# Every other top-level Claude line type still lands in claude_local.line.
|
|
175
|
+
line_type = as_string(raw.get("type"))
|
|
176
|
+
|
|
177
|
+
if line_type == "user":
|
|
178
|
+
return _validate_projection_model(
|
|
179
|
+
raw,
|
|
180
|
+
file_path=file_path,
|
|
181
|
+
line_index=line_index,
|
|
182
|
+
source_keys=USER_TRANSCRIPT_SOURCE_KEYS,
|
|
183
|
+
model_type=ClaudeUserTurnIngress,
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
if line_type == "assistant":
|
|
187
|
+
return _validate_projection_model(
|
|
188
|
+
raw,
|
|
189
|
+
file_path=file_path,
|
|
190
|
+
line_index=line_index,
|
|
191
|
+
source_keys=ASSISTANT_TRANSCRIPT_SOURCE_KEYS,
|
|
192
|
+
model_type=ClaudeAssistantTurnIngress,
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
return None, None
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _build_line(
    *,
    raw_line: str,
    payload: Any,
    parse_error: str | None,
    file_path: str,
    line_index: int,
) -> ParsedLine:
    """Assemble the ``ParsedLine`` record for one transcript line.

    Well-known top-level fields are lifted out of the payload when it is a
    mapping, and user/assistant lines are additionally projected into a
    validated turn. For payloads that are not mappings (including failed
    parses) only the raw text, payload and parse error are populated.
    """
    mapping = as_mapping(payload)

    def text_field(key: str) -> str | None:
        # A missing or empty mapping yields None for every lifted field.
        if not mapping:
            return None
        return as_string(mapping.get(key))

    line_type = text_field("type")

    if mapping is None:
        projected: ValidatedTurn | None = None
        projection_error: str | None = None
    else:
        projected, projection_error = _project_transcript_turn(
            dict(mapping),
            file_path=file_path,
            line_index=line_index,
        )

    return ParsedLine(
        line_index=line_index,
        raw_line=raw_line,
        payload=payload,
        parse_error=parse_error,
        type=line_type,
        subtype=text_field("subtype"),
        uuid=text_field("uuid"),
        parent_uuid=text_field("parentUuid"),
        timestamp=_extract_timestamp(mapping.get("timestamp")) if mapping else None,
        session_id=text_field("sessionId"),
        version=text_field("version"),
        cwd=text_field("cwd"),
        turn_projection_error=projection_error,
        turn=projected,
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def list_project_dirs(projects_dir: Path) -> list[str]:
    """Return the sorted names of the sub-directories of ``projects_dir``.

    Symlinks are not followed, so a symlink to a directory is not reported.
    Entries whose type cannot be determined are skipped.

    Args:
        projects_dir: Directory whose immediate children are project folders.

    Returns:
        Sorted directory names. The list is empty when ``projects_dir`` is
        missing, is not a directory, or cannot be listed.
    """
    project_dirs: list[str] = []

    try:
        # The context manager releases the directory handle even if listing
        # fails part-way. scandir itself raises OSError for a missing or
        # non-directory path, so no separate is_dir() pre-check is needed.
        with os.scandir(projects_dir) as entries:
            for entry in entries:
                try:
                    if entry.is_dir(follow_symlinks=False):
                        project_dirs.append(entry.name)
                except OSError:
                    continue
    except OSError:
        return []

    project_dirs.sort()
    return project_dirs
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _conversation_id_from_jsonl_name(file_name: str) -> str | None:
    """Return ``file_name`` without its ``.jsonl`` suffix, or None if unusable."""
    suffix = ".jsonl"
    if file_name[-len(suffix):].lower() != suffix:
        return None
    # Slice rather than use Path.stem: Path(".jsonl").stem is ".jsonl", which
    # would turn a file with an empty base name into a conversation id.
    return file_name[: -len(suffix)] or None


def _scan_dir_entries(directory: Path) -> list[os.DirEntry]:
    """List a directory's entries, always closing the scandir handle."""
    with os.scandir(directory) as iterator:
        return list(iterator)


def _list_project_conversation_file_refs(
    projects_path: Path,
    project: str,
) -> list[ConversationFileRef]:
    """Discover the transcript files that belong to one project directory.

    Two locations are searched, without following symlinks:

    * ``<project>/<conversation>.jsonl`` — top-level conversations, reported
      with ``parent_conversation_id=None``;
    * ``<project>/<parent>/subagents/<conversation>.jsonl`` — subagent
      conversations, reported with ``parent_conversation_id=<parent>``.

    The ``.jsonl`` extension is matched case-insensitively. Files whose name
    is only the extension are ignored.

    Args:
        projects_path: Directory that contains the project directories.
        project: Name of the project directory to inspect.

    Returns:
        References sorted by file path. An unreadable project directory logs a
        warning and yields an empty list. An unreadable ``subagents``
        directory logs a warning and is skipped.
    """
    project_dir = projects_path / project

    try:
        entries = _scan_dir_entries(project_dir)
    except OSError:
        LOGGER.warning(
            "claude_local.skipped_project reason=%s project_dir=%s",
            "unreadable_project_dir",
            project_dir,
        )
        return []

    refs: list[ConversationFileRef] = []
    parent_candidates: list[str] = []

    for entry in entries:
        try:
            conversation_id = (
                _conversation_id_from_jsonl_name(entry.name)
                if entry.is_file(follow_symlinks=False)
                else None
            )
            if conversation_id is not None:
                refs.append(
                    ConversationFileRef(
                        file_path=entry.path,
                        project=project,
                        conversation_id=conversation_id,
                        parent_conversation_id=None,
                    )
                )
            elif entry.is_dir(follow_symlinks=False):
                # Any sub-directory may hold a "subagents" folder.
                parent_candidates.append(entry.name)
        except OSError:
            continue

    for parent_conversation_id in sorted(parent_candidates):
        subagents_dir = project_dir / parent_conversation_id / "subagents"
        try:
            if not subagents_dir.is_dir():
                continue
        except OSError:
            continue

        try:
            subagent_entries = _scan_dir_entries(subagents_dir)
        except OSError:
            LOGGER.warning(
                "claude_local.skipped_subagents_dir reason=%s subagents_dir=%s",
                "unreadable_subagents_dir",
                subagents_dir,
            )
            continue

        for subagent_entry in subagent_entries:
            try:
                if not subagent_entry.is_file(follow_symlinks=False):
                    continue
            except OSError:
                continue

            conversation_id = _conversation_id_from_jsonl_name(subagent_entry.name)
            if conversation_id is None:
                continue

            refs.append(
                ConversationFileRef(
                    file_path=subagent_entry.path,
                    project=project,
                    conversation_id=conversation_id,
                    parent_conversation_id=parent_conversation_id,
                )
            )

    refs.sort(key=lambda ref: ref.file_path)
    return refs
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def list_conversation_file_refs(
    projects_dir: Path,
    project_dirs: list[str],
) -> list[ConversationFileRef]:
    """Gather transcript file references for every listed project.

    Each name in ``project_dirs`` is resolved under ``projects_dir``. The
    combined references are returned ordered by file path.
    """
    collected = [
        ref
        for project in project_dirs
        for ref in _list_project_conversation_file_refs(projects_dir, project)
    ]
    return sorted(collected, key=lambda item: item.file_path)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def scan_claude_snapshot(projects_dir: Path) -> ParsedClaudeSnapshot:
    """Read every transcript file under ``projects_dir`` into one snapshot.

    Project directories are listed first, then each project's transcript
    files are read and parsed line by line. A file that cannot be read is
    logged as a warning and left out of the result. Undecodable lines are
    kept, with their parse error recorded on the line.
    """
    project_dirs = list_project_dirs(projects_dir)
    conversations: list[ParsedConversationFile] = []

    for ref in list_conversation_file_refs(projects_dir, project_dirs):
        indexed_lines = _read_jsonl_lines(ref.file_path)
        if indexed_lines is None:
            LOGGER.warning(
                "claude_local.skipped_file reason=%s file_path=%s project=%s conversation_id=%s",
                "unreadable_file",
                ref.file_path,
                ref.project,
                ref.conversation_id,
            )
            continue

        parsed_lines: list[ParsedLine] = []
        for index, text in indexed_lines:
            decoded, decode_error = parse_json_line(text)
            parsed_lines.append(
                _build_line(
                    raw_line=text,
                    payload=decoded,
                    parse_error=decode_error,
                    file_path=ref.file_path,
                    line_index=index,
                )
            )

        conversations.append(
            ParsedConversationFile(
                file_path=ref.file_path,
                project=ref.project,
                conversation_id=ref.conversation_id,
                parent_conversation_id=ref.parent_conversation_id,
                file_mtime=get_file_mtime(ref.file_path),
                lines=parsed_lines,
                turns=[
                    parsed.turn for parsed in parsed_lines if parsed.turn is not None
                ],
                # Counts follow the raw line type, so a user/assistant line
                # that failed validation is still counted.
                user_message_count=sum(
                    1 for parsed in parsed_lines if parsed.type == "user"
                ),
                assistant_message_count=sum(
                    1 for parsed in parsed_lines if parsed.type == "assistant"
                ),
            )
        )

    return ParsedClaudeSnapshot(project_dirs=project_dirs, conversations=conversations)
|