@gobing-ai/ts-llm-jsonl-importer 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/dist/hash.d.ts +5 -0
- package/dist/hash.d.ts.map +1 -0
- package/dist/hash.js +19 -0
- package/dist/importer.d.ts +6 -0
- package/dist/importer.d.ts.map +1 -0
- package/dist/importer.js +226 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/redaction.d.ts +8 -0
- package/dist/redaction.d.ts.map +1 -0
- package/dist/redaction.js +35 -0
- package/dist/schema-sql.d.ts +2 -0
- package/dist/schema-sql.d.ts.map +1 -0
- package/dist/schema-sql.js +82 -0
- package/dist/sources.d.ts +6 -0
- package/dist/sources.d.ts.map +1 -0
- package/dist/sources.js +87 -0
- package/dist/types.d.ts +76 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +0 -0
- package/package.json +60 -0
- package/src/hash.ts +21 -0
- package/src/importer.ts +342 -0
- package/src/index.ts +18 -0
- package/src/redaction.ts +44 -0
- package/src/schema-sql.ts +82 -0
- package/src/sources.ts +111 -0
- package/src/types.ts +89 -0
package/package.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@gobing-ai/ts-llm-jsonl-importer",
|
|
3
|
+
"version": "0.2.1",
|
|
4
|
+
"description": "@gobing-ai/ts-llm-jsonl-importer — Generic JSONL importer for LLM agent history files.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"typescript",
|
|
7
|
+
"jsonl",
|
|
8
|
+
"llm",
|
|
9
|
+
"history",
|
|
10
|
+
"etl"
|
|
11
|
+
],
|
|
12
|
+
"repository": {
|
|
13
|
+
"type": "git",
|
|
14
|
+
"url": "git+https://github.com/gobing-ai/ts-libs.git",
|
|
15
|
+
"directory": "packages/llm-jsonl-importer"
|
|
16
|
+
},
|
|
17
|
+
"author": "Robin Min <minlongbing@gmail.com>",
|
|
18
|
+
"contributors": [
|
|
19
|
+
"Robin Min <minlongbing@gmail.com>"
|
|
20
|
+
],
|
|
21
|
+
"type": "module",
|
|
22
|
+
"private": false,
|
|
23
|
+
"sideEffects": false,
|
|
24
|
+
"license": "Apache-2.0",
|
|
25
|
+
"main": "./dist/index.js",
|
|
26
|
+
"types": "./dist/index.d.ts",
|
|
27
|
+
"exports": {
|
|
28
|
+
".": {
|
|
29
|
+
"types": "./dist/index.d.ts",
|
|
30
|
+
"import": "./dist/index.js"
|
|
31
|
+
}
|
|
32
|
+
},
|
|
33
|
+
"files": [
|
|
34
|
+
"dist",
|
|
35
|
+
"src",
|
|
36
|
+
"README.md"
|
|
37
|
+
],
|
|
38
|
+
"scripts": {
|
|
39
|
+
"build": "tsc -p tsconfig.build.json && bun ../../scripts/builder.ts fix-dist-esm-extensions dist",
|
|
40
|
+
"test": "NODE_ENV=test bun test --coverage --coverage-dir=.coverage --reporter=dots",
|
|
41
|
+
"test:full": "NODE_ENV=test bun test --update-snapshots --coverage --coverage-dir=.coverage",
|
|
42
|
+
"typecheck": "tsc --noEmit",
|
|
43
|
+
"lint": "biome check . && bun run typecheck",
|
|
44
|
+
"format": "biome check . --write",
|
|
45
|
+
"check": "bun run lint && bun run test",
|
|
46
|
+
"prepublishOnly": "bun run build",
|
|
47
|
+
"release": "echo 'Manual publish is disabled. Releases go through GitHub Actions via Trusted Publishing — push a tag: git tag @gobing-ai/ts-llm-jsonl-importer-v<version> && git push --tags' && exit 1"
|
|
48
|
+
},
|
|
49
|
+
"dependencies": {
|
|
50
|
+
"@gobing-ai/ts-db": "workspace:*",
|
|
51
|
+
"@gobing-ai/ts-runtime": "workspace:*",
|
|
52
|
+
"zod": "^4.1.0"
|
|
53
|
+
},
|
|
54
|
+
"devDependencies": {
|
|
55
|
+
"@types/bun": "1.3.14"
|
|
56
|
+
},
|
|
57
|
+
"publishConfig": {
|
|
58
|
+
"access": "public"
|
|
59
|
+
}
|
|
60
|
+
}
|
package/src/hash.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
|
|
3
|
+
/**
 * Produce a deterministic JSON encoding of `value` in which the keys of every
 * object are emitted in sorted order, so records that differ only in key
 * insertion order serialize (and therefore hash) to the same text.
 *
 * Arrays keep their element order. Anything that is neither an array nor a
 * non-null object is handed to `JSON.stringify` unchanged.
 *
 * @param value - Value to serialize.
 * @returns The canonical JSON text.
 */
export function stableJson(value: unknown): string {
    // Primitives and null have no keys to order.
    if (value === null || typeof value !== 'object') {
        return JSON.stringify(value);
    }
    if (Array.isArray(value)) {
        const items = value.map((entry) => stableJson(entry));
        return '[' + items.join(',') + ']';
    }
    const record = value as Record<string, unknown>;
    const members: string[] = [];
    for (const key of Object.keys(record).sort()) {
        members.push(`${JSON.stringify(key)}:${stableJson(record[key])}`);
    }
    return `{${members.join(',')}}`;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Hash a record with SHA-256.
 *
 * The value is first serialized with `stableJson`, so key order does not
 * influence the digest.
 *
 * @param value - Record (or any JSON-like value) to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function sha256(value: unknown): string {
    const canonical = stableJson(value);
    const hasher = createHash('sha256');
    hasher.update(canonical);
    return hasher.digest('hex');
}
|
package/src/importer.ts
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
import { resolve } from 'node:path';
|
|
2
|
+
import { getFs, walkDir } from '@gobing-ai/ts-runtime';
|
|
3
|
+
import { sha256 } from './hash';
|
|
4
|
+
import { redactRecord } from './redaction';
|
|
5
|
+
import { HISTORY_IMPORT_SCHEMA_SQL } from './schema-sql';
|
|
6
|
+
import { getSourceDefinition } from './sources';
|
|
7
|
+
import type {
|
|
8
|
+
ImportIssue,
|
|
9
|
+
ImportOptions,
|
|
10
|
+
ImportResult,
|
|
11
|
+
JsonObject,
|
|
12
|
+
LlmJsonlSource,
|
|
13
|
+
SourceDefinition,
|
|
14
|
+
TransformContext,
|
|
15
|
+
} from './types';
|
|
16
|
+
|
|
17
|
+
/** One record produced by splitting a raw JSONL row, paired with the table it is written to. */
interface SplitRecord {
    /** Name of the table this record is inserted into. */
    readonly targetTable: string;
    /** Raw object that is subsequently normalized and validated. */
    readonly raw: JsonObject;
}

/** Row shape selected from `history_import_checkpoint` when a checkpoint is read. */
interface CheckpointRow {
    readonly last_imported_line: number;
}

/**
 * Table names are interpolated into SQL text, so they must match this pattern:
 * the `history_etl_` prefix followed only by lowercase letters and underscores.
 */
const VALID_TABLE_NAME = /^history_etl_[a-z_]+$/;
|
|
27
|
+
|
|
28
|
+
/**
 * Create the importer-owned tables in the target database.
 *
 * `HISTORY_IMPORT_SCHEMA_SQL` is split on `;` and every non-empty statement is
 * executed one after another, in order.
 *
 * @param db - Database handle the statements are executed against.
 */
export async function applyHistoryImportSchema(db: ImportOptions['db']): Promise<void> {
    const statements = HISTORY_IMPORT_SCHEMA_SQL.split(';')
        .map((statement) => statement.trim())
        .filter((statement) => statement.length > 0);

    for (const statement of statements) {
        await db.exec(statement);
    }
}
|
|
37
|
+
|
|
38
|
+
/**
 * Run the JSONL import pipeline for one source definition.
 *
 * For every discovered file the function reads all lines, skips blank lines and
 * lines at or below the stored checkpoint, parses each remaining line as a JSON
 * object, splits it into records, normalizes, validates and redacts every
 * record, and writes the ones that are not yet in the ledger.
 *
 * The schema is applied unconditionally, including when `options.dryRun` is set.
 *
 * @param source - Identifier of the built-in source to import.
 * @param options - Database handle, discovery roots/files, mode, dry-run flag,
 *   redaction rules and clock.
 * @returns Counters and the collected parse/validation issues for this run.
 */
export async function runJsonlImport(source: LlmJsonlSource, options: ImportOptions): Promise<ImportResult> {
    const definition = getSourceDefinition(source);
    await applyHistoryImportSchema(options.db);

    // Mode defaults to 'incremental' when the caller does not specify one.
    const mode = options.mode ?? 'incremental';
    const files = await discoverFiles(definition, options.roots, options.files);
    // A real (non-dry-run) full import clears the checkpoints of the discovered files first.
    if (mode === 'full' && !options.dryRun) {
        await resetCheckpoints(options.db, source, files);
    }

    const parseErrors: ImportIssue[] = [];
    const validationErrors: ImportIssue[] = [];
    let processedLines = 0;
    let importedRecords = 0;
    let skippedDuplicates = 0;
    let checkpointUpdates = 0;

    for (const file of files) {
        // Only incremental mode consults the stored checkpoint; otherwise every line is eligible.
        const checkpoint = mode === 'incremental' ? await readCheckpoint(options.db, source, file) : 0;
        // The whole file is read at once and split on LF or CRLF.
        const lines = (await getFs().readFile(file)).split(/\r?\n/);

        for (let index = 0; index < lines.length; index += 1) {
            // Line numbers are 1-based.
            const lineNumber = index + 1;
            const line = lines[index]?.trim();
            // Blank lines and already-checkpointed lines are skipped and not counted as processed.
            if (line === undefined || line.length === 0 || lineNumber <= checkpoint) continue;
            processedLines += 1;

            // A line that is not a JSON object is recorded in parseErrors and skipped.
            const raw = parseJsonLine(line, file, lineNumber, parseErrors);
            if (raw === undefined) continue;

            const splitRecords = splitRawRecord(definition, raw);
            // Becomes true once any record of this line passes validation.
            let lineSucceeded = false;

            for (let splitIndex = 0; splitIndex < splitRecords.length; splitIndex += 1) {
                const split = splitRecords[splitIndex];
                if (split === undefined) continue;
                const normalized = normalizeRecord(definition, split.raw, {
                    source,
                    sourceFile: file,
                    sourceLine: lineNumber,
                    splitIndex,
                });
                const parsed = definition.schema.safeParse(normalized);
                if (!parsed.success) {
                    // All issue messages for the record are joined into one reason string.
                    validationErrors.push({
                        sourceFile: file,
                        sourceLine: lineNumber,
                        reason: parsed.error.issues.map((issue) => issue.message).join('; '),
                    });
                    continue;
                }

                // Redaction happens before hashing, so the hash covers the redacted payload.
                const redacted = redactRecord(parsed.data, options.redactionRules);
                lineSucceeded = true;
                const recordHash = sha256({
                    source,
                    sourceFile: file,
                    sourceLine: lineNumber,
                    splitIndex,
                    record: redacted,
                });
                // A hash already present in the ledger counts as a duplicate and is not written again.
                if (await ledgerExists(options.db, recordHash)) {
                    skippedDuplicates += 1;
                    continue;
                }
                if (!options.dryRun) {
                    await insertRecord(
                        options.db,
                        split.targetTable,
                        recordHash,
                        file,
                        lineNumber,
                        splitIndex,
                        redacted,
                        options.now,
                    );
                    await insertLedger(
                        options.db,
                        recordHash,
                        source,
                        file,
                        lineNumber,
                        splitIndex,
                        split.targetTable,
                        options.now,
                    );
                }
                // Counted in dry-run mode as well, where nothing is written.
                importedRecords += 1;
            }

            // The checkpoint advances only for lines with at least one valid record, and never in dry-run.
            if (lineSucceeded && !options.dryRun) {
                await writeCheckpoint(options.db, source, file, lineNumber, options.now);
                checkpointUpdates += 1;
            }
        }
    }

    return {
        source,
        mode,
        scannedFiles: files.length,
        processedLines,
        importedRecords,
        skippedDuplicates,
        parseErrors,
        validationErrors,
        checkpointUpdates,
    };
}
|
|
148
|
+
|
|
149
|
+
/**
 * Parse one JSONL line into an object.
 *
 * @param line - Trimmed text of the line.
 * @param sourceFile - File the line came from (recorded on failure).
 * @param sourceLine - 1-based line number (recorded on failure).
 * @param parseErrors - Collector that receives an issue when the line is
 *   invalid JSON or is not a plain object.
 * @returns The parsed object, or `undefined` when an issue was recorded.
 */
function parseJsonLine(
    line: string,
    sourceFile: string,
    sourceLine: number,
    parseErrors: ImportIssue[],
): JsonObject | undefined {
    let value: unknown;
    try {
        value = JSON.parse(line);
    } catch (error) {
        const reason = error instanceof Error ? error.message : 'Invalid JSON';
        parseErrors.push({ sourceFile, sourceLine, reason });
        return undefined;
    }

    // Scalars, null and arrays are valid JSON but not valid rows.
    const isPlainObject = typeof value === 'object' && value !== null && !Array.isArray(value);
    if (!isPlainObject) {
        parseErrors.push({ sourceFile, sourceLine, reason: 'JSONL row must be an object' });
        return undefined;
    }
    return value as JsonObject;
}
|
|
167
|
+
|
|
168
|
+
/**
 * Turn one raw JSONL row into the records that will be imported from it.
 *
 * - `one-to-one`: the row itself is the only record.
 * - `custom`: the configured `split` callback supplies the records.
 * - otherwise: the array found under the configured field is expanded; each
 *   object element is merged over a copy of the parent row. When that field
 *   is not an array, the row itself is the only record.
 *
 * @param definition - Source definition providing the split configuration.
 * @param raw - Parsed JSONL row.
 * @returns The records with their (validated) target table names.
 */
function splitRawRecord(definition: SourceDefinition, raw: JsonObject): readonly SplitRecord[] {
    const defaultTable = targetTableFor(definition.targetTable);
    const config = definition.splitConfig;

    if (config.mode === 'one-to-one') {
        return [{ targetTable: defaultTable, raw }];
    }

    if (config.mode === 'custom') {
        const produced: SplitRecord[] = [];
        for (const entry of config.split(raw)) {
            produced.push({
                targetTable: targetTableFor(config.targetTable ?? definition.targetTable),
                raw: entry,
            });
        }
        return produced;
    }

    const nested = raw[config.field];
    if (!Array.isArray(nested)) {
        return [{ targetTable: defaultTable, raw }];
    }

    const expanded: SplitRecord[] = [];
    for (const entry of nested) {
        // Only plain objects can be merged over the parent row; other elements are dropped.
        if (entry === null || typeof entry !== 'object' || Array.isArray(entry)) continue;
        expanded.push({
            targetTable: targetTableFor(config.targetTable ?? definition.targetTable),
            raw: { ...raw, ...(entry as JsonObject) },
        });
    }
    return expanded;
}
|
|
193
|
+
|
|
194
|
+
/**
 * Build the normalized record for one raw record.
 *
 * 1. Raw keys are copied to their target keys according to `definition.fieldMap`.
 *    When several raw keys map to the same target, precedence follows the
 *    declaration order of `fieldMap`: the first key that is present with a
 *    non-nullish value wins, and later aliases only fill a target that is still
 *    `undefined` or `null`.
 * 2. Every transform in `definition.fieldTransforms` is applied to its target
 *    key, receiving the mapped value, the raw record and the context.
 * 3. Provenance fields (`source`, `source_file`, `source_line`, `split_index`)
 *    are written last, so they always reflect the context.
 *
 * @param definition - Source definition providing the field map and transforms.
 * @param raw - Raw record to normalize; it is not mutated.
 * @param context - Provenance of the record.
 * @returns A new normalized object.
 */
function normalizeRecord(definition: SourceDefinition, raw: JsonObject, context: TransformContext): JsonObject {
    const normalized: JsonObject = {};
    for (const [rawKey, targetKey] of Object.entries(definition.fieldMap)) {
        if (!(rawKey in raw)) continue;
        // Keep the value from the highest-priority alias instead of letting a later,
        // less specific alias (e.g. a session id) overwrite it.
        const existing = normalized[targetKey];
        if (existing === undefined || existing === null) {
            normalized[targetKey] = raw[rawKey];
        }
    }
    for (const [targetKey, transform] of Object.entries(definition.fieldTransforms)) {
        normalized[targetKey] = transform(normalized[targetKey], raw, context);
    }
    normalized.source = context.source;
    normalized.source_file = context.sourceFile;
    normalized.source_line = context.sourceLine;
    normalized.split_index = context.splitIndex;
    return normalized;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Determine which files an import run should read.
 *
 * When `files` is given and non-empty, exactly those paths are used: each is
 * resolved to an absolute path, duplicates are removed (so the same file is
 * never processed twice in one run), and the result is sorted.
 *
 * Otherwise every root (`roots`, or the definition's default roots) is
 * resolved and inspected: a root that is a file is included when it matches
 * the definition's file patterns; any other existing root is walked and all
 * matching files are included. Roots that do not exist are skipped.
 *
 * @param definition - Source definition providing default roots and file patterns.
 * @param roots - Optional roots that replace the definition's defaults.
 * @param files - Optional explicit file list that bypasses discovery.
 * @returns Sorted, de-duplicated file paths.
 */
async function discoverFiles(
    definition: SourceDefinition,
    roots: readonly string[] | undefined,
    files: readonly string[] | undefined,
): Promise<readonly string[]> {
    if (files !== undefined && files.length > 0) {
        // Different spellings of one path ('a.jsonl', './a.jsonl') resolve to the same file.
        return [...new Set(files.map((file) => resolve(file)))].sort();
    }

    const fs = getFs();
    const resolvedRoots = (roots ?? definition.defaultRoots).map((root) => resolve(root));
    const found = new Set<string>();
    for (const root of resolvedRoots) {
        if (!(await fs.exists(root))) continue;
        const stat = await fs.stat(root);
        if (stat === null) continue;
        if (stat.isFile()) {
            if (matchesPattern(root, definition.filePatterns)) found.add(root);
            continue;
        }
        for (const file of await walkDir(root)) {
            if (matchesPattern(file, definition.filePatterns)) found.add(file);
        }
    }
    return [...found].sort();
}
|
|
235
|
+
|
|
236
|
+
/**
 * Check whether `path` matches at least one pattern.
 *
 * Matching is suffix-based: a single leading `*` is removed from the pattern
 * and the remainder must be the end of the path (`*.jsonl` matches any path
 * ending in `.jsonl`).
 *
 * @param path - File path to test.
 * @param patterns - Patterns to test against.
 * @returns `true` when any pattern matches; `false` for an empty pattern list.
 */
function matchesPattern(path: string, patterns: readonly string[]): boolean {
    for (const pattern of patterns) {
        const suffix = pattern.startsWith('*') ? pattern.slice(1) : pattern;
        if (path.endsWith(suffix)) return true;
    }
    return false;
}
|
|
243
|
+
|
|
244
|
+
/**
 * Look up the last imported line for one source file.
 *
 * @param db - Database handle.
 * @param source - Source identifier the checkpoint belongs to.
 * @param sourceFile - File the checkpoint belongs to.
 * @returns The stored line number, or `0` when no checkpoint row exists.
 */
async function readCheckpoint(db: ImportOptions['db'], source: string, sourceFile: string): Promise<number> {
    const sql = 'SELECT last_imported_line FROM history_import_checkpoint WHERE source = ? AND source_file = ?';
    const row = await db.queryFirst<CheckpointRow>(sql, source, sourceFile);
    if (row === undefined || row === null) {
        return 0;
    }
    return row.last_imported_line ?? 0;
}
|
|
252
|
+
|
|
253
|
+
/**
 * Delete the checkpoint rows of the given files for one source.
 *
 * The deletes are issued one at a time, in the order of `files`.
 *
 * @param db - Database handle.
 * @param source - Source identifier whose checkpoints are removed.
 * @param files - Files whose checkpoints are removed.
 */
async function resetCheckpoints(db: ImportOptions['db'], source: string, files: readonly string[]): Promise<void> {
    const sql = 'DELETE FROM history_import_checkpoint WHERE source = ? AND source_file = ?';
    for (const sourceFile of files) {
        await db.run(sql, source, sourceFile);
    }
}
|
|
258
|
+
|
|
259
|
+
/**
 * Store `line` as the last imported line of a source file.
 *
 * The row is inserted, or updated in place when a checkpoint for the same
 * `(source, source_file)` pair already exists.
 *
 * @param db - Database handle.
 * @param source - Source identifier.
 * @param sourceFile - File the checkpoint belongs to.
 * @param line - 1-based line number to record.
 * @param now - Optional clock used for `updated_at`.
 */
async function writeCheckpoint(
    db: ImportOptions['db'],
    source: string,
    sourceFile: string,
    line: number,
    now: ImportOptions['now'],
): Promise<void> {
    const sql = [
        'INSERT INTO history_import_checkpoint (source, source_file, last_imported_line, updated_at)',
        'VALUES (?, ?, ?, ?)',
        'ON CONFLICT(source, source_file) DO UPDATE SET',
        'last_imported_line = excluded.last_imported_line,',
        'updated_at = excluded.updated_at',
    ].join('\n');
    const updatedAt = timestamp(now);
    await db.run(sql, source, sourceFile, line, updatedAt);
}
|
|
278
|
+
|
|
279
|
+
/**
 * Tell whether a record hash has already been written to the import ledger.
 *
 * @param db - Database handle.
 * @param recordHash - Hash to look up.
 * @returns `true` when a ledger row with that hash exists.
 */
async function ledgerExists(db: ImportOptions['db'], recordHash: string): Promise<boolean> {
    const sql = 'SELECT record_hash FROM history_import_ledger WHERE record_hash = ?';
    const row = await db.queryFirst<{ record_hash: string }>(sql, recordHash);
    // Covers both "no row" representations: undefined and null.
    return row != null;
}
|
|
286
|
+
|
|
287
|
+
/**
 * Insert one redacted record into its target table.
 *
 * The insert is idempotent on `record_hash` (the primary key of every target
 * table). The record and its ledger entry are written by two separate
 * statements, so an interrupted run can leave a record without a ledger row;
 * `ON CONFLICT ... DO NOTHING` lets the next run complete the ledger entry
 * instead of failing on the primary key.
 *
 * @param db - Database handle.
 * @param targetTable - Target table name; validated before it is interpolated.
 * @param recordHash - Hash identifying the record.
 * @param sourceFile - File the record came from.
 * @param sourceLine - 1-based line number of the record.
 * @param splitIndex - Index of the record within its line.
 * @param payload - Redacted record, stored as JSON text.
 * @param now - Optional clock used for `imported_at`.
 * @throws Error when `targetTable` is not a valid history ETL table name.
 */
async function insertRecord(
    db: ImportOptions['db'],
    targetTable: string,
    recordHash: string,
    sourceFile: string,
    sourceLine: number,
    splitIndex: number,
    payload: JsonObject,
    now: ImportOptions['now'],
): Promise<void> {
    // Table names cannot be bound as parameters, hence validation + interpolation.
    const table = targetTableFor(targetTable);
    const sql = [
        `INSERT INTO ${table} (record_hash, source_file, source_line, split_index, payload_json, imported_at)`,
        'VALUES (?, ?, ?, ?, ?, ?)',
        'ON CONFLICT(record_hash) DO NOTHING',
    ].join('\n');
    await db.run(sql, recordHash, sourceFile, sourceLine, splitIndex, JSON.stringify(payload), timestamp(now));
}
|
|
309
|
+
|
|
310
|
+
/**
 * Record an imported record in the ledger.
 *
 * @param db - Database handle.
 * @param recordHash - Hash identifying the record.
 * @param source - Source identifier.
 * @param sourceFile - File the record came from.
 * @param sourceLine - 1-based line number of the record.
 * @param splitIndex - Index of the record within its line.
 * @param targetTable - Table the record was written to.
 * @param now - Optional clock used for `imported_at`.
 */
async function insertLedger(
    db: ImportOptions['db'],
    recordHash: string,
    source: string,
    sourceFile: string,
    sourceLine: number,
    splitIndex: number,
    targetTable: string,
    now: ImportOptions['now'],
): Promise<void> {
    const sql = [
        'INSERT INTO history_import_ledger (record_hash, source, source_file, source_line, split_index, target_table, imported_at)',
        'VALUES (?, ?, ?, ?, ?, ?, ?)',
    ].join('\n');
    const importedAt = timestamp(now);
    await db.run(sql, recordHash, source, sourceFile, sourceLine, splitIndex, targetTable, importedAt);
}
|
|
332
|
+
|
|
333
|
+
/**
 * Validate a target table name before it is used in SQL text.
 *
 * @param table - Candidate table name.
 * @returns The same name when it matches `VALID_TABLE_NAME`.
 * @throws Error when the name does not match.
 */
function targetTableFor(table: string): string {
    if (VALID_TABLE_NAME.test(table)) {
        return table;
    }
    throw new Error(`Invalid history ETL target table: ${table}`);
}
|
|
339
|
+
|
|
340
|
+
/**
 * Produce the ISO-8601 timestamp used for written rows.
 *
 * @param now - Optional clock; when absent (or when it returns nothing) the
 *   current time is used.
 * @returns The instant formatted with `Date.prototype.toISOString`.
 */
function timestamp(now: ImportOptions['now']): string {
    const supplied = now == null ? undefined : now();
    const instant = supplied ?? new Date();
    return instant.toISOString();
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
export { sha256, stableJson } from './hash';
|
|
2
|
+
export { applyHistoryImportSchema, runJsonlImport } from './importer';
|
|
3
|
+
export { DEFAULT_REDACTION_RULES, redactRecord, redactValue } from './redaction';
|
|
4
|
+
export { HISTORY_IMPORT_SCHEMA_SQL } from './schema-sql';
|
|
5
|
+
export { getSourceDefinition, SOURCE_DEFINITIONS } from './sources';
|
|
6
|
+
export type {
|
|
7
|
+
FieldTransform,
|
|
8
|
+
ImportIssue,
|
|
9
|
+
ImportMode,
|
|
10
|
+
ImportOptions,
|
|
11
|
+
ImportResult,
|
|
12
|
+
JsonObject,
|
|
13
|
+
LlmJsonlSource,
|
|
14
|
+
RedactionRule,
|
|
15
|
+
SourceDefinition,
|
|
16
|
+
SplitConfig,
|
|
17
|
+
TransformContext,
|
|
18
|
+
} from './types';
|
package/src/redaction.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import type { JsonObject, RedactionRule } from './types';
|
|
2
|
+
|
|
3
|
+
/**
 * Default redaction rules for common token, key, and email shapes in agent logs.
 *
 * Rules are applied in array order, each one to the output of the previous.
 * All patterns carry the `g` flag so every occurrence in a string is replaced.
 */
export const DEFAULT_REDACTION_RULES: readonly RedactionRule[] = [
    {
        name: 'api-key',
        // Two separator families: hyphen-separated keys (sk-…, pk-…, xoxb-…) and
        // GitHub tokens, which use an underscore after the prefix
        // (ghp_, gho_, ghu_, ghs_, ghr_, github_pat_).
        pattern: /\b(?:(?:sk|pk|ghp|github_pat|xox[baprs])-|(?:gh[pousr]|github_pat)_)[-_a-zA-Z0-9]{12,}\b/g,
        replacement: '[REDACTED:token]',
    },
    {
        name: 'assignment-secret',
        // The optional quote after the key name also covers JSON-style text such as
        // "password": "value", not only bare assignments like password=value.
        pattern: /\b(?:api[_-]?key|token|secret|password)["']?\s*[:=]\s*["']?[^"',\s}]+/gi,
        replacement: '[REDACTED:secret]',
    },
    {
        name: 'email',
        pattern: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi,
        replacement: '[REDACTED:email]',
    },
];
|
|
21
|
+
|
|
22
|
+
/**
 * Recursively redact a JSON-like value.
 *
 * Strings have every rule applied in order; arrays and objects are rebuilt
 * with their elements/values redacted (keys are left untouched); every other
 * value is returned unchanged. The input is never mutated.
 *
 * @param value - Value to redact.
 * @param rules - Rules to apply; defaults to `DEFAULT_REDACTION_RULES`.
 * @returns The redacted copy (or the original value for non-string scalars).
 */
export function redactValue(value: unknown, rules: readonly RedactionRule[] = DEFAULT_REDACTION_RULES): unknown {
    if (typeof value === 'string') {
        let text = value;
        for (const rule of rules) {
            text = text.replace(rule.pattern, rule.replacement);
        }
        return text;
    }
    if (value === null || typeof value !== 'object') {
        return value;
    }
    if (Array.isArray(value)) {
        return value.map((entry) => redactValue(entry, rules));
    }
    const pairs: [string, unknown][] = [];
    for (const [key, entry] of Object.entries(value as JsonObject)) {
        pairs.push([key, redactValue(entry, rules)]);
    }
    return Object.fromEntries(pairs);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Redact a normalized ETL record before it is hashed or written to the database.
 *
 * This is a typed wrapper around `redactValue` that keeps the record's static type.
 *
 * @param record - Record to redact; it is not mutated.
 * @param rules - Rules to apply; defaults to `DEFAULT_REDACTION_RULES`.
 * @returns A redacted copy of the record.
 */
export function redactRecord<TRecord extends JsonObject>(
    record: TRecord,
    rules: readonly RedactionRule[] = DEFAULT_REDACTION_RULES,
): TRecord {
    const redacted = redactValue(record, rules);
    return redacted as TRecord;
}
|
package/src/schema-sql.ts
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
 * DDL for every table the importer owns, as one `;`-separated script.
 *
 * - `history_import_checkpoint`: last imported line per `(source, source_file)`.
 * - `history_import_ledger`: one row per imported record hash.
 * - `history_etl_<source>`: one payload table per built-in source, all with the
 *   same columns and `record_hash` as primary key.
 *
 * Every statement uses `CREATE TABLE IF NOT EXISTS`.
 */
export const HISTORY_IMPORT_SCHEMA_SQL = `
CREATE TABLE IF NOT EXISTS history_import_checkpoint (
source TEXT NOT NULL,
source_file TEXT NOT NULL,
last_imported_line INTEGER NOT NULL DEFAULT 0,
updated_at TEXT NOT NULL,
PRIMARY KEY (source, source_file)
);

CREATE TABLE IF NOT EXISTS history_import_ledger (
record_hash TEXT PRIMARY KEY,
source TEXT NOT NULL,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
target_table TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_pi (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_claude (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_codex (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_gemini (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_opencode (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_antigravity (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS history_etl_openclaw (
record_hash TEXT PRIMARY KEY,
source_file TEXT NOT NULL,
source_line INTEGER NOT NULL,
split_index INTEGER NOT NULL,
payload_json TEXT NOT NULL,
imported_at TEXT NOT NULL
);
`.trim();
|
package/src/sources.ts
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import type { FieldTransform, JsonObject, LlmJsonlSource, SourceDefinition, TransformContext } from './types';
|
|
3
|
+
|
|
4
|
+
/**
 * Validation schema shared by all built-in sources.
 *
 * `source_record_id`, `created_at` and `content` must be non-empty strings;
 * `role` and `model` are optional strings. Unknown keys are kept on the parsed
 * output rather than stripped (`z.looseObject` is the Zod 4 replacement for the
 * legacy `.passthrough()` method).
 */
const sourceRecordSchema = z.looseObject({
    source_record_id: z.string().min(1),
    created_at: z.string().min(1),
    content: z.string().min(1),
    role: z.string().optional(),
    model: z.string().optional(),
});
|
|
13
|
+
|
|
14
|
+
/**
 * Pick the first usable value from a list of candidates.
 *
 * A candidate is usable when it is a non-empty string (returned as is) or a
 * finite number (returned in its string form). Everything else is skipped.
 *
 * @param values - Candidates in priority order.
 * @returns The first usable candidate as a string, or `undefined` when none qualifies.
 */
function firstString(...values: readonly unknown[]): string | undefined {
    for (const candidate of values) {
        switch (typeof candidate) {
            case 'string':
                if (candidate.length > 0) return candidate;
                break;
            case 'number':
                if (Number.isFinite(candidate)) return String(candidate);
                break;
            default:
                break;
        }
    }
    return undefined;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Transform for `created_at`.
 *
 * @param value - Mapped value, if any.
 * @returns The value as a string when `firstString` accepts it; otherwise the
 *   Unix epoch (`new Date(0)`) in ISO-8601 form.
 */
function defaultCreatedAt(value: unknown): string {
    const provided = firstString(value);
    if (provided !== undefined) {
        return provided;
    }
    return new Date(0).toISOString();
}
|
|
25
|
+
|
|
26
|
+
/**
 * Transform for `source_record_id`.
 *
 * Uses the mapped value when usable, then falls back through the raw id-like
 * fields in priority order. When none is usable, an id is synthesized from the
 * record's provenance.
 *
 * @param value - Mapped value, if any.
 * @param raw - Raw record the fallbacks are read from.
 * @param context - Provenance used for the synthesized id.
 * @returns A non-empty record id.
 */
function defaultRecordId(value: unknown, raw: JsonObject, context: TransformContext): string {
    const candidates = [
        value,
        raw.id,
        raw.uuid,
        raw.message_id,
        raw.messageId,
        raw.session_id,
        raw.sessionId,
        raw.conversation_id,
    ];
    const explicit = firstString(...candidates);
    if (explicit !== undefined) {
        return explicit;
    }
    return `${context.source}:${context.sourceFile}:${context.sourceLine}:${context.splitIndex}`;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Transform for `content`.
 *
 * @param value - Mapped value, if any.
 * @param raw - Raw record the fallbacks are read from.
 * @returns The first usable text among the mapped value and the raw
 *   `content`, `text`, `message`, `prompt`, `response` and `output` fields, or
 *   `undefined` when none is usable.
 */
function defaultContent(value: unknown, raw: JsonObject): string | undefined {
    const candidates = [value, raw.content, raw.text, raw.message, raw.prompt, raw.response, raw.output];
    return firstString(...candidates);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Build the field transforms shared by all built-in sources.
 *
 * @returns A fresh map from normalized field name to its transform.
 */
function transforms(): Readonly<Record<string, FieldTransform>> {
    const role: FieldTransform = (value, raw) => firstString(value, raw.role, raw.type);
    const model: FieldTransform = (value, raw) => firstString(value, raw.model, raw.model_name, raw.modelName);

    return {
        source_record_id: defaultRecordId,
        created_at: defaultCreatedAt,
        content: defaultContent,
        role,
        model,
    };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Assemble the definition of a built-in source.
 *
 * All sources share the same field map, transforms and schema. They differ in
 * name, roots, file patterns, target table (`history_etl_<source>`) and split
 * mode: `pi` expands its `messages` array, every other source is one-to-one.
 *
 * @param source - Source identifier.
 * @param displayName - Human-readable name.
 * @param defaultRoots - Roots searched when the caller supplies none.
 * @param filePatterns - Patterns a file must match to be imported.
 * @returns The complete source definition.
 */
function sourceDefinition(
    source: LlmJsonlSource,
    displayName: string,
    defaultRoots: readonly string[],
    filePatterns: readonly string[],
): SourceDefinition {
    const splitConfig: SourceDefinition['splitConfig'] =
        source === 'pi' ? { mode: 'one-to-many', field: 'messages' } : { mode: 'one-to-one' };

    // Raw key -> normalized key. Several raw aliases share one normalized key.
    const fieldMap: SourceDefinition['fieldMap'] = {
        id: 'source_record_id',
        uuid: 'source_record_id',
        message_id: 'source_record_id',
        messageId: 'source_record_id',
        session_id: 'source_record_id',
        sessionId: 'source_record_id',
        conversation_id: 'source_record_id',
        timestamp: 'created_at',
        created_at: 'created_at',
        createdAt: 'created_at',
        time: 'created_at',
        content: 'content',
        text: 'content',
        message: 'content',
        prompt: 'content',
        response: 'content',
        output: 'content',
        role: 'role',
        type: 'role',
        model: 'model',
        model_name: 'model',
        modelName: 'model',
    };

    return {
        source,
        displayName,
        defaultRoots,
        filePatterns,
        targetTable: `history_etl_${source}`,
        splitConfig,
        fieldMap,
        fieldTransforms: transforms(),
        schema: sourceRecordSchema,
    };
}
|
|
96
|
+
|
|
97
|
+
/**
 * Built-in source definitions keyed by source identifier.
 *
 * Each entry lists the source's display name, its default discovery roots and
 * the file patterns it imports. Only `pi` also accepts `*.json` files.
 *
 * NOTE(review): every default root is a relative path. Presumably the caller
 * is expected to run from (or pass roots under) the user's home directory —
 * confirm how relative roots are meant to be resolved.
 */
export const SOURCE_DEFINITIONS: Readonly<Record<LlmJsonlSource, SourceDefinition>> = {
    pi: sourceDefinition('pi', 'Pi', ['.pi/history', '.pi'], ['*.jsonl', '*.json']),
    claude: sourceDefinition('claude', 'Claude Code', ['.claude/projects', '.claude'], ['*.jsonl']),
    codex: sourceDefinition('codex', 'Codex', ['.codex/sessions', '.codex'], ['*.jsonl']),
    gemini: sourceDefinition('gemini', 'Gemini CLI', ['.gemini', '.config/gemini'], ['*.jsonl']),
    opencode: sourceDefinition('opencode', 'OpenCode', ['.opencode', '.local/share/opencode'], ['*.jsonl']),
    antigravity: sourceDefinition('antigravity', 'Antigravity', ['.antigravity'], ['*.jsonl']),
    openclaw: sourceDefinition('openclaw', 'OpenClaw', ['.openclaw'], ['*.jsonl']),
};
|
|
107
|
+
|
|
108
|
+
/**
 * Resolve a built-in source definition by identifier.
 *
 * The identifier is checked at runtime as well, because callers may pass
 * unvalidated strings (CLI arguments, plain JavaScript) cast to the type.
 *
 * @param source - Source identifier.
 * @returns The matching definition.
 * @throws Error when `source` is not one of the built-in identifiers.
 */
export function getSourceDefinition(source: LlmJsonlSource): SourceDefinition {
    // Own-property check so inherited names such as "toString" are rejected too.
    if (!Object.prototype.hasOwnProperty.call(SOURCE_DEFINITIONS, source)) {
        const known = Object.keys(SOURCE_DEFINITIONS).join(', ');
        throw new Error(`Unknown LLM JSONL source: ${String(source)} (expected one of: ${known})`);
    }
    return SOURCE_DEFINITIONS[source];
}
|