@martian-engineering/lossless-claw 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -11
- package/docs/configuration.md +44 -0
- package/openclaw.plugin.json +114 -0
- package/package.json +2 -1
- package/skills/lossless-claw/SKILL.md +33 -0
- package/skills/lossless-claw/references/architecture.md +52 -0
- package/skills/lossless-claw/references/config.md +263 -0
- package/skills/lossless-claw/references/diagnostics.md +79 -0
- package/skills/lossless-claw/references/recall-tools.md +55 -0
- package/skills/lossless-claw/references/session-lifecycle.md +59 -0
- package/src/assembler.ts +321 -34
- package/src/compaction.ts +220 -19
- package/src/db/config.ts +74 -21
- package/src/db/migration.ts +50 -13
- package/src/engine.ts +742 -133
- package/src/plugin/index.ts +156 -73
- package/src/plugin/lcm-command.ts +759 -0
- package/src/plugin/lcm-doctor-apply.ts +546 -0
- package/src/plugin/lcm-doctor-shared.ts +210 -0
- package/src/store/conversation-store.ts +60 -21
- package/src/store/parse-utc-timestamp.ts +25 -0
- package/src/store/summary-store.ts +460 -11
- package/src/summarize.ts +553 -224
- package/src/tools/lcm-expand-query-tool.ts +195 -59
- package/src/tools/lcm-expansion-recursion-guard.ts +87 -0
- package/src/types.ts +1 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type { DatabaseSync } from "node:sqlite";
|
|
2
2
|
import { sanitizeFts5Query } from "./fts5-sanitize.js";
|
|
3
3
|
import { buildLikeSearchPlan, containsCjk, createFallbackSnippet } from "./full-text-fallback.js";
|
|
4
|
+
import { parseUtcTimestamp, parseUtcTimestampOrNull } from "./parse-utc-timestamp.js";
|
|
4
5
|
|
|
5
6
|
export type SummaryKind = "leaf" | "condensed";
|
|
6
7
|
export type ContextItemType = "message" | "summary";
|
|
@@ -45,6 +46,11 @@ export type SummarySubtreeNodeRecord = SummaryRecord & {
|
|
|
45
46
|
childCount: number;
|
|
46
47
|
};
|
|
47
48
|
|
|
49
|
+
/** Pairing of one raw message with one leaf summary that is linked to it. */
export type MessageLeafSummaryLinkRecord = {
  /** Identifier of the raw message. */
  messageId: number;
  /** Identifier of the leaf summary linked to that message. */
  summaryId: string;
};
|
|
53
|
+
|
|
48
54
|
export type ContextItemRecord = {
|
|
49
55
|
conversationId: number;
|
|
50
56
|
ordinal: number;
|
|
@@ -112,6 +118,16 @@ export type ConversationBootstrapStateRecord = {
|
|
|
112
118
|
updatedAt: Date;
|
|
113
119
|
};
|
|
114
120
|
|
|
121
|
+
/** A tool-result message whose output was externalized and that may be considered for transcript GC. */
export type TranscriptGcCandidateRecord = {
  /** Identifier of the candidate message. */
  messageId: number;
  /** Conversation the message belongs to. */
  conversationId: number;
  /** Value of the message's `seq` column. */
  seq: number;
  /** Non-empty tool call id taken from the message part. */
  toolCallId: string;
  /** Tool name from the message part, when one was stored. */
  toolName: string | null;
  /** `externalizedFileId` from the part metadata, or `null` when it is not a string. */
  externalizedFileId: string | null;
  /** `originalByteSize` from the part metadata as a non-negative integer, or `null` when unavailable. */
  originalByteSize: number | null;
};
|
|
130
|
+
|
|
115
131
|
// ── DB row shapes (snake_case) ────────────────────────────────────────────────
|
|
116
132
|
|
|
117
133
|
interface SummaryRow {
|
|
@@ -172,6 +188,15 @@ interface MessageIdRow {
|
|
|
172
188
|
message_id: number;
|
|
173
189
|
}
|
|
174
190
|
|
|
191
|
+
/** Row shape returned by the `MAX(depth) AS max_depth` aggregate query. */
interface MaxDepthRow {
  /** Largest depth found, or `null` when the aggregate has no value. */
  max_depth: number | null;
}
|
|
194
|
+
|
|
195
|
+
/** Row shape of a `summary_messages` lookup: one message-to-summary link. */
interface MessageLeafSummaryLinkRow {
  /** `summary_messages.message_id` */
  message_id: number;
  /** `summary_messages.summary_id` */
  summary_id: string;
}
|
|
199
|
+
|
|
175
200
|
interface LargeFileRow {
|
|
176
201
|
file_id: string;
|
|
177
202
|
conversation_id: number;
|
|
@@ -193,6 +218,17 @@ interface ConversationBootstrapStateRow {
|
|
|
193
218
|
updated_at: string;
|
|
194
219
|
}
|
|
195
220
|
|
|
221
|
+
/**
 * Matches each maximal run of characters from the listed CJK-related
 * Unicode ranges inside a search query (global, so `String.match` yields
 * every run).
 */
const CJK_QUERY_SEGMENT_RE =
  /[\u2E80-\u9FFF\u3400-\u4DBF\uF900-\uFAFF\uAC00-\uD7AF\u3040-\u309F\u30A0-\u30FF]+/g;

/**
 * Matches ASCII tokens in a search query: an alphanumeric character followed
 * by any number of word characters, dots, slashes, or hyphens.
 */
const LATIN_QUERY_TOKEN_RE = /[a-zA-Z0-9][\w./-]*/g;
|
|
224
|
+
/** Row shape of the transcript-GC candidate query (`messages` joined with `message_parts`). */
interface TranscriptGcCandidateRow {
  /** `messages.message_id` */
  message_id: number;
  /** `messages.conversation_id` */
  conversation_id: number;
  /** `messages.seq` */
  seq: number;
  /** `message_parts.tool_call_id`; typed nullable even though the query filters out NULL and empty values. */
  tool_call_id: string | null;
  /** `message_parts.tool_name` */
  tool_name: string | null;
  /** `message_parts.metadata`; parsed as JSON by the row mapper. */
  metadata: string | null;
}
|
|
196
232
|
// ── Row mappers ───────────────────────────────────────────────────────────────
|
|
197
233
|
|
|
198
234
|
function toSummaryRecord(row: SummaryRow): SummaryRecord {
|
|
@@ -210,8 +246,8 @@ function toSummaryRecord(row: SummaryRow): SummaryRecord {
|
|
|
210
246
|
content: row.content,
|
|
211
247
|
tokenCount: row.token_count,
|
|
212
248
|
fileIds,
|
|
213
|
-
earliestAt:
|
|
214
|
-
latestAt:
|
|
249
|
+
earliestAt: parseUtcTimestampOrNull(row.earliest_at),
|
|
250
|
+
latestAt: parseUtcTimestampOrNull(row.latest_at),
|
|
215
251
|
descendantCount:
|
|
216
252
|
typeof row.descendant_count === "number" &&
|
|
217
253
|
Number.isFinite(row.descendant_count) &&
|
|
@@ -231,7 +267,7 @@ function toSummaryRecord(row: SummaryRow): SummaryRecord {
|
|
|
231
267
|
? Math.floor(row.source_message_token_count)
|
|
232
268
|
: 0,
|
|
233
269
|
model: typeof row.model === "string" ? row.model : "unknown",
|
|
234
|
-
createdAt:
|
|
270
|
+
createdAt: parseUtcTimestamp(row.created_at),
|
|
235
271
|
};
|
|
236
272
|
}
|
|
237
273
|
|
|
@@ -242,7 +278,7 @@ function toContextItemRecord(row: ContextItemRow): ContextItemRecord {
|
|
|
242
278
|
itemType: row.item_type,
|
|
243
279
|
messageId: row.message_id,
|
|
244
280
|
summaryId: row.summary_id,
|
|
245
|
-
createdAt:
|
|
281
|
+
createdAt: parseUtcTimestamp(row.created_at),
|
|
246
282
|
};
|
|
247
283
|
}
|
|
248
284
|
|
|
@@ -252,7 +288,7 @@ function toSearchResult(row: SummarySearchRow): SummarySearchResult {
|
|
|
252
288
|
conversationId: row.conversation_id,
|
|
253
289
|
kind: row.kind,
|
|
254
290
|
snippet: row.snippet,
|
|
255
|
-
createdAt:
|
|
291
|
+
createdAt: parseUtcTimestamp(row.created_at),
|
|
256
292
|
rank: row.rank,
|
|
257
293
|
};
|
|
258
294
|
}
|
|
@@ -266,7 +302,7 @@ function toLargeFileRecord(row: LargeFileRow): LargeFileRecord {
|
|
|
266
302
|
byteSize: row.byte_size,
|
|
267
303
|
storageUri: row.storage_uri,
|
|
268
304
|
explorationSummary: row.exploration_summary,
|
|
269
|
-
createdAt:
|
|
305
|
+
createdAt: parseUtcTimestamp(row.created_at),
|
|
270
306
|
};
|
|
271
307
|
}
|
|
272
308
|
|
|
@@ -280,7 +316,43 @@ function toConversationBootstrapStateRecord(
|
|
|
280
316
|
lastSeenMtimeMs: row.last_seen_mtime_ms,
|
|
281
317
|
lastProcessedOffset: row.last_processed_offset,
|
|
282
318
|
lastProcessedEntryHash: row.last_processed_entry_hash,
|
|
283
|
-
updatedAt:
|
|
319
|
+
updatedAt: parseUtcTimestamp(row.updated_at),
|
|
320
|
+
};
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
/**
 * Map a joined message / message-part row to a transcript-GC candidate.
 *
 * @param row - Row produced by the transcript-GC candidate query.
 * @returns The candidate record, or `null` when the row has no non-empty
 *   tool call id, or when its metadata is absent, is not valid JSON, or does
 *   not carry `toolOutputExternalized === true`.
 */
function toTranscriptGcCandidateRecord(
  row: TranscriptGcCandidateRow,
): TranscriptGcCandidateRecord | null {
  const toolCallId = row.tool_call_id;
  if (typeof toolCallId !== "string" || toolCallId.length === 0) {
    return null;
  }

  // Missing or empty metadata can never carry the externalized flag.
  const rawMetadata = row.metadata;
  if (typeof rawMetadata !== "string" || rawMetadata.length === 0) {
    return null;
  }

  let parsed: Record<string, unknown> | null;
  try {
    parsed = JSON.parse(rawMetadata) as Record<string, unknown>;
  } catch {
    // Malformed metadata is treated the same as missing metadata.
    return null;
  }

  // Only parts explicitly marked as externalized qualify.
  if (!parsed || parsed.toolOutputExternalized !== true) {
    return null;
  }

  const fileId = parsed.externalizedFileId;
  const byteSize = parsed.originalByteSize;
  const hasUsableByteSize = typeof byteSize === "number" && Number.isFinite(byteSize);

  return {
    messageId: row.message_id,
    conversationId: row.conversation_id,
    seq: row.seq,
    toolCallId,
    toolName: row.tool_name,
    externalizedFileId: typeof fileId === "string" ? fileId : null,
    // Clamp to a non-negative integer; anything non-numeric becomes null.
    originalByteSize: hasUsableByteSize ? Math.max(0, Math.floor(byteSize)) : null,
  };
}
|
|
286
358
|
|
|
@@ -386,6 +458,17 @@ export class SummaryStore {
|
|
|
386
458
|
// compaction and assembly will still work correctly.
|
|
387
459
|
}
|
|
388
460
|
|
|
461
|
+
// Also index into the CJK trigram FTS table for CJK substring search.
|
|
462
|
+
try {
|
|
463
|
+
this.db
|
|
464
|
+
.prepare(
|
|
465
|
+
`INSERT INTO summaries_fts_cjk(summary_id, content) VALUES (?, ?)`,
|
|
466
|
+
)
|
|
467
|
+
.run(input.summaryId, input.content);
|
|
468
|
+
} catch {
|
|
469
|
+
// CJK trigram FTS table may not exist yet (pre-migration); ignore.
|
|
470
|
+
}
|
|
471
|
+
|
|
389
472
|
return toSummaryRecord(row);
|
|
390
473
|
}
|
|
391
474
|
|
|
@@ -460,6 +543,136 @@ export class SummaryStore {
|
|
|
460
543
|
return rows.map((r) => r.message_id);
|
|
461
544
|
}
|
|
462
545
|
|
|
546
|
+
/**
 * Look up the largest `depth` stored in `summaries` for one conversation.
 *
 * @param conversationId - Conversation whose summaries are inspected.
 * @returns The maximum depth, or `null` when the query yields no numeric value.
 */
async getConversationMaxSummaryDepth(conversationId: number): Promise<number | null> {
  const statement = this.db.prepare(
    `SELECT MAX(depth) AS max_depth
     FROM summaries
     WHERE conversation_id = ?`,
  );
  const result = statement.get(conversationId) as unknown as MaxDepthRow | undefined;

  const maxDepth = result?.max_depth;
  if (typeof maxDepth !== "number") {
    return null;
  }
  return maxDepth;
}
|
|
559
|
+
|
|
560
|
+
/**
 * Map raw message ids to the leaf summaries they are linked to.
 *
 * Ids that are not positive integers are ignored and duplicates are
 * collapsed. The result follows the order in which ids first appear in
 * `messageIds`; within one message, summaries keep the query order
 * (`sm.ordinal`, then `s.created_at`).
 *
 * @param conversationId - Conversation the leaf summaries must belong to.
 * @param messageIds - Raw message ids to resolve.
 * @returns One record per distinct (message, leaf summary) pair.
 */
async getLeafSummaryLinksForMessageIds(
  conversationId: number,
  messageIds: number[],
): Promise<MessageLeafSummaryLinkRecord[]> {
  // Keep only positive integer ids, first occurrence wins.
  const uniqueIds: number[] = [];
  const knownIds = new Set<number>();
  for (const candidate of messageIds) {
    if (!Number.isInteger(candidate) || candidate <= 0 || knownIds.has(candidate)) {
      continue;
    }
    knownIds.add(candidate);
    uniqueIds.push(candidate);
  }
  if (uniqueIds.length === 0) {
    return [];
  }

  // One bound parameter per id.
  const placeholders = uniqueIds.map(() => "?").join(", ");
  const statement = this.db.prepare(
    `SELECT sm.message_id, sm.summary_id
     FROM summary_messages sm
     JOIN summaries s ON s.summary_id = sm.summary_id
     WHERE s.conversation_id = ?
       AND s.kind = 'leaf'
       AND sm.message_id IN (${placeholders})
     ORDER BY sm.ordinal ASC, s.created_at ASC`,
  );
  const linkRows = statement.all(
    conversationId,
    ...uniqueIds,
  ) as unknown as MessageLeafSummaryLinkRow[];

  // Group summary ids per message; a Set dedupes and keeps insertion order.
  const summariesByMessage = new Map<number, Set<string>>();
  for (const { message_id: linkedMessageId, summary_id: linkedSummaryId } of linkRows) {
    let bucket = summariesByMessage.get(linkedMessageId);
    if (bucket === undefined) {
      bucket = new Set<string>();
      summariesByMessage.set(linkedMessageId, bucket);
    }
    bucket.add(linkedSummaryId);
  }

  // Emit links in caller order rather than query order.
  return uniqueIds.flatMap((messageId) =>
    Array.from(summariesByMessage.get(messageId) ?? [], (summaryId) => ({
      messageId,
      summaryId,
    })),
  );
}
|
|
611
|
+
/**
 * List tool-result messages that can be considered for transcript GC.
 *
 * The query selects `role = 'tool'` messages of the conversation that have a
 * `tool` part with a non-empty tool call id, are referenced by at least one
 * `summary_messages` row, and no longer appear in `context_items` as a raw
 * message. Rows are then filtered by the row mapper (the part metadata must
 * mark the tool output as externalized), and at most one candidate is kept
 * per message.
 *
 * @param conversationId - Conversation to scan.
 * @param options - `limit` caps the number of candidates; values that are
 *   not positive finite numbers fall back to 25.
 * @returns Candidates ordered by message `seq`, then part ordinal.
 */
async listTranscriptGcCandidates(
  conversationId: number,
  options?: { limit?: number },
): Promise<TranscriptGcCandidateRecord[]> {
  const requestedLimit = options?.limit;
  let limit = 25;
  if (
    typeof requestedLimit === "number" &&
    Number.isFinite(requestedLimit) &&
    requestedLimit > 0
  ) {
    // Fractional limits round down but never below one.
    limit = Math.max(1, Math.floor(requestedLimit));
  }

  const statement = this.db.prepare(
    `SELECT
       m.message_id,
       m.conversation_id,
       m.seq,
       mp.tool_call_id,
       mp.tool_name,
       mp.metadata
     FROM messages m
     JOIN message_parts mp
       ON mp.message_id = m.message_id
     WHERE m.conversation_id = ?
       AND m.role = 'tool'
       AND mp.part_type = 'tool'
       AND mp.tool_call_id IS NOT NULL
       AND mp.tool_call_id != ''
       AND EXISTS (
         SELECT 1
         FROM summary_messages sm
         WHERE sm.message_id = m.message_id
       )
       AND NOT EXISTS (
         SELECT 1
         FROM context_items ci
         WHERE ci.conversation_id = m.conversation_id
           AND ci.item_type = 'message'
           AND ci.message_id = m.message_id
       )
     ORDER BY m.seq ASC, mp.ordinal ASC`,
  );
  const partRows = statement.all(conversationId) as unknown as TranscriptGcCandidateRow[];

  // The limit is applied here because the metadata filter runs in JS, not SQL.
  const results: TranscriptGcCandidateRecord[] = [];
  const emittedMessageIds = new Set<number>();
  for (let index = 0; index < partRows.length && results.length < limit; index += 1) {
    const partRow = partRows[index];
    if (emittedMessageIds.has(partRow.message_id)) {
      continue;
    }
    const mapped = toTranscriptGcCandidateRecord(partRow);
    if (mapped === null) {
      // A later part of the same message may still qualify.
      continue;
    }
    emittedMessageIds.add(mapped.messageId);
    results.push(mapped);
  }

  return results;
}
|
|
463
676
|
async getSummaryChildren(parentSummaryId: string): Promise<SummaryRecord[]> {
|
|
464
677
|
const rows = this.db
|
|
465
678
|
.prepare(
|
|
@@ -607,6 +820,45 @@ export class SummaryStore {
|
|
|
607
820
|
return rows.map((row) => row.depth);
|
|
608
821
|
}
|
|
609
822
|
|
|
823
|
+
/**
 * Trim a conversation's `context_items` when a new session starts.
 *
 * - Finite negative `retainDepth`: nothing is deleted.
 * - Otherwise every raw-message context item is deleted, and then:
 *   - finite `retainDepth`: summary context items whose summary has
 *     `depth < floor(retainDepth)` are deleted;
 *   - non-finite `retainDepth`: every summary context item is deleted.
 *
 * NOTE(review): `-Infinity` and `NaN` are non-finite, so they clear all
 * context items rather than behaving like a negative depth — confirm this is
 * intended by the callers.
 *
 * @param conversationId - Conversation whose context items are pruned.
 * @param retainDepth - Minimum summary depth to keep in context.
 */
async pruneForNewSession(conversationId: number, retainDepth: number): Promise<void> {
  const depthIsFinite = Number.isFinite(retainDepth);
  if (depthIsFinite && retainDepth < 0) {
    return;
  }

  // Raw messages never carry over.
  const deleteMessages = this.db.prepare(
    `DELETE FROM context_items
     WHERE conversation_id = ?
       AND item_type = 'message'`,
  );
  deleteMessages.run(conversationId);

  if (depthIsFinite) {
    // Keep summaries at or above the retained depth.
    const deleteShallowSummaries = this.db.prepare(
      `DELETE FROM context_items
       WHERE conversation_id = ?
         AND item_type = 'summary'
         AND summary_id IN (
           SELECT summary_id
           FROM summaries
           WHERE conversation_id = ?
             AND depth < ?
         )`,
    );
    deleteShallowSummaries.run(conversationId, conversationId, Math.floor(retainDepth));
  } else {
    const deleteAllSummaries = this.db.prepare(
      `DELETE FROM context_items
       WHERE conversation_id = ?
         AND item_type = 'summary'`,
    );
    deleteAllSummaries.run(conversationId);
  }
}
|
|
861
|
+
|
|
610
862
|
async appendContextMessage(conversationId: number, messageId: number): Promise<void> {
|
|
611
863
|
const row = this.db
|
|
612
864
|
.prepare(
|
|
@@ -750,10 +1002,30 @@ export class SummaryStore {
|
|
|
750
1002
|
const limit = input.limit ?? 50;
|
|
751
1003
|
|
|
752
1004
|
if (input.mode === "full_text") {
|
|
753
|
-
// FTS5 unicode61
|
|
754
|
-
//
|
|
1005
|
+
// FTS5 unicode61 cannot segment CJK ideographs, so CJK queries route
|
|
1006
|
+
// through the trigram FTS table first, then fall back to LIKE with OR
|
|
1007
|
+
// semantics (instead of the original AND logic which fails when the
|
|
1008
|
+
// user's phrasing doesn't exactly match the summary text).
|
|
755
1009
|
if (containsCjk(input.query)) {
|
|
756
|
-
|
|
1010
|
+
const cjkSegments = this.extractCjkSegments(input.query);
|
|
1011
|
+
const hasShortCjkSegment = cjkSegments.some((segment) => segment.length < 3);
|
|
1012
|
+
if (!hasShortCjkSegment) {
|
|
1013
|
+
try {
|
|
1014
|
+
const trigramResults = this.searchCjkTrigram(
|
|
1015
|
+
input.query,
|
|
1016
|
+
limit,
|
|
1017
|
+
input.conversationId,
|
|
1018
|
+
input.since,
|
|
1019
|
+
input.before,
|
|
1020
|
+
);
|
|
1021
|
+
if (trigramResults.length > 0) {
|
|
1022
|
+
return trigramResults;
|
|
1023
|
+
}
|
|
1024
|
+
} catch {
|
|
1025
|
+
// trigram table may not exist; fall through to LIKE OR
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
1028
|
+
return this.searchLikeCjk(
|
|
757
1029
|
input.query,
|
|
758
1030
|
limit,
|
|
759
1031
|
input.conversationId,
|
|
@@ -870,6 +1142,183 @@ export class SummaryStore {
|
|
|
870
1142
|
conversationId: row.conversation_id,
|
|
871
1143
|
kind: row.kind,
|
|
872
1144
|
snippet: createFallbackSnippet(row.content, plan.terms),
|
|
1145
|
+
createdAt: parseUtcTimestamp(row.created_at),
|
|
1146
|
+
rank: 0,
|
|
1147
|
+
}));
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
/**
 * Collect every run of characters matched by `CJK_QUERY_SEGMENT_RE` in the query.
 *
 * @returns The matched runs in order of appearance; empty when there are none.
 */
private extractCjkSegments(query: string): string[] {
  const segments = query.match(CJK_QUERY_SEGMENT_RE);
  if (segments == null) {
    return [];
  }
  return segments;
}
|
|
1153
|
+
|
|
1154
|
+
/**
 * Collect the tokens matched by `LATIN_QUERY_TOKEN_RE`, lower-cased and
 * de-duplicated, in order of first appearance.
 */
private extractLatinTokens(query: string): string[] {
  const distinct = new Set<string>();
  for (const rawToken of query.match(LATIN_QUERY_TOKEN_RE) ?? []) {
    distinct.add(rawToken.toLowerCase());
  }
  return Array.from(distinct);
}
|
|
1158
|
+
|
|
1159
|
+
/**
 * Prefix each backslash, `%`, and `_` in `term` with a backslash so the term
 * is matched literally by a `LIKE ... ESCAPE '\'` clause.
 */
private escapeLikeTerm(term: string): string {
  return term.replace(/([\\%_])/g, (_whole, special: string) => `\\${special}`);
}
|
|
1162
|
+
|
|
1163
|
+
// ── CJK trigram FTS search ──────────────────────────────────────────────
|
|
1164
|
+
// Each CJK segment of 3-4 chars is used whole; longer ones become overlapping 4-char chunks for
|
|
1165
|
+
// trigram MATCH with OR semantics within the segment. Segment groups are
|
|
1166
|
+
// combined with AND, and Latin tokens are applied as LIKE filters so mixed
|
|
1167
|
+
// queries still require every part of the user's intent.
|
|
1168
|
+
|
|
1169
|
+
/**
 * Produce the distinct sliding windows of `size` UTF-16 code units over
 * `text`, in order of first appearance.
 * E.g. "端到端测试结果" with size=4 →
 *   ["端到端测", "到端测试", "端测试结", "测试结果"]
 * A text shorter than `size` yields an empty array.
 */
private splitCjkChunks(text: string, size: number): string[] {
  // A Set keeps first-seen order while discarding repeated windows.
  const windows = new Set<string>();
  const lastStart = text.length - size;
  for (let start = 0; start <= lastStart; start += 1) {
    windows.add(text.slice(start, start + size));
  }
  return Array.from(windows);
}
|
|
1184
|
+
|
|
1185
|
+
/**
 * Search summaries through the `summaries_fts_cjk` FTS table.
 *
 * CJK segments shorter than three characters are ignored; if none remain the
 * result is empty. A segment of up to four characters is matched as one
 * quoted phrase, a longer one as an OR of its overlapping four-character
 * chunks. Segment groups are ANDed in the MATCH expression, and each Latin
 * token is applied as a case-insensitive `LIKE` filter on the summary content.
 *
 * @param query - Raw user query (may mix CJK and Latin text).
 * @param limit - Maximum number of rows to return.
 * @param conversationId - Restrict results to this conversation when given.
 * @param since - Inclusive lower bound on `created_at` when given.
 * @param before - Exclusive upper bound on `created_at` when given.
 * @returns Matching summaries ordered by FTS rank. Errors from the database
 *   (for example a missing FTS table) propagate to the caller.
 */
private searchCjkTrigram(
  query: string,
  limit: number,
  conversationId?: number,
  since?: Date,
  before?: Date,
): SummarySearchResult[] {
  const searchableSegments = this.extractCjkSegments(query).filter(
    (segment) => segment.length >= 3,
  );
  if (searchableSegments.length === 0) {
    return [];
  }
  const latinTokens = this.extractLatinTokens(query);

  // One parenthesised OR group per segment; groups are ANDed together.
  const matchExpression = searchableSegments
    .map((segment) => {
      const terms = segment.length <= 4 ? [segment] : this.splitCjkChunks(segment, 4);
      // Quote each term as an FTS5 string, doubling embedded quotes.
      const quotedTerms = Array.from(new Set(terms), (term) => `"${term.replace(/"/g, '""')}"`);
      return `(${quotedTerms.join(" OR ")})`;
    })
    .join(" AND ");

  const conditions: string[] = ["summaries_fts_cjk MATCH ?"];
  const params: Array<string | number> = [matchExpression];

  for (const token of latinTokens) {
    conditions.push("LOWER(s.content) LIKE ? ESCAPE '\\'");
    params.push(`%${this.escapeLikeTerm(token)}%`);
  }
  if (conversationId != null) {
    conditions.push("s.conversation_id = ?");
    params.push(conversationId);
  }
  if (since) {
    conditions.push("julianday(s.created_at) >= julianday(?)");
    params.push(since.toISOString());
  }
  if (before) {
    conditions.push("julianday(s.created_at) < julianday(?)");
    params.push(before.toISOString());
  }
  // LIMIT is the final placeholder in the statement.
  params.push(limit);

  const statement = this.db.prepare(
    `SELECT
       f.summary_id,
       s.conversation_id,
       s.kind,
       snippet(summaries_fts_cjk, 1, '', '', '...', 32) AS snippet,
       rank,
       s.created_at
     FROM summaries_fts_cjk f
     JOIN summaries s ON s.summary_id = f.summary_id
     WHERE ${conditions.join(" AND ")}
     ORDER BY rank
     LIMIT ?`,
  );
  const hits = statement.all(...params) as unknown as SummarySearchRow[];
  return hits.map(toSearchResult);
}
|
|
1245
|
+
|
|
1246
|
+
// ── CJK LIKE fallback ────────────────────────────────────────────────────
|
|
1247
|
+
// When the trigram table is unavailable, split each CJK segment into
|
|
1248
|
+
// sliding-window terms so partial matches still work. Terms within a single
|
|
1249
|
+
// segment are ORed together, but each segment and Latin token still has to
|
|
1250
|
+
// match so mixed queries keep full-intent semantics.
|
|
1251
|
+
|
|
1252
|
+
private searchLikeCjk(
|
|
1253
|
+
query: string,
|
|
1254
|
+
limit: number,
|
|
1255
|
+
conversationId?: number,
|
|
1256
|
+
since?: Date,
|
|
1257
|
+
before?: Date,
|
|
1258
|
+
): SummarySearchResult[] {
|
|
1259
|
+
const cjkSegments = this.extractCjkSegments(query);
|
|
1260
|
+
const latinTokens = this.extractLatinTokens(query);
|
|
1261
|
+
if (cjkSegments.length === 0 && latinTokens.length === 0) {
|
|
1262
|
+
return [];
|
|
1263
|
+
}
|
|
1264
|
+
|
|
1265
|
+
const cjkTerms: string[] = [];
|
|
1266
|
+
const cjkClauses: string[] = [];
|
|
1267
|
+
const cjkArgs: string[] = [];
|
|
1268
|
+
for (const segment of cjkSegments) {
|
|
1269
|
+
const segmentTerms =
|
|
1270
|
+
segment.length === 1
|
|
1271
|
+
? [segment]
|
|
1272
|
+
: segment.length === 2
|
|
1273
|
+
? [segment]
|
|
1274
|
+
: this.splitCjkChunks(segment, 2);
|
|
1275
|
+
const uniqueTerms = [...new Set(segmentTerms)];
|
|
1276
|
+
cjkTerms.push(...uniqueTerms);
|
|
1277
|
+
cjkClauses.push(
|
|
1278
|
+
`(${uniqueTerms.map(() => `LOWER(content) LIKE ? ESCAPE '\\'`).join(" OR ")})`,
|
|
1279
|
+
);
|
|
1280
|
+
cjkArgs.push(
|
|
1281
|
+
...uniqueTerms.map((term) => `%${this.escapeLikeTerm(term.toLowerCase())}%`),
|
|
1282
|
+
);
|
|
1283
|
+
}
|
|
1284
|
+
|
|
1285
|
+
const latinClauses = latinTokens.map(() => `LOWER(content) LIKE ? ESCAPE '\\'`);
|
|
1286
|
+
const latinArgs = latinTokens.map((token) => `%${this.escapeLikeTerm(token)}%`);
|
|
1287
|
+
|
|
1288
|
+
const where: string[] = [...cjkClauses, ...latinClauses];
|
|
1289
|
+
const args: Array<string | number> = [...cjkArgs, ...latinArgs];
|
|
1290
|
+
if (conversationId != null) {
|
|
1291
|
+
where.push("conversation_id = ?");
|
|
1292
|
+
args.push(conversationId);
|
|
1293
|
+
}
|
|
1294
|
+
if (since) {
|
|
1295
|
+
where.push("julianday(created_at) >= julianday(?)");
|
|
1296
|
+
args.push(since.toISOString());
|
|
1297
|
+
}
|
|
1298
|
+
if (before) {
|
|
1299
|
+
where.push("julianday(created_at) < julianday(?)");
|
|
1300
|
+
args.push(before.toISOString());
|
|
1301
|
+
}
|
|
1302
|
+
args.push(limit);
|
|
1303
|
+
|
|
1304
|
+
const rows = this.db
|
|
1305
|
+
.prepare(
|
|
1306
|
+
`SELECT summary_id, conversation_id, kind, depth, content, token_count, file_ids,
|
|
1307
|
+
earliest_at, latest_at, descendant_count, descendant_token_count,
|
|
1308
|
+
source_message_token_count, model, created_at
|
|
1309
|
+
FROM summaries
|
|
1310
|
+
WHERE ${where.join(" AND ")}
|
|
1311
|
+
ORDER BY created_at DESC
|
|
1312
|
+
LIMIT ?`,
|
|
1313
|
+
)
|
|
1314
|
+
.all(...args) as unknown as SummaryRow[];
|
|
1315
|
+
|
|
1316
|
+
const snippetTerms = cjkTerms.length > 0 ? [...new Set([...cjkTerms, ...latinTokens])] : latinTokens;
|
|
1317
|
+
return rows.map((row) => ({
|
|
1318
|
+
summaryId: row.summary_id,
|
|
1319
|
+
conversationId: row.conversation_id,
|
|
1320
|
+
kind: row.kind,
|
|
1321
|
+
snippet: createFallbackSnippet(row.content, snippetTerms),
|
|
873
1322
|
createdAt: new Date(row.created_at),
|
|
874
1323
|
rank: 0,
|
|
875
1324
|
}));
|
|
@@ -934,7 +1383,7 @@ export class SummaryStore {
|
|
|
934
1383
|
conversationId: row.conversation_id,
|
|
935
1384
|
kind: row.kind,
|
|
936
1385
|
snippet: match[0],
|
|
937
|
-
createdAt:
|
|
1386
|
+
createdAt: parseUtcTimestamp(row.created_at),
|
|
938
1387
|
rank: 0,
|
|
939
1388
|
});
|
|
940
1389
|
}
|