ei-tui 0.9.4 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -3
- package/package.json +5 -1
- package/src/README.md +9 -25
- package/src/core/handlers/document-segmentation.ts +113 -0
- package/src/core/handlers/index.ts +2 -0
- package/src/core/handlers/rewrite.ts +13 -9
- package/src/core/heartbeat-manager.ts +2 -2
- package/src/core/llm-client.ts +11 -1
- package/src/core/message-manager.ts +20 -18
- package/src/core/orchestrators/ceremony.ts +83 -40
- package/src/core/orchestrators/human-extraction.ts +5 -1
- package/src/core/persona-manager.ts +4 -0
- package/src/core/processor.ts +90 -1
- package/src/core/queue-manager.ts +35 -0
- package/src/core/state/queue.ts +9 -1
- package/src/core/state-manager.ts +4 -0
- package/src/core/types/entities.ts +15 -0
- package/src/core/types/enums.ts +1 -0
- package/src/core/types/integrations.ts +2 -0
- package/src/core/types/llm.ts +9 -0
- package/src/integrations/document/chunker.ts +88 -0
- package/src/integrations/document/importer.ts +82 -0
- package/src/integrations/document/index.ts +2 -0
- package/src/integrations/document/invoice.ts +63 -0
- package/src/integrations/document/types.ts +16 -0
- package/src/integrations/document/unsource.ts +164 -0
- package/src/integrations/persona-history/importer.ts +197 -0
- package/src/integrations/persona-history/index.ts +3 -0
- package/src/integrations/persona-history/types.ts +7 -0
- package/src/prompts/ceremony/dedup.ts +7 -3
- package/src/prompts/ceremony/index.ts +2 -1
- package/src/prompts/ceremony/people-rewrite.ts +190 -0
- package/src/prompts/ceremony/{rewrite.ts → topic-rewrite.ts} +103 -78
- package/src/prompts/human/person-scan.ts +13 -4
- package/src/prompts/human/topic-scan.ts +16 -2
- package/src/prompts/human/topic-update.ts +36 -4
- package/src/prompts/human/types.ts +1 -0
- package/src/storage/indexed.ts +4 -0
- package/src/storage/interface.ts +1 -0
- package/src/storage/local.ts +4 -0
- package/src/templates/emmett.ts +49 -0
- package/tui/README.md +22 -0
- package/tui/src/app.tsx +9 -6
- package/tui/src/commands/delete.tsx +7 -1
- package/tui/src/commands/import.tsx +30 -0
- package/tui/src/commands/unsource.tsx +115 -0
- package/tui/src/components/PromptInput.tsx +4 -0
- package/tui/src/components/WelcomeOverlay.tsx +58 -32
- package/tui/src/context/ei.tsx +80 -60
- package/tui/src/index.tsx +14 -0
- package/tui/src/storage/file.ts +11 -5
- package/tui/src/util/e2e-flags.ts +4 -3
- package/tui/src/util/help-content.ts +20 -0
- package/tui/src/util/provider-detection.ts +251 -0
- package/tui/src/util/yaml-human.ts +7 -1
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import {
|
|
2
2
|
RESERVED_PERSONA_NAMES,
|
|
3
3
|
isReservedPersonaName,
|
|
4
|
+
isReservedPersonaId,
|
|
4
5
|
type PersonaSummary,
|
|
5
6
|
type PersonaEntity,
|
|
6
7
|
type PersonaCreationInput,
|
|
@@ -110,6 +111,9 @@ export async function deletePersona(
|
|
|
110
111
|
personaId: string,
|
|
111
112
|
_deleteHumanData: boolean
|
|
112
113
|
): Promise<boolean> {
|
|
114
|
+
if (isReservedPersonaId(personaId)) {
|
|
115
|
+
throw new Error(`Cannot delete reserved persona "${personaId}". Use archive instead.`);
|
|
116
|
+
}
|
|
113
117
|
const persona = sm.persona_getById(personaId);
|
|
114
118
|
if (!persona) return false;
|
|
115
119
|
sm.persona_delete(personaId);
|
package/src/core/processor.ts
CHANGED
|
@@ -39,7 +39,9 @@ import { ContextStatus as ContextStatusEnum, RoomMode } from "./types.js";
|
|
|
39
39
|
import { registerReadMemoryExecutor, registerFileReadExecutor } from "./tools/index.js";
|
|
40
40
|
import { createReadMemoryExecutor } from "./tools/builtin/read-memory.js";
|
|
41
41
|
import { EI_WELCOME_MESSAGE, EI_PERSONA_DEFINITION } from "../templates/welcome.js";
|
|
42
|
+
import { EMMETT_PERSONA_DEFINITION } from "../templates/emmett.js";
|
|
42
43
|
import { shouldStartCeremony, startCeremony, handleCeremonyProgress, queueReflectionDrain, queueUserDedupRequest, queueRoomCapture, queuePersonaCapture, checkAndQueueRoomExtraction, queueTargetedPersonUpdate, queueTargetedTopicUpdate } from "./orchestrators/index.js";
|
|
44
|
+
import { finishDocumentBatch } from "./handlers/document-segmentation.js";
|
|
43
45
|
import { BUILT_IN_FACTS } from "./constants/built-in-facts.js";
|
|
44
46
|
import { DEFAULT_SEED_TRAITS } from "./constants/seed-traits.js";
|
|
45
47
|
|
|
@@ -132,6 +134,8 @@ import {
|
|
|
132
134
|
markAllRoomMessagesRead,
|
|
133
135
|
} from "./room-manager.js";
|
|
134
136
|
import type { RoomCreationInput, RoomEntity, RoomMessage, RoomSummary } from "./types.js";
|
|
137
|
+
import { previewUnsource as _previewUnsource } from "../integrations/document/unsource.js";
|
|
138
|
+
import type { UnsourcePreview, UnsourceResult } from "../integrations/document/unsource.js";
|
|
135
139
|
|
|
136
140
|
const DEFAULT_LOOP_INTERVAL_MS = 100;
|
|
137
141
|
const DEFAULT_OPENCODE_POLLING_MS = 60000;
|
|
@@ -282,6 +286,46 @@ export class Processor {
|
|
|
282
286
|
this.interface.onMessageAdded?.(eiEntity.id);
|
|
283
287
|
}
|
|
284
288
|
|
|
289
|
+
/**
 * Makes sure the built-in "emmet" persona exists and is not archived.
 *
 * - Persona present and archived: it is unarchived.
 * - Persona present and active: nothing happens.
 * - Persona missing: it is created from EMMETT_PERSONA_DEFINITION, given the
 *   "read_memory" tool when that tool is registered, and the interface is
 *   notified through onPersonaAdded.
 */
private bootstrapEmmett(): void {
  const current = this.stateManager.persona_getById("emmet");

  if (!current) {
    // Tool lookup may come back empty; the persona then starts without tools.
    const memoryTool = this.stateManager.tools_getByName("read_memory");
    const created: PersonaEntity = {
      ...EMMETT_PERSONA_DEFINITION,
      id: "emmet",
      display_name: "Emmett",
      last_updated: new Date().toISOString(),
      tools: memoryTool ? [memoryTool.id] : [],
    };
    this.stateManager.persona_add(created);
    this.interface.onPersonaAdded?.();
    return;
  }

  if (current.is_archived) {
    this.stateManager.persona_unarchive("emmet");
  }
}
|
|
308
|
+
|
|
309
|
+
/**
 * Imports a document's text through the document integration.
 *
 * Ensures the "emmet" persona is available first, then lazily loads the
 * importer module and hands it this processor's state manager and interface.
 *
 * @param content  Raw document text.
 * @param filename Name of the document being imported.
 * @returns The result reported by the document importer.
 */
async importDocument(content: string, filename: string): Promise<import("../integrations/document/types.js").DocumentImportResult> {
  this.bootstrapEmmett();

  // Dynamic import: the importer module is only loaded when an import is requested.
  const importerModule = await import("../integrations/document/importer.js");
  const importOptions = {
    stateManager: this.stateManager,
    interface: this.interface,
    content,
    filename,
  };
  return importerModule.importDocument(importOptions);
}
|
|
319
|
+
|
|
320
|
+
/**
 * Builds an unsource preview for the given source tag by delegating to the
 * document integration's previewUnsource with this processor's state manager.
 *
 * @param sourceTag Source tag to preview.
 * @returns The preview produced by previewUnsource.
 */
getUnsourcePreview(sourceTag: string): UnsourcePreview {
  const preview = _previewUnsource(sourceTag, this.stateManager);
  return preview;
}
|
|
323
|
+
|
|
324
|
+
/**
 * Applies a previously built unsource preview.
 *
 * The unsource module is loaded lazily; the work itself is delegated to its
 * executeUnsource export together with this processor's state manager.
 *
 * @param preview Preview describing what to unsource.
 * @returns The result reported by the unsource module.
 */
async executeUnsource(preview: UnsourcePreview): Promise<UnsourceResult> {
  const unsourceModule = await import("../integrations/document/unsource.js");
  return unsourceModule.executeUnsource(preview, this.stateManager);
}
|
|
328
|
+
|
|
285
329
|
/**
|
|
286
330
|
* Seed built-in tool providers and tools if they don't exist yet.
|
|
287
331
|
* Called on every startup (after state load/restore) — safe to call repeatedly.
|
|
@@ -1168,6 +1212,15 @@ const toolNextSteps = new Set([
|
|
|
1168
1212
|
await this.checkAndSyncCursor(human, now);
|
|
1169
1213
|
}
|
|
1170
1214
|
|
|
1215
|
+
if (
|
|
1216
|
+
this.isTUI &&
|
|
1217
|
+
human.settings?.personaHistory?.integration &&
|
|
1218
|
+
!human.settings.personaHistory.complete &&
|
|
1219
|
+
this.stateManager.queue_length() === 0
|
|
1220
|
+
) {
|
|
1221
|
+
await this.checkAndSyncPersonaHistory(human);
|
|
1222
|
+
}
|
|
1223
|
+
|
|
1171
1224
|
if (human.settings?.ceremony && shouldStartCeremony(human.settings.ceremony, this.stateManager)) {
|
|
1172
1225
|
if (human.settings?.sync && remoteSync.isConfigured()) {
|
|
1173
1226
|
const state = this.stateManager.getStorageState();
|
|
@@ -1180,7 +1233,7 @@ const toolNextSteps = new Set([
|
|
|
1180
1233
|
}
|
|
1181
1234
|
|
|
1182
1235
|
for (const persona of this.stateManager.persona_getAll()) {
|
|
1183
|
-
if (persona.is_paused || persona.is_archived) continue;
|
|
1236
|
+
if (persona.is_paused || persona.is_archived || persona.is_static) continue;
|
|
1184
1237
|
|
|
1185
1238
|
const defaultHeartbeatMs = this.stateManager.getHuman().settings?.default_heartbeat_ms ?? 1800000;
|
|
1186
1239
|
const heartbeatDelay = persona.heartbeat_delay_ms ?? defaultHeartbeatMs;
|
|
@@ -1408,6 +1461,32 @@ const toolNextSteps = new Set([
|
|
|
1408
1461
|
});
|
|
1409
1462
|
}
|
|
1410
1463
|
|
|
1464
|
+
/** True while a persona-history import started by this processor is still running. */
private personaHistoryImportInProgress = false;

/**
 * Starts a persona-history import in the background unless one is already running.
 *
 * The import is not awaited: this method returns as soon as the import has been
 * started. Failures are logged as warnings and never propagate to the caller.
 * The in-progress flag is cleared once the import settles, whatever the outcome.
 *
 * @param _human Unused by this method.
 */
private async checkAndSyncPersonaHistory(_human: HumanEntity): Promise<void> {
  if (this.personaHistoryImportInProgress) {
    return;
  }
  this.personaHistoryImportInProgress = true;

  const runImport = async (): Promise<void> => {
    try {
      const { importPersonaHistory } = await import("../integrations/persona-history/importer.js");
      const outcome = await importPersonaHistory({ stateManager: this.stateManager });
      if (outcome.scansQueued > 0) {
        const head = `[Processor] PersonaHistory: ${outcome.scansQueued} scans queued`;
        const tail = outcome.complete ? " — import complete" : "";
        console.log(head + tail);
      }
    } catch (err) {
      console.warn(`[Processor] PersonaHistory sync failed:`, err);
    } finally {
      this.personaHistoryImportInProgress = false;
    }
  };

  // Fire and forget: the flag above prevents a second import from starting meanwhile.
  void runImport();
}
|
|
1489
|
+
|
|
1411
1490
|
private augmentRoomRequest(request: LLMRequest): LLMRequest {
|
|
1412
1491
|
if (request.next_step !== LLMNextStep.HandleRoomResponse) return request;
|
|
1413
1492
|
|
|
@@ -1675,6 +1754,16 @@ const toolNextSteps = new Set([
|
|
|
1675
1754
|
if (typeof response.request.data.ceremony_progress === "number") {
|
|
1676
1755
|
handleCeremonyProgress(this.stateManager, response.request.data.ceremony_progress);
|
|
1677
1756
|
}
|
|
1757
|
+
|
|
1758
|
+
if (response.request.next_step === LLMNextStep.HandleDocumentSegmentation) {
|
|
1759
|
+
const batchId = response.request.data.batchId as string;
|
|
1760
|
+
const filename = response.request.data.filename as string;
|
|
1761
|
+
if (batchId && !this.stateManager.queue_hasPendingDocumentSegments(batchId)) {
|
|
1762
|
+
finishDocumentBatch(batchId, filename, this.stateManager);
|
|
1763
|
+
this.interface.onMessageAdded?.("emmet");
|
|
1764
|
+
this.interface.onHumanUpdated?.();
|
|
1765
|
+
}
|
|
1766
|
+
}
|
|
1678
1767
|
} catch (err) {
|
|
1679
1768
|
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
1680
1769
|
const result = this.stateManager.queue_fail(response.request.id, errorMsg);
|
|
@@ -18,6 +18,39 @@ export async function resumeQueue(sm: StateManager): Promise<void> {
|
|
|
18
18
|
}
|
|
19
19
|
|
|
20
20
|
export async function getQueueStatus(sm: StateManager): Promise<QueueStatus> {
|
|
21
|
+
const activeItems = sm.queue_getAllActiveItems();
|
|
22
|
+
const segmentationItems = activeItems.filter(
|
|
23
|
+
r => r.next_step === LLMNextStep.HandleDocumentSegmentation
|
|
24
|
+
);
|
|
25
|
+
|
|
26
|
+
const batchMap = new Map<string, { filename: string; count: number }>();
|
|
27
|
+
for (const item of segmentationItems) {
|
|
28
|
+
const { batchId, filename } = item.data as { batchId: string; filename: string };
|
|
29
|
+
if (!batchId || !filename) continue;
|
|
30
|
+
const existing = batchMap.get(batchId);
|
|
31
|
+
if (existing) {
|
|
32
|
+
existing.count++;
|
|
33
|
+
} else {
|
|
34
|
+
batchMap.set(batchId, { filename, count: 1 });
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
const pending_documents = batchMap.size > 0
|
|
39
|
+
? Array.from(batchMap.entries()).map(([batchId, { filename, count }]) => ({ batchId, filename, count }))
|
|
40
|
+
: undefined;
|
|
41
|
+
|
|
42
|
+
const extractingSet = new Set<string>();
|
|
43
|
+
for (const item of activeItems) {
|
|
44
|
+
const sources = item.data.sources as string[] | undefined;
|
|
45
|
+
if (!Array.isArray(sources)) continue;
|
|
46
|
+
for (const s of sources) {
|
|
47
|
+
if (typeof s === "string" && s.startsWith("import:document:")) {
|
|
48
|
+
extractingSet.add(s.slice("import:document:".length));
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
const extracting_documents = extractingSet.size > 0 ? Array.from(extractingSet) : undefined;
|
|
53
|
+
|
|
21
54
|
return {
|
|
22
55
|
state: sm.queue_isPaused()
|
|
23
56
|
? "paused"
|
|
@@ -27,6 +60,8 @@ export async function getQueueStatus(sm: StateManager): Promise<QueueStatus> {
|
|
|
27
60
|
pending_count: sm.queue_length(),
|
|
28
61
|
dlq_count: sm.queue_dlqLength(),
|
|
29
62
|
embedding_warning: sm.embedding_getWarning() || undefined,
|
|
63
|
+
pending_documents,
|
|
64
|
+
extracting_documents,
|
|
30
65
|
};
|
|
31
66
|
}
|
|
32
67
|
|
package/src/core/state/queue.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { LLMRequest, QueueFailResult } from "../types.js";
|
|
2
|
-
import { DLQ_MAX_COUNT, DLQ_MAX_AGE_DAYS } from "../types.js";
|
|
2
|
+
import { DLQ_MAX_COUNT, DLQ_MAX_AGE_DAYS, LLMNextStep } from "../types.js";
|
|
3
3
|
|
|
4
4
|
const BASE_BACKOFF_MS = 2_000;
|
|
5
5
|
const MAX_BACKOFF_MS = 30_000;
|
|
@@ -200,6 +200,14 @@ export class QueueState {
|
|
|
200
200
|
return this.queue.some(r => r.state !== "dlq" && typeof r.data.ceremony_progress === "number" && r.data.ceremony_progress > 0);
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
+
/**
 * Reports whether any queued request that is not in the dead-letter state is
 * a document-segmentation step belonging to the given batch.
 *
 * @param batchId Batch identifier stored in the request's data.
 * @returns true when at least one such request is still in the queue.
 */
hasPendingDocumentSegments(batchId: string): boolean {
  for (const request of this.queue) {
    if (request.state === "dlq") continue;
    if (request.next_step !== LLMNextStep.HandleDocumentSegmentation) continue;
    if (request.data.batchId === batchId) return true;
  }
  return false;
}
|
|
210
|
+
|
|
203
211
|
clear(): number {
|
|
204
212
|
const count = this.queue.filter(r => r.state !== "dlq").length;
|
|
205
213
|
this.queue = this.queue.filter(r => r.state === "dlq");
|
|
@@ -917,6 +917,10 @@ export class StateManager {
|
|
|
917
917
|
return this.queueState.hasPendingCeremonies();
|
|
918
918
|
}
|
|
919
919
|
|
|
920
|
+
/**
 * Reports whether the queue still holds document-segmentation work for a batch.
 * Delegates to the queue state; does not schedule a save.
 *
 * @param batchId Batch identifier to look for.
 */
queue_hasPendingDocumentSegments(batchId: string): boolean {
  const stillPending = this.queueState.hasPendingDocumentSegments(batchId);
  return stillPending;
}
|
|
923
|
+
|
|
920
924
|
queue_clear(): number {
|
|
921
925
|
const result = this.queueState.clear();
|
|
922
926
|
this.scheduleSave();
|
|
@@ -20,6 +20,11 @@ export interface OpenCodeSettings {
|
|
|
20
20
|
processed_sessions?: Record<string, string>; // sessionId → ISO timestamp of last import
|
|
21
21
|
}
|
|
22
22
|
|
|
23
|
+
/**
 * Per-user settings for the document-import integration.
 *
 * - extraction_model: model used for the segmentation requests queued by the
 *   document importer; when unset the importer falls back to the user's
 *   default_model.
 * - processed_documents: NOTE(review): not read anywhere in the visible code;
 *   presumably document name → ISO timestamp of last import, by analogy with
 *   processed_sessions — confirm.
 */
export interface DocumentSettings {
|
|
24
|
+
extraction_model?: string;
|
|
25
|
+
processed_documents?: Record<string, string>;
|
|
26
|
+
}
|
|
27
|
+
|
|
23
28
|
export interface CeremonyConfig {
|
|
24
29
|
time: string; // "HH:MM" format (e.g., "09:00")
|
|
25
30
|
last_ceremony?: string; // ISO timestamp
|
|
@@ -117,8 +122,10 @@ export interface HumanSettings {
|
|
|
117
122
|
backup?: BackupConfig;
|
|
118
123
|
claudeCode?: import("../../integrations/claude-code/types.js").ClaudeCodeSettings;
|
|
119
124
|
cursor?: import("../../integrations/cursor/types.js").CursorSettings;
|
|
125
|
+
document?: DocumentSettings;
|
|
120
126
|
active_theme?: string;
|
|
121
127
|
custom_themes?: ThemeDefinition[];
|
|
128
|
+
personaHistory?: import("../../integrations/persona-history/types.js").PersonaHistorySettings;
|
|
122
129
|
}
|
|
123
130
|
|
|
124
131
|
export interface HumanEntity {
|
|
@@ -202,3 +209,11 @@ export type ReservedPersonaName = typeof RESERVED_PERSONA_NAMES[number];
|
|
|
202
209
|
/**
 * Checks a persona name against RESERVED_PERSONA_NAMES, ignoring letter case.
 *
 * @param name Candidate persona name.
 * @returns true when the lower-cased name is one of the reserved names.
 */
export function isReservedPersonaName(name: string): boolean {
  const lowered = name.toLowerCase() as ReservedPersonaName;
  return RESERVED_PERSONA_NAMES.includes(lowered);
}
|
|
212
|
+
|
|
213
|
+
// Reserved persona IDs (built-in system personas that cannot be deleted)
export const RESERVED_PERSONA_IDS = ["ei", "emmet"] as const;
export type ReservedPersonaId = typeof RESERVED_PERSONA_IDS[number];

/**
 * Checks whether an ID belongs to one of the built-in personas.
 * The comparison is exact (case-sensitive), unlike the reserved-name check.
 *
 * @param id Persona ID to test.
 * @returns true when the ID is listed in RESERVED_PERSONA_IDS.
 */
export function isReservedPersonaId(id: string): boolean {
  return RESERVED_PERSONA_IDS.some((reserved) => reserved === id);
}
|
package/src/core/types/enums.ts
CHANGED
|
@@ -52,6 +52,7 @@ export enum LLMNextStep {
|
|
|
52
52
|
HandlePersonaPreview = "handlePersonaPreview",
|
|
53
53
|
HandleTopicValidate = "handleTopicValidate",
|
|
54
54
|
HandleReflectionCritic = "handleReflectionCritic",
|
|
55
|
+
HandleDocumentSegmentation = "handleDocumentSegmentation",
|
|
55
56
|
}
|
|
56
57
|
|
|
57
58
|
export enum ProviderType {
|
|
@@ -75,6 +75,8 @@ export interface QueueStatus {
|
|
|
75
75
|
current_operation?: string;
|
|
76
76
|
/** True when the embedding service failed and topic/person matching fell back to recent items. */
|
|
77
77
|
embedding_warning?: boolean;
|
|
78
|
+
pending_documents?: Array<{ batchId: string; filename: string; count: number }>;
|
|
79
|
+
extracting_documents?: string[];
|
|
78
80
|
}
|
|
79
81
|
|
|
80
82
|
export interface EiError {
|
package/src/core/types/llm.ts
CHANGED
|
@@ -27,6 +27,15 @@ export interface Message {
|
|
|
27
27
|
|
|
28
28
|
external?: boolean; // Set by integration importers (OpenCode, Cursor, Claude Code); invisible to LLM context
|
|
29
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Integration source tag. Set ONLY on external: true messages by importers (document, Slack, etc.)
|
|
32
|
+
* to identify which external source this synthetic message came from.
|
|
33
|
+
* Format: "import:document:filename" | "slack:channelId" | etc.
|
|
34
|
+
* Enables quote provenance tracing: quote.message_id → message.source_tag → original source.
|
|
35
|
+
* Never set on conversational messages.
|
|
36
|
+
*/
|
|
37
|
+
source_tag?: string;
|
|
38
|
+
|
|
30
39
|
}
|
|
31
40
|
|
|
32
41
|
export interface ChatMessage {
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { expandToWordBoundaries } from "../../core/handlers/human-matching.js";
|
|
2
|
+
|
|
3
|
+
const DEFAULT_CHUNK_CHARS = 6000;
|
|
4
|
+
const DEFAULT_OVERLAP_CHARS = 300;
|
|
5
|
+
|
|
6
|
+
const MARKDOWN_SEPARATORS = ["\n## ", "\n### ", "\n#### ", "\n\n", "\n", ". ", " ", ""];
|
|
7
|
+
const DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""];
|
|
8
|
+
|
|
9
|
+
/**
 * Splits text on a separator.
 *
 * The empty separator is the last-resort "split into characters" step. It
 * splits by Unicode code point rather than UTF-16 code unit, so a surrogate
 * pair (e.g. an emoji) is never cut in half and later placed in two different
 * chunks. Joining the pieces with "" always reproduces the input.
 *
 * @param text      Text to split.
 * @param separator Separator to split on; "" means one piece per code point.
 * @returns The pieces, without the separator.
 */
function splitOnSeparator(text: string, separator: string): string[] {
  if (separator === "") {
    // String iteration yields whole code points; String#split("") would not.
    return Array.from(text);
  }
  return text.split(separator);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Greedily packs consecutive pieces into chunks of at most `chunkSize` characters.
 *
 * Pieces packed into the same chunk are re-joined with `separator`. The
 * separator is not re-emitted at a chunk boundary. A single piece that is
 * itself longer than `chunkSize` is not split here; it becomes its own
 * oversized chunk.
 *
 * @param pieces    Pieces in document order.
 * @param separator String placed between pieces that share a chunk.
 * @param chunkSize Maximum chunk length in characters (string length).
 * @returns The packed chunks, in order; empty when nothing was accumulated.
 */
function mergeChunks(pieces: string[], separator: string, chunkSize: number): string[] {
  const merged: string[] = [];
  let current = "";

  for (const piece of pieces) {
    // An empty accumulator takes the piece as-is, without a leading separator.
    const candidate = current ? current + separator + piece : piece;
    if (candidate.length <= chunkSize) {
      current = candidate;
      continue;
    }
    // Candidate would overflow: flush what we have and start over from this piece.
    if (current) merged.push(current);
    current = piece;
  }

  if (current) merged.push(current);
  return merged;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Splits text into chunks of at most `chunkSize` characters where possible,
 * trying the separators in the given order.
 *
 * Text that already fits is returned as a single chunk. Otherwise the text is
 * split on the first separator; any piece that is still too long is split
 * again with the remaining separators. The resulting pieces are then packed
 * back together with mergeChunks using the first separator.
 *
 * @param text       Text to split.
 * @param separators Separators to try, coarsest first.
 * @param chunkSize  Target maximum chunk length in characters.
 * @returns Chunks in document order; a piece can exceed `chunkSize` when no
 *          separators remain to split it further.
 */
function recursiveSplit(
  text: string,
  separators: string[],
  chunkSize: number
): string[] {
  if (text.length <= chunkSize) return [text];

  const [head, ...rest] = separators;
  // No separators left: give the text back unsplit.
  if (head === undefined) return [text];

  const fragments = splitOnSeparator(text, head).flatMap((fragment) => {
    const needsFinerSplit = fragment.length > chunkSize && rest.length > 0;
    return needsFinerSplit ? recursiveSplit(fragment, rest, chunkSize) : [fragment];
  });

  return mergeChunks(fragments, head, chunkSize);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Prefixes every chunk except the first with the tail of the chunk before it.
 *
 * The tail starts roughly `overlapChars` characters before the end of the
 * previous chunk; the exact start index is whatever expandToWordBoundaries
 * returns for that position. The tail is always taken from the original
 * previous chunk, not from its already-prefixed version.
 *
 * @param chunks       Chunks in document order.
 * @param overlapChars Approximate number of trailing characters to repeat.
 * @returns The input array itself when there is nothing to overlap, otherwise
 *          a new array of prefixed chunks.
 */
function applyOverlap(chunks: string[], overlapChars: number): string[] {
  const nothingToDo = overlapChars <= 0 || chunks.length <= 1;
  if (nothingToDo) return chunks;

  const overlapped: string[] = [chunks[0]];
  for (let idx = 1; idx < chunks.length; idx++) {
    const previous = chunks[idx - 1];
    const tailFrom = Math.max(0, previous.length - overlapChars);
    const bounds = expandToWordBoundaries(previous, tailFrom, tailFrom);
    overlapped.push(previous.slice(bounds.start) + chunks[idx]);
  }
  return overlapped;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Splits text into overlapping chunks.
 *
 * The text is split recursively with either the markdown or the default
 * separator list, whitespace-only chunks are dropped, and each remaining chunk
 * (except the first) is prefixed with the tail of its predecessor.
 *
 * @param text    Text to split.
 * @param options Optional tuning: `chunkSize` (default DEFAULT_CHUNK_CHARS),
 *                `overlap` (default DEFAULT_OVERLAP_CHARS) and `isMarkdown`
 *                (selects the markdown separator list).
 * @returns Chunks in document order. Because the overlap is added after
 *          splitting, a chunk can be longer than `chunkSize`.
 */
export function recursiveCharacterSplit(
  text: string,
  options?: { chunkSize?: number; overlap?: number; isMarkdown?: boolean }
): string[] {
  const maxChars = options?.chunkSize ?? DEFAULT_CHUNK_CHARS;
  const overlapChars = options?.overlap ?? DEFAULT_OVERLAP_CHARS;

  let separatorList = DEFAULT_SEPARATORS;
  if (options?.isMarkdown) {
    separatorList = MARKDOWN_SEPARATORS;
  }

  const meaningful = recursiveSplit(text, separatorList, maxChars).filter(
    (piece) => piece.trim().length > 0
  );
  return applyOverlap(meaningful, overlapChars);
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import type { PersonaEntity } from "../../core/types.js";
|
|
2
|
+
import { LLMRequestType, LLMPriority, LLMNextStep } from "../../core/types.js";
|
|
3
|
+
import { EMMETT_PERSONA_DEFINITION } from "../../templates/emmett.js";
|
|
4
|
+
import { recursiveCharacterSplit } from "./chunker.js";
|
|
5
|
+
import type { DocumentImportOptions, DocumentImportResult } from "./types.js";
|
|
6
|
+
|
|
7
|
+
const SEGMENTATION_SYSTEM_PROMPT = `You are a document segmentation assistant. Your job is to identify natural conceptual boundaries in document content and split it into coherent segments suitable for knowledge extraction. Each segment should be a self-contained unit of information.`;
|
|
8
|
+
|
|
9
|
+
const SEGMENTATION_USER_TEMPLATE = `Split the following document content into conceptual segments. Return a JSON array of strings, where each string is one segment. Preserve all original text — do not summarize or paraphrase. Identify boundaries at topic shifts, section changes, or logical breaks.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
{content}`;
|
|
14
|
+
|
|
15
|
+
/**
 * Queues a document for LLM segmentation under the built-in "emmet" persona.
 *
 * Steps, in order:
 * 1. Make sure the "emmet" persona exists and is not archived (creating it
 *    from EMMETT_PERSONA_DEFINITION and firing onPersonaAdded when missing).
 * 2. Remove external messages previously imported from the same filename
 *    (matched by source tag `import:document:<filename>`).
 * 3. If the abort signal is already set, stop here with nothing queued.
 * 4. Pre-chunk the content and enqueue one low-priority JSON request per
 *    chunk, all sharing a freshly generated batch ID.
 *
 * @param options State manager, interface callbacks, raw content, filename and
 *                an optional abort signal.
 * @returns How many chunks were queued, the document name, and the batch ID
 *          when at least one chunk was queued.
 */
export async function importDocument(options: DocumentImportOptions): Promise<DocumentImportResult> {
  const { stateManager, interface: eiInterface, content: rawContent, filename, signal } = options;

  const lowerName = filename.toLowerCase();
  const isMarkdown = lowerName.endsWith(".md") || lowerName.endsWith(".markdown");

  const result: DocumentImportResult = {
    chunksQueued: 0,
    documentName: filename,
  };

  const emmett = stateManager.persona_getById("emmet");
  if (emmett?.is_archived) {
    stateManager.persona_unarchive("emmet");
  }
  if (!emmett) {
    const emmettEntity: PersonaEntity = {
      ...EMMETT_PERSONA_DEFINITION,
      id: "emmet",
      display_name: "Emmett",
      last_updated: new Date().toISOString(),
    };
    stateManager.persona_add(emmettEntity);
    eiInterface.onPersonaAdded?.();
  }

  // Re-importing a file replaces its earlier synthetic messages.
  const sourceTag = `import:document:${filename}`;
  const existingMsgs = stateManager.messages_get("emmet");
  const staleIds = existingMsgs
    .filter(m => m.external === true && m.source_tag === sourceTag)
    .map(m => m.id);
  if (staleIds.length > 0) {
    stateManager.messages_remove("emmet", staleIds);
  }

  if (signal?.aborted) return result;

  const preChunks = recursiveCharacterSplit(rawContent, { isMarkdown });

  if (preChunks.length === 0) return result;

  const batchId = crypto.randomUUID();
  const docSettings = stateManager.getHuman().settings?.document;
  const model = docSettings?.extraction_model ?? stateManager.getHuman().settings?.default_model;

  for (let i = 0; i < preChunks.length; i++) {
    const chunk = preChunks[i];
    stateManager.queue_enqueue({
      type: LLMRequestType.JSON,
      priority: LLMPriority.Low,
      system: SEGMENTATION_SYSTEM_PROMPT,
      // Replacer function, not a replacement string: a string would have "$&",
      // "$$", "$`" and "$'" inside the document text expanded as special
      // replacement patterns, silently corrupting the prompt.
      user: SEGMENTATION_USER_TEMPLATE.replace("{content}", () => chunk),
      next_step: LLMNextStep.HandleDocumentSegmentation,
      model,
      data: {
        batchId,
        filename,
        chunkIndex: i,
        originalContent: chunk,
      },
    });
  }

  result.chunksQueued = preChunks.length;
  result.batchId = batchId;
  return result;
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import type { UnsourcePreview, UnsourceResult } from "./unsource.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Renders an unsource "invoice" as markdown. No filesystem access, so it is
 * safe to call in any environment; the only outside input is the current
 * time, which is written into the "Run at" line.
 *
 * Layout: a title with the source tag, the run timestamp, a "Deleted" section
 * listing every fact, topic, person and quote slated for deletion (quotes are
 * cut to 80 characters plus "..."), and — only when something was stripped
 * rather than deleted — a "Retained" section listing those items.
 *
 * @param preview Items selected for deletion / stripping; supplies the names listed.
 * @param result  Counts of what was actually deleted / stripped; supplies the totals.
 * @returns The markdown document, terminated by a newline.
 */
export function generateInvoiceMarkdown(preview: UnsourcePreview, result: UnsourceResult): string {
  const runAt = new Date().toISOString();

  const { deleted, stripped } = result;
  const deletedTotal = deleted.facts + deleted.topics + deleted.people;
  const strippedTotal = stripped.facts + stripped.topics + stripped.people;

  const { toDelete, toStrip } = preview;

  const out: string[] = [
    `# Unsource: ${preview.sourceTag}`,
    `Run at: ${runAt}`,
    "",
    `## Deleted (${deletedTotal} items, ${deleted.quotes} quotes)`,
    ...toDelete.facts.map((fact) => `- [Fact] ${fact.name}`),
    ...toDelete.topics.map((topic) => `- [Topic] ${topic.name}`),
    ...toDelete.people.map((person) => `- [Person] ${person.name}`),
    ...toDelete.quotes.map((quote) => {
      const shown = quote.text.length > 80 ? `${quote.text.slice(0, 80)}...` : quote.text;
      return `- [Quote] "${shown}"`;
    }),
  ];

  const hasRetained = [toStrip.facts, toStrip.topics, toStrip.people].some(
    (group) => group.length > 0
  );
  if (hasRetained) {
    out.push(
      "",
      `## Retained — shared with other sources (${strippedTotal} items)`,
      `Source removed from these items. They had additional sources or non-Emmett personas.`,
      "",
      ...toStrip.facts.map((fact) => `- [Fact] ${fact.name}`),
      ...toStrip.topics.map((topic) => `- [Topic] ${topic.name}`),
      ...toStrip.people.map((person) => `- [Person] ${person.name}`),
      "",
      `Run \`/me topics\` or \`/me people\` to review or delete retained items manually.`
    );
  }

  return out.join("\n") + "\n";
}
|
|
48
|
+
|
|
49
|
+
/**
 * Writes the unsource invoice to `<dataPath>/unsourced/`, creating the
 * directory when needed.
 *
 * The file is named `<timestamp>-<tag>.md`, where the timestamp is the current
 * ISO time with ":" and "." replaced by "-", and the tag is the preview's
 * source tag with every character outside `[a-zA-Z0-9._-]` replaced by "_".
 *
 * @param preview  Preview the invoice is generated from.
 * @param result   Result the invoice is generated from.
 * @param dataPath Base directory under which the "unsourced" folder lives.
 * @returns Full path of the file that was written.
 */
export async function writeUnsourceInvoice(
  preview: UnsourcePreview,
  result: UnsourceResult,
  dataPath: string
): Promise<string> {
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const safeTag = preview.sourceTag.replace(/[^a-zA-Z0-9._-]/g, "_");

  const targetDir = join(dataPath, "unsourced");
  const targetFile = join(targetDir, `${stamp}-${safeTag}.md`);

  await mkdir(targetDir, { recursive: true });
  const markdown = generateInvoiceMarkdown(preview, result);
  await writeFile(targetFile, markdown, "utf8");

  return targetFile;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { StateManager } from "../../core/state-manager.js";
|
|
2
|
+
import type { Ei_Interface } from "../../core/types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Input for the document importer.
 *
 * - stateManager: used to look up / create the "emmet" persona, remove stale
 *   imported messages, read the user's settings and enqueue requests.
 * - interface: callback surface; the importer fires onPersonaAdded when it
 *   creates the persona.
 * - content: raw document text to be chunked.
 * - filename: document name; its extension decides markdown vs. plain
 *   chunking and it is embedded in the `import:document:<filename>` source tag.
 * - signal: optional abort signal, checked once before chunking starts.
 */
export interface DocumentImportOptions {
|
|
5
|
+
stateManager: StateManager;
|
|
6
|
+
interface: Ei_Interface;
|
|
7
|
+
content: string;
|
|
8
|
+
filename: string;
|
|
9
|
+
signal?: AbortSignal;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/**
 * Outcome of a document import.
 *
 * - chunksQueued: number of pre-chunks enqueued for segmentation (0 when the
 *   import was aborted or the document produced no chunks).
 * - documentName: the filename that was passed to the importer.
 * - batchId: ID shared by all requests queued for this import; only set when
 *   at least one chunk was queued.
 */
export interface DocumentImportResult {
|
|
13
|
+
chunksQueued: number;
|
|
14
|
+
documentName: string;
|
|
15
|
+
batchId?: string;
|
|
16
|
+
}
|