@nguyentamdat/mempalace 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/miner.ts ADDED
@@ -0,0 +1,612 @@
1
+ import { createHash } from "node:crypto";
2
+ import {
3
+ existsSync,
4
+ mkdirSync,
5
+ readdirSync,
6
+ readFileSync,
7
+ statSync,
8
+ } from "node:fs";
9
+ import { basename, extname, relative, resolve } from "node:path";
10
+ import { ChromaClient, DefaultEmbeddingFunction, IncludeEnum } from "chromadb";
11
+ import yaml from "js-yaml";
12
+
13
/**
 * Lower-case file extensions (leading dot included) that the project scanner
 * accepts; files with any other extension are ignored.
 */
export const READABLE_EXTENSIONS = new Set([
  ".txt", ".md",
  ".py", ".js", ".ts", ".jsx", ".tsx",
  ".json", ".yaml", ".yml",
  ".html", ".css",
  ".java", ".go", ".rs", ".rb", ".sh",
  ".csv", ".sql", ".toml",
]);
35
+
36
/**
 * Directory names the project scanner never descends into. Matching is done
 * on the bare directory name, at any depth.
 */
export const SKIP_DIRS = new Set([
  ".git", "node_modules", "__pycache__",
  ".venv", "venv", "env",
  "dist", "build", ".next", "coverage",
  ".mempalace",
]);
49
+
50
/** Upper bound, in characters, on the window considered for a single chunk. */
export const CHUNK_SIZE = 800;

/** Number of characters the next chunk re-reads from the end of the previous one. */
export const CHUNK_OVERLAP = 100;

/** Chunks (and whole files) shorter than this many characters are discarded. */
export const MIN_CHUNK_SIZE = 50;
53
+
54
/** Name of the Chroma collection that stores every drawer. */
const COLLECTION_NAME = "mempalace_drawers";

/** Exact file names the project scanner leaves out even if the extension is readable. */
const SKIP_FILES = new Set([
  "mempalace.yaml", "mempalace.yml",
  "mempal.yaml", "mempal.yml",
  ".gitignore",
  "package-lock.json",
]);
63
+
64
/** One room entry as read from the project's config file. */
type RoomConfig = {
  /** Room identifier; also used as an implicit keyword during detection. */
  name: string;
  description?: string;
  /** Extra terms counted in file content when scoring this room. */
  keywords?: string[];
};

/** The validated subset of the project config that the miner consumes. */
type LoadedConfig = {
  wing: string;
  rooms?: RoomConfig[];
};

/** A slice of file text together with its ordinal position within the file. */
type Chunk = {
  content: string;
  chunkIndex: number;
};

/** Options-object form of the arguments accepted by `mine`. */
type MineOptions = {
  projectDir: string;
  palacePath: string;
  /** When set, takes precedence over the wing named in the config file. */
  wingOverride?: string;
  agent?: string;
  /** Maximum number of files to process; values <= 0 mean "no limit". */
  limit?: number;
  dryRun?: boolean;
};
88
+
89
/** The resolved collection handle type produced by `ChromaClient#getCollection`. */
type DrawerCollection = Awaited<
  ReturnType<ChromaClient["getCollection"]>
>;
90
+
91
/**
 * Type guard for "any non-null object" so its properties can be probed
 * safely. Arrays also satisfy this check.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
94
+
95
/**
 * Converts the raw `rooms` value from the parsed config into typed room
 * entries.
 *
 * @param value - Whatever the YAML parser produced for the `rooms` key.
 * @returns `undefined` when `value` is not an array; otherwise one entry per
 *   element that is a non-null object with a string `name`. Non-string
 *   descriptions become `undefined`, a non-array `keywords` becomes
 *   `undefined`, and non-string keyword items are dropped.
 */
function parseRooms(value: unknown): RoomConfig[] | undefined {
  if (!Array.isArray(value)) {
    return undefined;
  }

  const onlyStrings = (entry: unknown): entry is string =>
    typeof entry === "string";

  return (value as unknown[]).flatMap((candidate): RoomConfig[] => {
    // Anything that is not a non-null object cannot carry a room definition.
    if (typeof candidate !== "object" || candidate === null) {
      return [];
    }

    const { name, description, keywords } = candidate as Record<
      string,
      unknown
    >;
    if (typeof name !== "string") {
      return [];
    }

    return [
      {
        name,
        description: typeof description === "string" ? description : undefined,
        keywords: Array.isArray(keywords)
          ? keywords.filter(onlyStrings)
          : undefined,
      },
    ];
  });
}
122
+
123
/**
 * Builds a horizontal rule for console output.
 *
 * @param char - String to repeat.
 * @param width - Number of repetitions (default 55).
 */
function formatRule(char: string, width = 55): string {
  const rule = char.repeat(width);
  return rule;
}
126
+
127
/**
 * Folds the two call shapes of `mine` (positional arguments or a single
 * options object) into one fully defaulted `MineOptions` value.
 *
 * When an options object is passed, the positional parameters are ignored.
 *
 * @throws Error if the positional form is used without a string `palacePath`.
 */
function normalizeMineArgs(
  projectDirOrOptions: string | MineOptions,
  palacePath?: string,
  wingOverride?: string,
  agent = "mempalace",
  limit = 0,
  dryRun = false,
): MineOptions {
  // Options-object form: copy the fields across, filling in defaults.
  if (typeof projectDirOrOptions !== "string") {
    const given = projectDirOrOptions;
    return {
      projectDir: given.projectDir,
      palacePath: given.palacePath,
      wingOverride: given.wingOverride,
      agent: given.agent ?? "mempalace",
      limit: given.limit ?? 0,
      dryRun: given.dryRun ?? false,
    };
  }

  // Positional form: the parameter defaults have already been applied.
  if (typeof palacePath !== "string") {
    throw new Error("palacePath is required");
  }

  const projectDir = projectDirOrOptions;
  return { projectDir, palacePath, wingOverride, agent, limit, dryRun };
}
159
+
160
/**
 * Reads the MemPalace config for a project.
 *
 * Looks for `mempalace.yaml` in the resolved project directory first and
 * falls back to the legacy `mempal.yaml`. If neither exists, an error is
 * printed and the process exits with status 1.
 *
 * @param projectDir - Project root (resolved to an absolute path).
 * @returns The config's `wing` plus its parsed `rooms` (if any).
 * @throws Error when the parsed YAML is not an object with a string `wing`.
 */
export function loadConfig(projectDir: string): LoadedConfig {
  const root = resolve(projectDir);
  const primaryFile = `${root}/mempalace.yaml`;
  const legacyFile = `${root}/mempal.yaml`;

  let configFile: string | undefined;
  if (existsSync(primaryFile)) {
    configFile = primaryFile;
  } else if (existsSync(legacyFile)) {
    configFile = legacyFile;
  }

  if (configFile === undefined) {
    console.log(`ERROR: No mempalace.yaml found in ${projectDir}`);
    console.log(`Run: mempalace init ${projectDir}`);
    process.exit(1);
  }

  const document = yaml.load(readFileSync(configFile, "utf-8"));
  const usable = isRecord(document) && typeof document.wing === "string";
  if (!usable) {
    throw new Error(`Invalid MemPalace config: ${configFile}`);
  }

  const { wing, rooms } = document as { wing: string; rooms?: unknown };
  return { wing, rooms: parseRooms(rooms) };
}
186
+
187
/**
 * Counts non-overlapping occurrences of `needle` in `haystack`.
 * An empty needle counts as zero matches: `indexOf("")` always succeeds
 * without advancing, so scanning for it would never terminate.
 */
function countKeywordHits(haystack: string, needle: string): number {
  if (needle.length === 0) {
    return 0;
  }

  let hits = 0;
  let from = haystack.indexOf(needle);
  while (from !== -1) {
    hits += 1;
    from = haystack.indexOf(needle, from + needle.length);
  }
  return hits;
}

/**
 * Picks the room a file belongs to, trying three strategies in order:
 *
 * 1. A directory segment of the file's project-relative path contains, or is
 *    contained in, a room name.
 * 2. The file name (without extension) contains, or is contained in, a room
 *    name.
 * 3. The room whose keywords and name occur most often in the first 2000
 *    characters of `content`; ties go to the room listed first.
 *
 * All comparisons are case-insensitive.
 *
 * @param filepath - Path of the file being classified.
 * @param content - File text; only the first 2000 characters are inspected.
 * @param rooms - Candidate rooms, in priority order.
 * @param projectPath - Project root used to relativize `filepath`.
 * @returns The matching room's `name` as configured, or `"general"` when
 *   nothing matches.
 */
export function detectRoom(
  filepath: string,
  content: string,
  rooms: RoomConfig[],
  projectPath: string,
): string {
  const relativePath = relative(projectPath, filepath).toLowerCase();
  const filename = basename(filepath, extname(filepath)).toLowerCase();
  const contentLower = content.slice(0, 2000).toLowerCase();

  // Strategy 1: directory segments (the last segment is the file itself).
  const pathParts = relativePath.replace(/\\/g, "/").split("/");
  for (const part of pathParts.slice(0, -1)) {
    for (const room of rooms) {
      const roomName = room.name.toLowerCase();
      if (roomName.includes(part) || part.includes(roomName)) {
        return room.name;
      }
    }
  }

  // Strategy 2: file name without its extension.
  for (const room of rooms) {
    const roomName = room.name.toLowerCase();
    if (roomName.includes(filename) || filename.includes(roomName)) {
      return room.name;
    }
  }

  // Strategy 3: keyword frequency. Scores are keyed by room name, so rooms
  // sharing a name pool their hits.
  const scores = new Map<string, number>();
  for (const room of rooms) {
    let hits = 0;
    for (const keyword of [...(room.keywords ?? []), room.name]) {
      hits += countKeywordHits(contentLower, keyword.toLowerCase());
    }
    scores.set(room.name, (scores.get(room.name) ?? 0) + hits);
  }

  let bestRoom = "general";
  let bestScore = 0;
  for (const [roomName, score] of scores) {
    // Strictly greater keeps the earliest room on ties.
    if (score > bestScore) {
      bestRoom = roomName;
      bestScore = score;
    }
  }

  return bestRoom;
}
246
+
247
/**
 * Splits text into overlapping chunks of at most `CHUNK_SIZE` characters.
 *
 * Each chunk is preferably cut at a blank line, otherwise at a line break,
 * provided that break lies in the second half of the window; failing both,
 * the window is cut at its hard limit. Consecutive chunks overlap by
 * `CHUNK_OVERLAP` characters. Chunks shorter than `MIN_CHUNK_SIZE` after
 * trimming are discarded and do not consume an index.
 *
 * @param content - Raw text; surrounding whitespace is trimmed first.
 * @param _sourceFile - Unused.
 * @returns Chunks in document order with consecutive `chunkIndex` values.
 */
export function chunkText(content: string, _sourceFile: string): Chunk[] {
  const text = content.trim();
  const pieces: Chunk[] = [];
  if (text.length === 0) {
    return pieces;
  }

  const halfWindow = Math.floor(CHUNK_SIZE / 2);
  let cursor = 0;

  while (cursor < text.length) {
    const hardEnd = Math.min(cursor + CHUNK_SIZE, text.length);
    let cut = hardEnd;

    if (hardEnd < text.length) {
      // A break is only acceptable past the midpoint of the window, so that
      // chunks do not become too short.
      const earliestBreak = cursor + halfWindow;
      const paragraphBreak = text.lastIndexOf("\n\n", hardEnd);
      if (paragraphBreak > earliestBreak) {
        cut = paragraphBreak;
      } else {
        const lineBreak = text.lastIndexOf("\n", hardEnd);
        if (lineBreak > earliestBreak) {
          cut = lineBreak;
        }
      }
    }

    const piece = text.slice(cursor, cut).trim();
    if (piece.length >= MIN_CHUNK_SIZE) {
      // Indices are handed out only to chunks that are kept.
      pieces.push({ content: piece, chunkIndex: pieces.length });
    }

    // Step back by the overlap unless the end of the text was reached.
    cursor = cut < text.length ? cut - CHUNK_OVERLAP : cut;
  }

  return pieces;
}
289
+
290
/**
 * Ensures the palace directory exists and returns the drawer collection,
 * creating the collection when it is missing.
 *
 * NOTE(review): the client is constructed with no arguments, so `palacePath`
 * is only used to create the directory and is not forwarded to Chroma.
 * Confirm this is intended.
 *
 * @param palacePath - Directory to create (recursively) if absent.
 */
export async function getCollection(
  palacePath: string,
): Promise<DrawerCollection> {
  mkdirSync(palacePath, { recursive: true });

  const chroma = new ChromaClient();
  const drawers = await chroma.getOrCreateCollection({
    name: COLLECTION_NAME,
  });
  return drawers;
}
297
+
298
/**
 * Reports whether the collection already holds at least one drawer whose
 * `source_file` metadata equals `sourceFile`.
 *
 * Any error raised by the lookup is treated as "not mined" and `false` is
 * returned.
 */
export async function fileAlreadyMined(
  collection: DrawerCollection,
  sourceFile: string,
): Promise<boolean> {
  try {
    // One hit is enough, so ask for a single record.
    const { ids } = await collection.get({
      where: { source_file: sourceFile },
      limit: 1,
    });
    return ids.length !== 0;
  } catch {
    return false;
  }
}
312
+
313
/**
 * Stores one chunk of a file as a drawer in the collection.
 *
 * The drawer id is `drawer_<wing>_<room>_<hash>`, where `<hash>` is the first
 * 16 hex characters of the MD5 digest of `sourceFile` followed by the chunk
 * index, so re-adding the same chunk produces the same id.
 *
 * @returns `true` when the drawer was added; `false` when the store rejected
 *   it with a message containing "already exists" or "duplicate".
 * @throws Re-throws any other error raised by the store.
 */
export async function addDrawer(
  collection: DrawerCollection,
  wing: string,
  room: string,
  content: string,
  sourceFile: string,
  chunkIndex: number,
  agent: string,
): Promise<boolean> {
  const digest = createHash("md5")
    .update(`${sourceFile}${chunkIndex}`)
    .digest("hex");
  const drawerId = `drawer_${wing}_${room}_${digest.slice(0, 16)}`;

  const metadata = {
    wing,
    room,
    source_file: sourceFile,
    chunk_index: chunkIndex,
    added_by: agent,
    filed_at: new Date().toISOString(),
  };

  try {
    await collection.add({
      documents: [content],
      ids: [drawerId],
      metadatas: [metadata],
    });
    return true;
  } catch (error) {
    // Only genuine Error objects carry a message worth inspecting.
    const reason = error instanceof Error ? error.message.toLowerCase() : "";
    if (reason.includes("already exists") || reason.includes("duplicate")) {
      return false;
    }
    throw error;
  }
}
353
+
354
/**
 * Mines a single file: reads it, assigns it a room, chunks it and files each
 * chunk as a drawer.
 *
 * @param filepath - File to read; also recorded as each drawer's source.
 * @param projectPath - Project root, used for room detection.
 * @param collection - Target collection; may be `null` only for dry runs.
 * @param wing - Wing recorded on every drawer.
 * @param rooms - Candidate rooms for detection.
 * @param agent - Recorded as the drawer's `added_by`.
 * @param dryRun - When true nothing is stored; a summary line is printed.
 * @returns For dry runs, the number of chunks that would be filed; otherwise
 *   the number of drawers actually added. Returns 0 when the file was
 *   already mined, cannot be read, or is shorter than `MIN_CHUNK_SIZE`.
 * @throws Error when `collection` is `null` and `dryRun` is false.
 */
export async function processFile(
  filepath: string,
  projectPath: string,
  collection: DrawerCollection | null,
  wing: string,
  rooms: RoomConfig[],
  agent: string,
  dryRun: boolean,
): Promise<number> {
  const sourceFile = filepath;

  if (!dryRun) {
    if (collection === null) {
      throw new Error("Collection is required when dryRun is false");
    }
    if (await fileAlreadyMined(collection, sourceFile)) {
      return 0;
    }
  }

  let text: string;
  try {
    text = readFileSync(filepath, { encoding: "utf-8" }).trim();
  } catch {
    // Unreadable files are skipped rather than aborting the whole run.
    return 0;
  }

  if (text.length < MIN_CHUNK_SIZE) {
    return 0;
  }

  const room = detectRoom(filepath, text, rooms, projectPath);
  const chunks = chunkText(text, sourceFile);

  if (dryRun) {
    console.log(
      ` [DRY RUN] ${basename(filepath)} → room:${room} (${chunks.length} drawers)`,
    );
    return chunks.length;
  }

  // Already verified above for real runs; repeated so the compiler can
  // narrow the type for the loop below.
  if (collection === null) {
    throw new Error("Collection is required when dryRun is false");
  }

  let filed = 0;
  for (const { content: body, chunkIndex } of chunks) {
    const stored = await addDrawer(
      collection,
      wing,
      room,
      body,
      sourceFile,
      chunkIndex,
      agent,
    );
    filed += stored ? 1 : 0;
  }

  return filed;
}
419
+
420
/**
 * Recursively lists the files under a project that are eligible for mining.
 *
 * Directories named in `SKIP_DIRS` are not entered. A regular file is kept
 * when its lower-cased extension is in `READABLE_EXTENSIONS`, its name is not
 * in `SKIP_FILES`, and it can be stat'ed. Entries that are neither
 * directories nor regular files are ignored.
 *
 * @param projectDir - Project root (resolved to an absolute path).
 * @returns File paths in directory-listing order, depth first.
 */
export function scanProject(projectDir: string): string[] {
  const found: string[] = [];

  function visit(dir: string): void {
    const entries = readdirSync(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = `${dir}/${entry.name}`;

      if (entry.isDirectory()) {
        if (!SKIP_DIRS.has(entry.name)) {
          visit(fullPath);
        }
        continue;
      }

      const eligible =
        entry.isFile() &&
        READABLE_EXTENSIONS.has(extname(entry.name).toLowerCase()) &&
        !SKIP_FILES.has(entry.name);
      if (!eligible) {
        continue;
      }

      try {
        statSync(fullPath);
        found.push(fullPath);
      } catch {
        // The file could not be stat'ed; leave it out.
      }
    }
  }

  visit(resolve(projectDir));
  return found;
}
456
+
457
/**
 * Mines a project directory into the palace: loads the project config, scans
 * for eligible files, files each one as drawers and prints a progress report
 * plus a summary.
 *
 * Accepts either positional arguments or a single `MineOptions` object.
 *
 * NOTE(review): the "By room" tally re-detects each file's room with empty
 * content, so only the path and file-name rules apply there; it can differ
 * from the room the drawers were actually filed under.
 */
export async function mine(
  projectDir: string,
  palacePath: string,
  wingOverride?: string,
  agent?: string,
  limit?: number,
  dryRun?: boolean,
): Promise<void>;
export async function mine(options: MineOptions): Promise<void>;
export async function mine(
  projectDirOrOptions: string | MineOptions,
  palacePath?: string,
  wingOverride?: string,
  agent = "mempalace",
  limit = 0,
  dryRun = false,
): Promise<void> {
  const options = normalizeMineArgs(
    projectDirOrOptions,
    palacePath,
    wingOverride,
    agent,
    limit,
    dryRun,
  );
  const isDryRun = options.dryRun ?? false;
  const filedBy = options.agent ?? "mempalace";
  const fileCap = options.limit ?? 0;

  const projectPath = resolve(options.projectDir);
  const config = loadConfig(options.projectDir);
  const wing = options.wingOverride ?? config.wing;
  const rooms = config.rooms ?? [
    { name: "general", description: "All project files" },
  ];

  // A cap of zero or less means "process everything".
  const scanned = scanProject(options.projectDir);
  const files = fileCap > 0 ? scanned.slice(0, fileCap) : scanned;

  const heavyRule = formatRule("=");
  console.log(`\n${heavyRule}`);
  console.log(" MemPalace Mine");
  console.log(heavyRule);
  console.log(` Wing: ${wing}`);
  console.log(` Rooms: ${rooms.map((room) => room.name).join(", ")}`);
  console.log(` Files: ${files.length}`);
  console.log(` Palace: ${options.palacePath}`);
  if (isDryRun) {
    console.log(" DRY RUN — nothing will be filed");
  }
  console.log(`${formatRule("─")}\n`);

  // Dry runs never touch the store, so no collection is opened for them.
  const collection = isDryRun ? null : await getCollection(options.palacePath);

  let totalDrawers = 0;
  let filesSkipped = 0;
  const roomCounts = new Map<string, number>();

  let position = 0;
  for (const filepath of files) {
    position += 1;

    const drawers = await processFile(
      filepath,
      projectPath,
      collection,
      wing,
      rooms,
      filedBy,
      isDryRun,
    );

    if (drawers === 0 && !isDryRun) {
      filesSkipped += 1;
      continue;
    }

    totalDrawers += drawers;
    const room = detectRoom(filepath, "", rooms, projectPath);
    roomCounts.set(room, (roomCounts.get(room) ?? 0) + 1);

    if (isDryRun) {
      continue;
    }

    const label = basename(filepath).slice(0, 50).padEnd(50);
    console.log(
      ` ✓ [${String(position).padStart(4)}/${files.length}] ${label} +${drawers}`,
    );
  }

  console.log(`\n${heavyRule}`);
  console.log(" Done.");
  console.log(` Files processed: ${files.length - filesSkipped}`);
  console.log(` Files skipped (already filed): ${filesSkipped}`);
  console.log(` Drawers filed: ${totalDrawers}`);
  console.log("\n By room:");

  // Busiest rooms first.
  const byCountDesc = Array.from(roomCounts).sort(
    ([, leftCount], [, rightCount]) => rightCount - leftCount,
  );
  for (const [room, count] of byCountDesc) {
    console.log(` ${room.padEnd(20)} ${count} files`);
  }

  console.log('\n Next: mempalace search "what you\'re looking for"');
  console.log(`${heavyRule}\n`);
}
561
+
562
/**
 * Prints a per-wing, per-room drawer count for the palace.
 *
 * Fetches the metadata of up to 10000 drawers and groups them by their
 * `wing` and `room` values (`"?"` when missing). Wings are listed
 * alphabetically; rooms within a wing by descending drawer count.
 *
 * Any failure along the way is reported as "No palace found" rather than
 * thrown.
 *
 * @param palacePath - Palace directory; created if absent.
 */
export async function status(palacePath: string): Promise<void> {
  try {
    mkdirSync(palacePath, { recursive: true });

    const chroma = new ChromaClient();
    const collection = await chroma.getCollection({
      name: COLLECTION_NAME,
      embeddingFunction: new DefaultEmbeddingFunction(),
    });
    const { metadatas } = await collection.get({
      limit: 10000,
      include: [IncludeEnum.Metadatas],
    });

    // wing -> (room -> number of drawers)
    const wingRooms = new Map<string, Map<string, number>>();
    for (const metadata of metadatas) {
      if (metadata === null) {
        continue;
      }

      const wing = String(metadata.wing ?? "?");
      const room = String(metadata.room ?? "?");

      let tally = wingRooms.get(wing);
      if (tally === undefined) {
        tally = new Map<string, number>();
        wingRooms.set(wing, tally);
      }
      tally.set(room, (tally.get(room) ?? 0) + 1);
    }

    const heavyRule = formatRule("=");
    console.log(`\n${heavyRule}`);
    console.log(` MemPalace Status — ${metadatas.length} drawers`);
    console.log(`${heavyRule}\n`);

    const wingsByName = Array.from(wingRooms).sort(([left], [right]) =>
      left.localeCompare(right),
    );
    for (const [wing, rooms] of wingsByName) {
      console.log(` WING: ${wing}`);

      const roomsByCount = Array.from(rooms).sort(
        ([, leftCount], [, rightCount]) => rightCount - leftCount,
      );
      for (const [room, count] of roomsByCount) {
        console.log(
          ` ROOM: ${room.padEnd(20)} ${String(count).padStart(5)} drawers`,
        );
      }
      console.log();
    }

    console.log(`${heavyRule}\n`);
  } catch {
    console.log(`\n No palace found at ${palacePath}`);
    console.log(" Run: mempalace init <dir> then mempalace mine <dir>");
  }
}
@@ -0,0 +1,43 @@
1
/** Ambient typings for the sibling `entity-detector` module. */
declare module "../entity-detector" {
  /** Detection result, bucketed into three lists of untyped entries. */
  export type DetectedEntities = Record<
    "people" | "projects" | "uncertain",
    unknown[]
  >;

  /** Takes a directory and returns a list of strings (presumably file paths — confirm against the implementation). */
  export function scanForDetection(dir: string): string[];

  /** Produces a `DetectedEntities` result from a list of strings. */
  export function detectEntities(files: string[]): DetectedEntities;

  /** Asynchronously maps a detection result to another one; the meaning of `yes` is not visible here. */
  export function confirmEntities(
    detected: DetectedEntities,
    yes: boolean,
  ): Promise<DetectedEntities>;
}
15
+
16
/** Ambient typings for the sibling `room-detector` module. */
declare module "../room-detector" {
  /** Asynchronous operation over a directory; resolves with no value. */
  export function detectRoomsLocal(dir: string): Promise<void>;
}
19
+
20
/** Ambient typings for the sibling `convo-miner` module. */
declare module "../convo-miner" {
  /** Options accepted by `mineConvos`; only `convoDir`, `palacePath` and `extractMode` are required. */
  export type MineConvosOptions = {
    convoDir: string;
    palacePath: string;
    wing?: string;
    agent?: string;
    limit?: number;
    dryRun?: boolean;
    /** One of the two supported extraction modes. */
    extractMode: "exchange" | "general";
  };

  /** Asynchronous operation driven by `MineConvosOptions`; resolves with no value. */
  export function mineConvos(options: MineConvosOptions): Promise<void>;
}
33
+
34
/** Ambient typings for the sibling `split-mega-files` module. */
declare module "../split-mega-files" {
  /** Options accepted by `splitMegaFiles`; only `dir` is required. */
  export type SplitMegaFilesOptions = {
    dir: string;
    outputDir?: string;
    dryRun?: boolean;
    minSessions?: number;
  };

  /** Synchronous operation driven by `SplitMegaFilesOptions`; returns nothing. */
  export function splitMegaFiles(options: SplitMegaFilesOptions): void;
}