@voidwire/lore 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/indexer.ts +11 -6
- package/lib/indexers/flux.ts +7 -1
- package/lib/indexers/obsidian.ts +2 -0
- package/package.json +1 -1
package/lib/indexer.ts
CHANGED
|
@@ -129,6 +129,10 @@ export function createIndexerContext(
|
|
|
129
129
|
"INSERT INTO search (source, title, content, metadata, topic, type, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?)",
|
|
130
130
|
);
|
|
131
131
|
|
|
132
|
+
// Separate sets: entry-level avoids re-chunking identical docs,
|
|
133
|
+
// chunk-level catches duplicate chunks across different documents
|
|
134
|
+
const seenEntryHashes = new Set<string>();
|
|
135
|
+
|
|
132
136
|
return {
|
|
133
137
|
db,
|
|
134
138
|
config,
|
|
@@ -136,22 +140,23 @@ export function createIndexerContext(
|
|
|
136
140
|
insert: (entry: IndexEntry) => {
|
|
137
141
|
validateEntry(entry);
|
|
138
142
|
|
|
139
|
-
//
|
|
143
|
+
// Entry-level dedup: skip re-chunking identical documents
|
|
140
144
|
const contentHash = createHash("sha256")
|
|
141
145
|
.update(entry.content)
|
|
142
146
|
.digest("hex");
|
|
143
|
-
|
|
144
|
-
// Skip if already indexed
|
|
145
|
-
if (seenHashes.has(contentHash)) {
|
|
147
|
+
if (seenEntryHashes.has(contentHash)) {
|
|
146
148
|
return;
|
|
147
149
|
}
|
|
148
|
-
|
|
150
|
+
seenEntryHashes.add(contentHash);
|
|
149
151
|
|
|
150
152
|
// Chunk content if needed
|
|
151
153
|
const chunks = chunkContent(entry.content);
|
|
152
154
|
|
|
153
|
-
//
|
|
155
|
+
// Chunk-level dedup: skip duplicate chunks across documents
|
|
154
156
|
for (const chunk of chunks) {
|
|
157
|
+
const chunkHash = createHash("sha256").update(chunk).digest("hex");
|
|
158
|
+
if (seenHashes.has(chunkHash)) continue;
|
|
159
|
+
seenHashes.add(chunkHash);
|
|
155
160
|
insertStmt.run(
|
|
156
161
|
entry.source,
|
|
157
162
|
entry.title,
|
package/lib/indexers/flux.ts
CHANGED
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* Timestamp: captured date if present, otherwise the file's mtime as an ISO 8601 string
|
|
13
13
|
*/
|
|
14
14
|
|
|
15
|
-
import { readdirSync, readFileSync, existsSync } from "fs";
|
|
15
|
+
import { readdirSync, readFileSync, existsSync, statSync } from "fs";
|
|
16
16
|
import { join, basename } from "path";
|
|
17
17
|
import type { IndexerContext } from "../indexer";
|
|
18
18
|
|
|
@@ -82,6 +82,7 @@ function parseFluxFile(
|
|
|
82
82
|
status: string,
|
|
83
83
|
): void {
|
|
84
84
|
const raw = readFileSync(filePath, "utf-8");
|
|
85
|
+
const mtime = statSync(filePath).mtime;
|
|
85
86
|
const lines = raw.split("\n");
|
|
86
87
|
|
|
87
88
|
for (const line of lines) {
|
|
@@ -112,6 +113,11 @@ function parseFluxFile(
|
|
|
112
113
|
);
|
|
113
114
|
}
|
|
114
115
|
|
|
116
|
+
// Fall back to file mtime if no captured date
|
|
117
|
+
if (!timestamp) {
|
|
118
|
+
timestamp = mtime.toISOString();
|
|
119
|
+
}
|
|
120
|
+
|
|
115
121
|
// Extract archived date if present (strip from description)
|
|
116
122
|
rest = rest.replace(/\s*archived::\s*\S+/, "");
|
|
117
123
|
|
package/lib/indexers/obsidian.ts
CHANGED