@s-hirano-ist/s-scripts 1.5.2 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rag/chunker.d.ts +10 -0
- package/dist/rag/chunker.d.ts.map +1 -0
- package/dist/rag/chunker.js +188 -0
- package/dist/rag/chunker.js.map +1 -0
- package/dist/rag/config.d.ts +44 -0
- package/dist/rag/config.d.ts.map +1 -0
- package/dist/rag/config.js +34 -0
- package/dist/rag/config.js.map +1 -0
- package/dist/rag/embedding.d.ts +15 -0
- package/dist/rag/embedding.d.ts.map +1 -0
- package/dist/rag/embedding.js +61 -0
- package/dist/rag/embedding.js.map +1 -0
- package/dist/rag/ingest.d.ts +3 -0
- package/dist/rag/ingest.d.ts.map +1 -0
- package/dist/rag/ingest.js +148 -0
- package/dist/rag/ingest.js.map +1 -0
- package/dist/rag/qdrant-client.d.ts +40 -0
- package/dist/rag/qdrant-client.d.ts.map +1 -0
- package/dist/rag/qdrant-client.js +160 -0
- package/dist/rag/qdrant-client.js.map +1 -0
- package/dist/rag/search.d.ts +3 -0
- package/dist/rag/search.d.ts.map +1 -0
- package/dist/rag/search.js +105 -0
- package/dist/rag/search.js.map +1 -0
- package/dist/update-raw-articles.js +1 -1
- package/dist/update-raw-articles.js.map +1 -1
- package/package.json +11 -4
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { type QdrantPayload } from "./config.js";
|
|
2
|
+
/**
|
|
3
|
+
* Parse JSON article file and generate chunks
|
|
4
|
+
*/
|
|
5
|
+
export declare function parseJsonArticle(filePath: string, content: string): QdrantPayload[];
|
|
6
|
+
/**
|
|
7
|
+
* Parse Markdown file and generate chunks
|
|
8
|
+
*/
|
|
9
|
+
export declare function parseMarkdown(filePath: string, content: string): QdrantPayload[];
|
|
10
|
+
//# sourceMappingURL=chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../src/rag/chunker.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,aAAa,EAAc,MAAM,aAAa,CAAC;AAsC7D;;GAEG;AACH,wBAAgB,gBAAgB,CAC/B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GACb,aAAa,EAAE,CAsCjB;AAiID;;GAEG;AACH,wBAAgB,aAAa,CAC5B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GACb,aAAa,EAAE,CA2CjB"}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import { createHash } from "crypto";
|
|
2
|
+
import { RAG_CONFIG } from "./config.js";
|
|
3
|
+
/**
 * Fingerprint a piece of text for change detection.
 *
 * @param content - Text to fingerprint.
 * @returns The first 16 hexadecimal characters of the SHA-256 digest of `content`.
 */
function generateHash(content) {
    const hasher = createHash("sha256");
    hasher.update(content);
    const hexDigest = hasher.digest("hex");
    // Only a 16-character prefix of the digest is kept.
    return hexDigest.substring(0, 16);
}
|
|
9
|
+
/**
 * Build the identifier of a chunk inside a document.
 *
 * @param docId - Identifier of the owning document.
 * @param index - Position of the chunk within that document.
 * @returns `docId` and `index` joined by a `#`.
 */
function generateChunkId(docId, index) {
    const separator = "#";
    return String(docId).concat(separator, String(index));
}
|
|
15
|
+
/**
 * Parse a JSON article file and turn each entry of its `body` array into a chunk.
 *
 * The chunk text is the newline-joined list of the entry's non-empty `title`,
 * `ogTitle` (only when it differs from `title`), `ogDescription`, `quote` and `url`.
 * Entries that produce no text are skipped. The chunk index is the entry's position
 * in `body`, so skipped entries leave gaps and the remaining chunk ids stay stable.
 *
 * @param filePath - Path of the file; used to build the document id `file:<path>`.
 * @param content - Raw JSON text of the file.
 * @returns One payload per usable entry.
 * @throws SyntaxError when `content` is not valid JSON.
 * @throws Error when the parsed value is not an object carrying a `body` array.
 */
export function parseJsonArticle(filePath, content) {
    const json = JSON.parse(content);
    // The parsed value is untrusted: fail with a message naming the file instead of
    // an opaque "cannot read properties of undefined".
    if (json === null || typeof json !== "object" || !Array.isArray(json.body)) {
        throw new Error(`Invalid article JSON in ${filePath}: expected an object with a "body" array`);
    }
    const docId = `file:${filePath}`;
    // Same placeholder parseFrontmatter uses when a document has no heading.
    const heading = typeof json.heading === "string" ? json.heading : "unknown";
    const chunks = [];
    json.body.forEach((item, index) => {
        // A malformed entry should not discard every other entry of the file.
        if (item === null || typeof item !== "object") {
            return;
        }
        const text = [
            item.title,
            item.ogTitle !== item.title ? item.ogTitle : undefined,
            item.ogDescription,
            item.quote,
            item.url,
        ]
            .filter(Boolean)
            .join("\n");
        if (!text.trim()) {
            return;
        }
        chunks.push({
            type: "bookmark_json",
            top_heading: heading,
            doc_id: docId,
            chunk_id: generateChunkId(docId, index),
            title: item.title || item.ogTitle || "Untitled",
            url: item.url,
            heading_path: [heading],
            text,
            content_hash: generateHash(text),
        });
    });
    return chunks;
}
|
|
55
|
+
/**
 * Parse the optional `---` delimited frontmatter block at the top of a Markdown document.
 *
 * This is a deliberately small line-based reader, not a YAML parser: only the
 * top-level keys `heading`, `description` and `draft` are recognised, everything
 * else is ignored. Both LF and CRLF line endings are accepted, and the closing
 * `---` may be the last line of the file.
 *
 * @param content - Full text of the Markdown file.
 * @returns `frontmatter` (with `heading` defaulting to "unknown") and the remaining `body`.
 *          When no frontmatter block is found, `body` is the unchanged `content`.
 */
function parseFrontmatter(content) {
    const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n([\s\S]*))?$/);
    if (!match) {
        return {
            frontmatter: { heading: "unknown" },
            body: content,
        };
    }
    // `heading: "My title"` should yield My title, not the quoted form.
    const unquote = (raw) => {
        const quoted = raw.match(/^(["'])(.*)\1$/);
        return quoted ? quoted[2] : raw;
    };
    const frontmatter = { heading: "unknown" };
    for (const line of match[1].split(/\r?\n/)) {
        const colon = line.indexOf(":");
        if (colon === -1) {
            continue; // not a key/value line
        }
        // Only trailing whitespace is removed from the key so that indented
        // (nested) keys are not mistaken for top-level ones.
        const key = line.slice(0, colon).trimEnd();
        // Everything after the first colon is the value, so values may contain colons.
        const value = line.slice(colon + 1).trim();
        if (key === "heading") {
            frontmatter.heading = unquote(value);
        }
        else if (key === "description") {
            Object.assign(frontmatter, { description: unquote(value) });
        }
        else if (key === "draft") {
            Object.assign(frontmatter, { draft: value.toLowerCase() === "true" });
        }
    }
    return { frontmatter, body: match[2] || "" };
}
|
|
85
|
+
/**
 * Split a Markdown body into sections, one per `##` / `###` heading.
 *
 * Each section records its heading title, its level (2 or 3), the path of enclosing
 * headings ending with its own title, and the lines that follow the heading (each
 * re-terminated with "\n"). Text that appears before the first matching heading is
 * not part of any section, and sections whose content is blank are omitted.
 *
 * Lines are split on LF or CRLF. Splitting on "\n" alone leaves a trailing "\r" on
 * every line of a CRLF file, and since `.` never matches "\r" no heading would be
 * recognised at all.
 *
 * @param content - Markdown text without frontmatter.
 * @returns Sections in document order.
 */
function splitMarkdownByHeadings(content) {
    const sections = [];
    // Headings currently "open" above the line being read, outermost first.
    const headingStack = [];
    let currentSection = null;
    const flushCurrent = () => {
        if (currentSection && currentSection.content.trim()) {
            sections.push(currentSection);
        }
    };
    for (const line of content.split(/\r?\n/)) {
        const headingMatch = line.match(/^(#{2,3})\s+(.+)$/);
        if (!headingMatch) {
            if (currentSection) {
                currentSection.content += line + "\n";
            }
            continue;
        }
        flushCurrent();
        const level = headingMatch[1].length;
        const title = headingMatch[2];
        // A new heading closes every open heading of the same or a deeper level.
        while (headingStack.length > 0 &&
            headingStack[headingStack.length - 1].level >= level) {
            headingStack.pop();
        }
        headingStack.push({ level, title });
        currentSection = {
            headingPath: headingStack.map((h) => h.title),
            title,
            content: "",
            level,
        };
    }
    flushCurrent();
    return sections;
}
|
|
128
|
+
/**
 * Split text into chunks of at most `maxLength` characters, preferring paragraph
 * boundaries (runs of two or more newlines).
 *
 * Text that already fits is returned unchanged as a single chunk (not trimmed).
 * Otherwise paragraphs are packed greedily, joined by a blank line, and each
 * emitted chunk is trimmed. The joiner is counted against the limit, and a
 * paragraph that alone exceeds the limit is cut into `maxLength`-sized pieces,
 * so no returned chunk is longer than `maxLength`.
 *
 * @param text - Text to split.
 * @param maxLength - Maximum chunk length in UTF-16 code units.
 * @returns Non-empty chunks in original order.
 */
function splitByParagraphs(text, maxLength) {
    if (text.length <= maxLength) {
        return [text];
    }
    const separator = "\n\n";
    // Cut a single oversized paragraph into pieces that fit the limit.
    const hardSplit = (paragraph) => {
        // A limit below 1 (or NaN) cannot be honoured; leave the paragraph whole.
        if (!(maxLength >= 1) || paragraph.length <= maxLength) {
            return [paragraph];
        }
        const size = Math.floor(maxLength);
        const pieces = [];
        let start = 0;
        while (start < paragraph.length) {
            let end = Math.min(start + size, paragraph.length);
            const lastUnit = paragraph.charCodeAt(end - 1);
            // Do not cut between the two halves of a surrogate pair.
            if (end < paragraph.length &&
                end - start > 1 &&
                lastUnit >= 0xd800 &&
                lastUnit <= 0xdbff) {
                end -= 1;
            }
            pieces.push(paragraph.slice(start, end));
            start = end;
        }
        return pieces;
    };
    const chunks = [];
    const emit = (chunk) => {
        const trimmed = chunk.trim();
        if (trimmed) {
            chunks.push(trimmed);
        }
    };
    let currentChunk = "";
    for (const paragraph of text.split(/\n\n+/)) {
        for (const piece of hardSplit(paragraph)) {
            const joinedLength = currentChunk.length +
                (currentChunk ? separator.length : 0) +
                piece.length;
            if (currentChunk && joinedLength > maxLength) {
                emit(currentChunk);
                currentChunk = piece;
            }
            else {
                currentChunk += (currentChunk ? separator : "") + piece;
            }
        }
    }
    emit(currentChunk);
    return chunks;
}
|
|
152
|
+
/**
 * Turn a Markdown file into chunks, one or more per heading section.
 *
 * Files whose frontmatter marks them as drafts produce no chunks. Every section
 * returned by `splitMarkdownByHeadings` is cut to the configured maximum chunk
 * length; blank pieces are dropped and the rest are numbered consecutively
 * across the whole document.
 *
 * @param filePath - Path of the file; used to build the document id `file:<path>`.
 * @param content - Full text of the Markdown file, frontmatter included.
 * @returns Chunk payloads in document order.
 */
export function parseMarkdown(filePath, content) {
    const parsed = parseFrontmatter(content);
    if (parsed.frontmatter.draft) {
        return [];
    }
    const { heading } = parsed.frontmatter;
    const docId = `file:${filePath}`;
    const payloads = [];
    for (const section of splitMarkdownByHeadings(parsed.body)) {
        const pieces = splitByParagraphs(section.content, RAG_CONFIG.chunking.maxChunkLength);
        for (const piece of pieces) {
            if (piece.trim() === "") {
                continue;
            }
            // The number of payloads collected so far is the next chunk index.
            const chunkId = generateChunkId(docId, payloads.length);
            payloads.push({
                type: "markdown_note",
                top_heading: heading,
                doc_id: docId,
                chunk_id: chunkId,
                title: section.title,
                heading_path: [heading, ...section.headingPath],
                text: piece,
                content_hash: generateHash(piece),
            });
        }
    }
    return payloads;
}
|
|
188
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../../src/rag/chunker.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AACpC,OAAO,EAAsB,UAAU,EAAE,MAAM,aAAa,CAAC;AAwB7D;;GAEG;AACH,SAAS,YAAY,CAAC,OAAe;IACpC,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AACxE,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,KAAa,EAAE,KAAa;IACpD,OAAO,GAAG,KAAK,IAAI,KAAK,EAAE,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC/B,QAAgB,EAChB,OAAe;IAEf,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAgB,CAAC;IAChD,MAAM,KAAK,GAAG,QAAQ,QAAQ,EAAE,CAAC;IACjC,MAAM,MAAM,GAAoB,EAAE,CAAC;IAEnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAE1B,mCAAmC;QACnC,MAAM,SAAS,GAAa,EAAE,CAAC;QAC/B,IAAI,IAAI,CAAC,KAAK;YAAE,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3C,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,CAAC,KAAK;YAC9C,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC9B,IAAI,IAAI,CAAC,aAAa;YAAE,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC3D,IAAI,IAAI,CAAC,KAAK;YAAE,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3C,IAAI,IAAI,CAAC,GAAG;YAAE,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEvC,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAElC,mBAAmB;QACnB,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YAAE,SAAS;QAE3B,MAAM,OAAO,GAAG,eAAe,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAE1C,MAAM,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,eAAe;YACrB,WAAW,EAAE,IAAI,CAAC,OAAO;YACzB,MAAM,EAAE,KAAK;YACb,QAAQ,EAAE,OAAO;YACjB,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,OAAO,IAAI,UAAU;YAC/C,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,YAAY,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC;YAC5B,IAAI;YACJ,YAAY,EAAE,YAAY,CAAC,IAAI,CAAC;SAChC,CAAC,CAAC;IACJ,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,OAAe;IAIxC,MAAM,gBAAgB,GAAG,OAAO,CAAC,KAAK,CAAC,mCAAmC,CAAC,CAAC;IAE5E,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACvB,OAAO;YACN,WAAW,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE;YACnC,IAAI,EAAE,OAAO;SACb,CAAC;I
ACH,CAAC;IAED,MAAM,cAAc,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAC3C,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAEjC,oCAAoC;IACpC,MAAM,WAAW,GAAwB,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;IAEhE,KAAK,MAAM,IAAI,IAAI,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QAC/C,MAAM,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC7C,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAE1C,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;YACvB,WAAW,CAAC,OAAO,GAAG,KAAK,CAAC;QAC7B,CAAC;aAAM,IAAI,GAAG,KAAK,aAAa,EAAE,CAAC;YAClC,WAAW,CAAC,WAAW,GAAG,KAAK,CAAC;QACjC,CAAC;aAAM,IAAI,GAAG,KAAK,OAAO,EAAE,CAAC;YAC5B,WAAW,CAAC,KAAK,GAAG,KAAK,KAAK,MAAM,CAAC;QACtC,CAAC;IACF,CAAC;IAED,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;AAC9B,CAAC;AASD;;GAEG;AACH,SAAS,uBAAuB,CAAC,OAAe;IAC/C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAsB,EAAE,CAAC;IAEvC,IAAI,kBAAkB,GAAa,EAAE,CAAC;IACtC,IAAI,cAAc,GAA2B,IAAI,CAAC;IAClD,MAAM,YAAY,GAAuC,EAAE,CAAC;IAE5D,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAErD,IAAI,YAAY,EAAE,CAAC;YAClB,wBAAwB;YACxB,IAAI,cAAc,IAAI,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;gBACrD,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAC/B,CAAC;YAED,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YACrC,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;YAE9B,uBAAuB;YACvB,OACC,YAAY,CAAC,MAAM,GAAG,CAAC;gBACvB,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,KAAK,IAAI,KAAK,EACnD,CAAC;gBACF,YAAY,CAAC,GAAG,EAAE,CAAC;YACpB,CAAC;YACD,YAAY,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YAEpC,sBAAsB;YACtB,kBAAkB,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YAEtD,cAAc,GAAG;gBAChB,WAAW,EAAE,CAAC,GAAG,kBAAkB,CAAC;gBACpC,KAAK;gBACL,OAAO,EAAE,EAAE;gBACX,KAAK;aACL,CAAC;QACH,CAAC;aAAM,IAAI,cAAc,EAAE,CAAC;YAC3B,cAAc,CAAC,OAAO,IAAI,IAAI,GAAG,IAAI,CAAC;QACvC,CAAC;IACF,CAAC;IAED,oBAAoB;IACpB,IAAI,cAAc,IAAI,cAAc,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QACrD,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IAC/B,CAAC;IAED,OAAO,QAAQ,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAS,i
BAAiB,CAAC,IAAY,EAAE,SAAiB;IACzD,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,CAAC,IAAI,CAAC,CAAC;IACf,CAAC;IAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACvC,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,YAAY,GAAG,EAAE,CAAC;IAEtB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC/B,IAAI,YAAY,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,SAAS,IAAI,YAAY,EAAE,CAAC;YACnE,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;YACjC,YAAY,GAAG,IAAI,CAAC;QACrB,CAAC;aAAM,CAAC;YACP,YAAY,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC;QACrD,CAAC;IACF,CAAC;IAED,IAAI,YAAY,CAAC,IAAI,EAAE,EAAE,CAAC;QACzB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,aAAa,CAC5B,QAAgB,EAChB,OAAe;IAEf,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;IACxD,MAAM,KAAK,GAAG,QAAQ,QAAQ,EAAE,CAAC;IACjC,MAAM,MAAM,GAAoB,EAAE,CAAC;IAEnC,mBAAmB;IACnB,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;QACvB,OAAO,EAAE,CAAC;IACX,CAAC;IAED,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAE/C,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAChC,sBAAsB;QACtB,MAAM,UAAU,GAAG,iBAAiB,CACnC,OAAO,CAAC,OAAO,EACf,UAAU,CAAC,QAAQ,CAAC,cAAc,CAClC,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;gBAAE,SAAS;YAE3B,MAAM,eAAe,GAAG,CAAC,WAAW,CAAC,OAAO,EAAE,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YACtE,MAAM,OAAO,GAAG,eAAe,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;YAEnD,MAAM,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,eAAe;gBACrB,WAAW,EAAE,WAAW,CAAC,OAAO;gBAChC,MAAM,EAAE,KAAK;gBACb,QAAQ,EAAE,OAAO;gBACjB,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,YAAY,EAAE,eAAe;gBAC7B,IAAI;gBACJ,YAAY,EAAE,YAAY,CAAC,IAAI,CAAC;aAChC,CAAC,CAAC;YAEH,UAAU,EAAE,CAAC;QACd,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
export declare const RAG_CONFIG: {
|
|
2
|
+
readonly qdrant: {
|
|
3
|
+
readonly collectionName: "knowledge_v1";
|
|
4
|
+
readonly vectorSize: 384;
|
|
5
|
+
readonly distance: "Cosine";
|
|
6
|
+
};
|
|
7
|
+
readonly embedding: {
|
|
8
|
+
readonly model: "intfloat/multilingual-e5-small";
|
|
9
|
+
readonly prefix: {
|
|
10
|
+
readonly query: "query: ";
|
|
11
|
+
readonly passage: "passage: ";
|
|
12
|
+
};
|
|
13
|
+
};
|
|
14
|
+
readonly paths: {
|
|
15
|
+
readonly markdown: readonly ["markdown/note/**/*.md", "markdown/book/**/*.md", "raw/article/**/*.md"];
|
|
16
|
+
readonly json: "json/article/**/*.json";
|
|
17
|
+
};
|
|
18
|
+
readonly chunking: {
|
|
19
|
+
readonly maxChunkLength: 2000;
|
|
20
|
+
readonly headingLevels: readonly [2, 3];
|
|
21
|
+
};
|
|
22
|
+
readonly hashCachePath: ".rag-hash-cache.json";
|
|
23
|
+
};
|
|
24
|
+
export type QdrantPayload = {
|
|
25
|
+
type: "markdown_note" | "bookmark_json";
|
|
26
|
+
top_heading: string;
|
|
27
|
+
doc_id: string;
|
|
28
|
+
chunk_id: string;
|
|
29
|
+
title: string;
|
|
30
|
+
url?: string;
|
|
31
|
+
heading_path: string[];
|
|
32
|
+
text: string;
|
|
33
|
+
content_hash: string;
|
|
34
|
+
};
|
|
35
|
+
export type SearchResult = {
|
|
36
|
+
score: number;
|
|
37
|
+
text: string;
|
|
38
|
+
title: string;
|
|
39
|
+
url?: string;
|
|
40
|
+
heading_path: string[];
|
|
41
|
+
type: "markdown_note" | "bookmark_json";
|
|
42
|
+
doc_id: string;
|
|
43
|
+
};
|
|
44
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../src/rag/config.ts"],"names":[],"mappings":"AACA,eAAO,MAAM,UAAU;;;;;;;;;;;;;;;;;;;;;;CAmCb,CAAC;AAGX,MAAM,MAAM,aAAa,GAAG;IAC3B,IAAI,EAAE,eAAe,GAAG,eAAe,CAAC;IACxC,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,CAAC;CACrB,CAAC;AAGF,MAAM,MAAM,YAAY,GAAG;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,eAAe,GAAG,eAAe,CAAC;IACxC,MAAM,EAAE,MAAM,CAAC;CACf,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Central settings for the RAG scripts: vector store, embedding model,
 * input file globs, chunking limits and the hash cache location.
 */
export const RAG_CONFIG = {
    /** Qdrant collection parameters. */
    qdrant: {
        collectionName: "knowledge_v1",
        // Vector length for the multilingual-e5-small model configured below.
        vectorSize: 384,
        distance: "Cosine",
    },
    /** Embedding model and the prefixes prepended to its inputs. */
    embedding: {
        model: "intfloat/multilingual-e5-small",
        prefix: {
            query: "query: ",
            passage: "passage: ",
        },
    },
    /** Glob patterns of the files to ingest. */
    paths: {
        markdown: [
            "markdown/note/**/*.md",
            "markdown/book/**/*.md",
            "raw/article/**/*.md",
        ],
        json: "json/article/**/*.json",
    },
    /** Chunking limits. */
    chunking: {
        maxChunkLength: 2000,
        // Heading levels 2 and 3, i.e. `##` and `###`.
        headingLevels: [2, 3],
    },
    /** Cache file for hash comparison. */
    hashCachePath: ".rag-hash-cache.json",
};
|
|
34
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/rag/config.ts"],"names":[],"mappings":"AAAA,oBAAoB;AACpB,MAAM,CAAC,MAAM,UAAU,GAAG;IACzB,kBAAkB;IAClB,MAAM,EAAE;QACP,cAAc,EAAE,cAAc;QAC9B,UAAU,EAAE,GAAG,EAAE,wBAAwB;QACzC,QAAQ,EAAE,QAAiB;KAC3B;IAED,qBAAqB;IACrB,SAAS,EAAE;QACV,KAAK,EAAE,gCAAgC;QACvC,MAAM,EAAE;YACP,KAAK,EAAE,SAAS;YAChB,OAAO,EAAE,WAAW;SACpB;KACD;IAED,aAAa;IACb,KAAK,EAAE;QACN,QAAQ,EAAE;YACT,uBAAuB;YACvB,uBAAuB;YACvB,qBAAqB;SACrB;QACD,IAAI,EAAE,wBAAwB;KAC9B;IAED,oBAAoB;IACpB,QAAQ,EAAE;QACT,cAAc,EAAE,IAAI;QACpB,aAAa,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,aAAa;KACpC;IAED,iCAAiC;IACjC,aAAa,EAAE,sBAAsB;CAC5B,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generate embedding for a single text
|
|
3
|
+
* @param text - Input text to embed
|
|
4
|
+
* @param isQuery - Whether this is a query (vs passage)
|
|
5
|
+
* @returns Embedding vector
|
|
6
|
+
*/
|
|
7
|
+
export declare function embed(text: string, isQuery?: boolean): Promise<number[]>;
|
|
8
|
+
/**
|
|
9
|
+
* Generate embeddings for multiple texts in batch
|
|
10
|
+
* @param texts - Array of input texts
|
|
11
|
+
* @param isQuery - Whether these are queries (vs passages)
|
|
12
|
+
* @returns Array of embedding vectors
|
|
13
|
+
*/
|
|
14
|
+
export declare function embedBatch(texts: string[], isQuery?: boolean): Promise<number[][]>;
|
|
15
|
+
//# sourceMappingURL=embedding.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedding.d.ts","sourceRoot":"","sources":["../../src/rag/embedding.ts"],"names":[],"mappings":"AAwBA;;;;;GAKG;AACH,wBAAsB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,UAAQ,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAiB5E;AAED;;;;;GAKG;AACH,wBAAsB,UAAU,CAC/B,KAAK,EAAE,MAAM,EAAE,EACf,OAAO,UAAQ,GACb,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CA2BrB"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { pipeline, } from "@huggingface/transformers";
|
|
2
|
+
import { RAG_CONFIG } from "./config.js";
|
|
3
|
+
// Holds the in-flight or completed model load (a promise), or null when no load
// has been started or the last one failed.
let embeddingPipeline = null;
/**
 * Return the shared feature-extraction pipeline, loading the model on first use.
 *
 * The promise of the load is cached rather than its result, so callers that
 * arrive while the model is still loading share that single load instead of each
 * starting their own. A failed load is forgotten, so a later call tries again.
 *
 * @returns Promise resolving to the pipeline for `RAG_CONFIG.embedding.model`.
 */
async function getEmbeddingPipeline() {
    if (!embeddingPipeline) {
        console.log(`Loading embedding model: ${RAG_CONFIG.embedding.model}...`);
        const loading = (async () => {
            const loaded = await pipeline("feature-extraction", RAG_CONFIG.embedding.model, { dtype: "fp32" });
            console.log("Embedding model loaded successfully.");
            return loaded;
        })();
        embeddingPipeline = loading;
        // Clear the cache on failure; callers still receive the rejection through
        // `loading` itself. The identity check avoids clobbering a newer load.
        loading.catch(() => {
            if (embeddingPipeline === loading) {
                embeddingPipeline = null;
            }
        });
    }
    return embeddingPipeline;
}
|
|
15
|
+
/**
 * Embed one text.
 *
 * The text is prefixed with the configured query or passage prefix before it is
 * handed to the pipeline, which is asked for mean pooling and normalization.
 *
 * @param text - Input text to embed
 * @param isQuery - true to use the query prefix, false (default) for the passage prefix
 * @returns The embedding as a plain array of numbers
 */
export async function embed(text, isQuery = false) {
    const extractor = await getEmbeddingPipeline();
    const { query, passage } = RAG_CONFIG.embedding.prefix;
    const input = (isQuery ? query : passage) + text;
    const options = { pooling: "mean", normalize: true };
    const tensor = await extractor(input, options);
    // Copy the output data into an ordinary array.
    return Array.from(tensor.data);
}
|
|
35
|
+
/**
 * Embed several texts with a single pipeline call.
 *
 * Every text gets the configured query or passage prefix. The pipeline output is
 * read as one flat array holding the vectors back to back, and is cut into one
 * vector of `RAG_CONFIG.qdrant.vectorSize` values per input text.
 *
 * @param texts - Array of input texts
 * @param isQuery - true to use the query prefix, false (default) for the passage prefix
 * @returns One embedding per input text, in input order; `[]` for empty input
 * @throws Error when the output size does not equal `texts.length * vectorSize`,
 *         which means the configured vector size does not match the model output.
 */
export async function embedBatch(texts, isQuery = false) {
    // Nothing to embed: skip loading the model and calling the pipeline.
    if (texts.length === 0) {
        return [];
    }
    const pipe = await getEmbeddingPipeline();
    const prefix = isQuery
        ? RAG_CONFIG.embedding.prefix.query
        : RAG_CONFIG.embedding.prefix.passage;
    const prefixedTexts = texts.map((t) => prefix + t);
    const outputs = await pipe(prefixedTexts, {
        pooling: "mean",
        normalize: true,
    });
    const flat = outputs.data;
    const dim = RAG_CONFIG.qdrant.vectorSize;
    // Slicing with a wrong dimension would silently yield wrong vectors, so
    // verify the layout before reshaping.
    if (flat.length !== texts.length * dim) {
        throw new Error(`Unexpected embedding output size: got ${flat.length} values for ${texts.length} texts, ` +
            `expected ${dim} per text. Check RAG_CONFIG.qdrant.vectorSize against the embedding model.`);
    }
    return texts.map((_, i) => Array.from(flat.slice(i * dim, (i + 1) * dim)));
}
|
|
61
|
+
//# sourceMappingURL=embedding.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedding.js","sourceRoot":"","sources":["../../src/rag/embedding.ts"],"names":[],"mappings":"AAAA,OAAO,EAEN,QAAQ,GACR,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC,IAAI,iBAAiB,GAAqC,IAAI,CAAC;AAE/D;;GAEG;AACH,KAAK,UAAU,oBAAoB;IAClC,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACxB,OAAO,CAAC,GAAG,CAAC,4BAA4B,UAAU,CAAC,SAAS,CAAC,KAAK,KAAK,CAAC,CAAC;QACzE,iBAAiB,GAAG,CAAC,MAAM,QAAQ,CAClC,oBAAoB,EACpB,UAAU,CAAC,SAAS,CAAC,KAAK,EAC1B,EAAE,KAAK,EAAE,MAAM,EAAE,CACjB,CAAyC,CAAC;QAC3C,OAAO,CAAC,GAAG,CAAC,sCAAsC,CAAC,CAAC;IACrD,CAAC;IACD,OAAO,iBAAiB,CAAC;AAC1B,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,IAAY,EAAE,OAAO,GAAG,KAAK;IACxD,MAAM,IAAI,GAAG,MAAM,oBAAoB,EAAE,CAAC;IAE1C,6BAA6B;IAC7B,MAAM,MAAM,GAAG,OAAO;QACrB,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK;QACnC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,MAAM,CAAC,OAAO,CAAC;IAEvC,MAAM,YAAY,GAAG,MAAM,GAAG,IAAI,CAAC;IAEnC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,EAAE;QACvC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,IAAI;KACf,CAAC,CAAC;IAEH,mBAAmB;IACnB,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAoB,CAAC,CAAC;AAChD,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,KAAe,EACf,OAAO,GAAG,KAAK;IAEf,MAAM,IAAI,GAAG,MAAM,oBAAoB,EAAE,CAAC;IAE1C,MAAM,MAAM,GAAG,OAAO;QACrB,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK;QACnC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,MAAM,CAAC,OAAO,CAAC;IAEvC,MAAM,aAAa,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAEnD,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,aAAa,EAAE;QACzC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,IAAI;KACf,CAAC,CAAC;IAEH,uDAAuD;IACvD,MAAM,UAAU,GAAe,EAAE,CAAC;IAClC,MAAM,GAAG,GAAG,UAAU,CAAC,MAAM,CAAC,UAAU,CAAC;IAEzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,CAAC,GAAG,GAAG,CAAC;QACtB,MAAM,GAAG,GAAG,KAAK,GAAG,GAAG,CAAC;QACxB,UAAU,CAAC,IAAI,CACd,KAAK,CAAC,IAAI,CAAE,OAAO,CAAC,IAAqB,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAC5D,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../../src/rag/ingest.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { readFileSync } from "fs";
|
|
3
|
+
import { glob } from "glob";
|
|
4
|
+
import { parseJsonArticle, parseMarkdown } from "./chunker.js";
|
|
5
|
+
import { RAG_CONFIG } from "./config.js";
|
|
6
|
+
import { embedBatch } from "./embedding.js";
|
|
7
|
+
import { ensureCollection, getCollectionStats, getExistingHashes, upsertPoints, } from "./qdrant-client.js";
|
|
8
|
+
const BATCH_SIZE = 20;
|
|
9
|
+
const MAX_RETRIES = 3;
|
|
10
|
+
const RETRY_DELAY_MS = 2000;
|
|
11
|
+
/**
 * Pause for the given duration.
 *
 * @param ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns Promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms) {
    await new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
14
|
+
/**
 * Run an async operation, retrying after a fixed delay when it fails.
 *
 * @param fn - Operation to run; called again after each failure.
 * @param retries - Total number of attempts (default `MAX_RETRIES`). Values below 1
 *                  or non-numeric values are treated as 1, so `fn` always runs at
 *                  least once.
 * @returns Whatever `fn` resolves to.
 * @throws The error of the last attempt when every attempt fails.
 */
async function withRetry(fn, retries = MAX_RETRIES) {
    const attempts = Math.max(1, Math.floor(retries) || 1);
    for (let attempt = 1;; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            if (attempt >= attempts) {
                throw error;
            }
            // Include the cause so failed attempts can be diagnosed from the log.
            const reason = error instanceof Error ? error.message : String(error);
            console.log(` Retry ${attempt}/${attempts} after error: ${reason}`);
            await sleep(RETRY_DELAY_MS);
        }
    }
}
|
|
28
|
+
/**
 * Collect every input file matched by the configured glob patterns.
 *
 * JSON matches come first, followed by the matches of each Markdown pattern in
 * configuration order. The Markdown setting may be a single pattern or an array.
 *
 * @returns Entries of the form `{ path, type }` with `type` "json" or "markdown".
 */
async function listFiles() {
    const jsonPaths = await glob(RAG_CONFIG.paths.json);
    const collected = jsonPaths.map((path) => ({ path, type: "json" }));
    let markdownPatterns = RAG_CONFIG.paths.markdown;
    if (!Array.isArray(markdownPatterns)) {
        markdownPatterns = [markdownPatterns];
    }
    for (const pattern of markdownPatterns) {
        const markdownPaths = await glob(pattern);
        markdownPaths.forEach((path) => {
            collected.push({ path, type: "markdown" });
        });
    }
    return collected;
}
|
|
50
|
+
/**
 * Read one file from disk as UTF-8 and convert it into chunks.
 *
 * @param file - Entry with the file `path` and its `type`; "json" selects the
 *               JSON article parser, anything else the Markdown parser.
 * @returns The chunks produced by the selected parser.
 */
function parseFile(file) {
    const { path, type } = file;
    const raw = readFileSync(path, "utf-8");
    const parser = type === "json" ? parseJsonArticle : parseMarkdown;
    return parser(path, raw);
}
|
|
60
|
+
/**
 * Ingest all configured documents into the Qdrant collection.
 *
 * Steps: make sure the collection exists, list and parse the input files (a file
 * that fails to parse is logged and skipped), keep only chunks whose stored hash
 * differs from their current content hash, then embed and upsert those chunks in
 * batches of `BATCH_SIZE`, retrying each upsert on failure.
 */
async function ingest() {
    console.log("Starting ingest...\n");
    await ensureCollection();
    const statsBefore = await getCollectionStats();
    console.log(`Initial points count: ${statsBefore.pointsCount}\n`);
    // Discover the input files and report how many of each kind were found.
    const files = await listFiles();
    const countOfType = (type) => files.filter((f) => f.type === type).length;
    console.log(`Found ${files.length} files to process`);
    console.log(` - JSON: ${countOfType("json")}`);
    console.log(` - Markdown: ${countOfType("markdown")}\n`);
    // Parse file by file so that one broken file does not abort the run.
    console.log("Parsing files...");
    const allChunks = [];
    for (const file of files) {
        let parsed;
        try {
            parsed = parseFile(file);
        }
        catch (error) {
            console.error(`Error parsing ${file.path}:`, error);
            continue;
        }
        for (const chunk of parsed) {
            allChunks.push(chunk);
        }
    }
    console.log(`Total chunks: ${allChunks.length}\n`);
    // Compare against the stored hashes; chunks with no stored hash count as changed.
    console.log("Checking for changes...");
    const existingHashes = await getExistingHashes(allChunks.map((c) => c.chunk_id));
    const changedChunks = allChunks.filter((chunk) => existingHashes.get(chunk.chunk_id) !== chunk.content_hash);
    const unchangedCount = allChunks.length - changedChunks.length;
    console.log(`Changed chunks: ${changedChunks.length}`);
    console.log(`Skipped (unchanged): ${unchangedCount}\n`);
    if (changedChunks.length === 0) {
        console.log("No changes detected. Done!");
        return;
    }
    console.log("Generating embeddings and upserting...");
    let processed = 0;
    for (let offset = 0; offset < changedChunks.length; offset += BATCH_SIZE) {
        const batch = changedChunks.slice(offset, offset + BATCH_SIZE);
        const vectors = await embedBatch(batch.map((c) => c.text), false);
        const points = batch.map((chunk, position) => {
            return {
                id: chunk.chunk_id,
                vector: vectors[position],
                payload: chunk,
            };
        });
        await withRetry(() => upsertPoints(points));
        processed += batch.length;
        console.log(` Progress: ${processed}/${changedChunks.length}`);
        // Small delay between batches to avoid overwhelming Qdrant
        await sleep(100);
    }
    const statsAfter = await getCollectionStats();
    console.log(`\nFinal points count: ${statsAfter.pointsCount}`);
    console.log("Ingest completed successfully!");
}
|
|
129
|
+
/**
 * Script entry point: require `QDRANT_URL`, then run the ingest.
 *
 * A failure inside the ingest is logged and ends the process with exit code 1.
 *
 * @throws Error when the `QDRANT_URL` environment variable is missing or empty.
 */
async function main() {
    const { QDRANT_URL } = process.env;
    if (!QDRANT_URL) {
        throw new Error("QDRANT_URL environment variable is required.");
    }
    await ingest().catch((error) => {
        console.error("❌ エラーが発生しました:", error);
        process.exit(1);
    });
}
// Anything that escapes main (such as the missing-variable error) is printed
// and also ends the process with exit code 1.
const exitWithFailure = (error) => {
    console.error(error);
    process.exit(1);
};
main().catch(exitWithFailure);
|
|
148
|
+
//# sourceMappingURL=ingest.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ingest.js","sourceRoot":"","sources":["../../src/rag/ingest.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC/D,OAAO,EAAsB,UAAU,EAAE,MAAM,aAAa,CAAC;AAC7D,OAAO,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AAC5C,OAAO,EACN,gBAAgB,EAChB,kBAAkB,EAClB,iBAAiB,EACjB,YAAY,GACZ,MAAM,oBAAoB,CAAC;AAE5B,MAAM,UAAU,GAAG,EAAE,CAAC;AACtB,MAAM,WAAW,GAAG,CAAC,CAAC;AACtB,MAAM,cAAc,GAAG,IAAI,CAAC;AAE5B,KAAK,UAAU,KAAK,CAAC,EAAU;IAC9B,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;AAC1D,CAAC;AAED,KAAK,UAAU,SAAS,CACvB,EAAoB,EACpB,OAAO,GAAG,WAAW;IAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,IAAI,CAAC;YACJ,OAAO,MAAM,EAAE,EAAE,CAAC;QACnB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,IAAI,CAAC,KAAK,OAAO,GAAG,CAAC;gBAAE,MAAM,KAAK,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,OAAO,iBAAiB,CAAC,CAAC;YAC1D,MAAM,KAAK,CAAC,cAAc,CAAC,CAAC;QAC7B,CAAC;IACF,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,CAAC;AAChC,CAAC;AAOD;;GAEG;AACH,KAAK,UAAU,SAAS;IACvB,MAAM,KAAK,GAAe,EAAE,CAAC;IAE7B,aAAa;IACb,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACpD,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC9B,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;IACpC,CAAC;IAED,8CAA8C;IAC9C,MAAM,UAAU,GAAG,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC;QAC1D,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,QAAQ;QAC3B,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IAE/B,KAAK,MAAM,OAAO,IAAI,UAAU,EAAE,CAAC;QAClC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,CAAC;QACpC,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;YAC5B,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC;QACxC,CAAC;IACF,CAAC;IAED,OAAO,KAAK,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,IAAc;IAChC,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAEjD,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;QAC1B,OAAO,gBAAgB,CAAC,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IAC7C,C
AAC;IACD,OAAO,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,MAAM;IACpB,OAAO,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAC;IAEpC,2BAA2B;IAC3B,MAAM,gBAAgB,EAAE,CAAC;IAEzB,oBAAoB;IACpB,MAAM,YAAY,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAChD,OAAO,CAAC,GAAG,CAAC,yBAAyB,YAAY,CAAC,WAAW,IAAI,CAAC,CAAC;IAEnE,iBAAiB;IACjB,MAAM,KAAK,GAAG,MAAM,SAAS,EAAE,CAAC;IAChC,OAAO,CAAC,GAAG,CAAC,SAAS,KAAK,CAAC,MAAM,mBAAmB,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,aAAa,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IAC1E,OAAO,CAAC,GAAG,CACV,iBAAiB,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC,MAAM,IAAI,CACtE,CAAC;IAEF,8BAA8B;IAC9B,OAAO,CAAC,GAAG,CAAC,kBAAkB,CAAC,CAAC;IAChC,MAAM,SAAS,GAAoB,EAAE,CAAC;IAEtC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,CAAC;YACJ,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;YAC/B,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;QAC3B,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,OAAO,CAAC,KAAK,CAAC,iBAAiB,IAAI,CAAC,IAAI,GAAG,EAAE,KAAK,CAAC,CAAC;QACrD,CAAC;IACF,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,iBAAiB,SAAS,CAAC,MAAM,IAAI,CAAC,CAAC;IAEnD,2CAA2C;IAC3C,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC;IACvC,MAAM,QAAQ,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAEzD,gCAAgC;IAChC,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;QAChD,MAAM,YAAY,GAAG,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACxD,OAAO,YAAY,KAAK,KAAK,CAAC,YAAY,CAAC;IAC5C,CAAC,CAAC,CAAC;IAEH,OAAO,CAAC,GAAG,CAAC,mBAAmB,aAAa,CAAC,MAAM,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CACV,wBAAwB,SAAS,CAAC,MAAM,GAAG,aAAa,CAAC,MAAM,IAAI,CACnE,CAAC;IAEF,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;QAC1C,OAAO;IACR,CAAC;IAED,4CAA4C;IAC5C,OAAO,CAAC,GAAG,CAAC,wCAAwC,CAAC,CAAC;IACtD,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3D,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC,EAAE,CA
AC,GAAG,UAAU,CAAC,CAAC;QACrD,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAEvC,sBAAsB;QACtB,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAElD,iBAAiB;QACjB,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;YACzC,EAAE,EAAE,KAAK,CAAC,QAAQ;YAClB,MAAM,EAAE,UAAU,CAAC,GAAG,CAAC;YACvB,OAAO,EAAE,KAAK;SACd,CAAC,CAAC,CAAC;QAEJ,8BAA8B;QAC9B,MAAM,SAAS,CAAC,GAAG,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC;QAE5C,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC;QAC1B,OAAO,CAAC,GAAG,CAAC,eAAe,SAAS,IAAI,aAAa,CAAC,MAAM,EAAE,CAAC,CAAC;QAEhE,2DAA2D;QAC3D,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;IAClB,CAAC;IAED,kBAAkB;IAClB,MAAM,UAAU,GAAG,MAAM,kBAAkB,EAAE,CAAC;IAC9C,OAAO,CAAC,GAAG,CAAC,yBAAyB,UAAU,CAAC,WAAW,EAAE,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;AAC/C,CAAC;AAED,KAAK,UAAU,IAAI;IAClB,MAAM,GAAG,GAAG;QACX,UAAU,EAAE,OAAO,CAAC,GAAG,CAAC,UAAU;KACzB,CAAC;IAEX,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,CAAC;QACrB,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;IACjE,CAAC;IAED,IAAI,CAAC;QACJ,MAAM,MAAM,EAAE,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACtC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;AACF,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACtB,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACjB,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { QdrantClient } from "@qdrant/js-client-rest";
|
|
2
|
+
import { type QdrantPayload, type SearchResult } from "./config.js";
|
|
3
|
+
/**
 * Returns the Qdrant client used by the RAG scripts.
 *
 * The implementation builds the client from the `QDRANT_URL` and
 * `QDRANT_API_KEY` environment variables on first use and reuses it afterwards.
 *
 * @returns The cached `QdrantClient` instance.
 * @throws Error when `QDRANT_URL` is not set.
 */
export declare function getQdrantClient(): QdrantClient;
|
|
7
|
+
/**
 * Makes sure the configured Qdrant collection exists, creating it with the
 * configured vector size and distance when it is missing.
 *
 * @returns A promise that resolves once the check (and optional creation) is done.
 */
export declare function ensureCollection(): Promise<void>;
|
|
11
|
+
/**
 * Writes (inserts or overwrites) a batch of points in the configured collection.
 *
 * @param points - Points to store; each carries the string chunk id, its
 *   embedding vector and the payload to persist with it.
 * @returns A promise that resolves when the upsert request has completed.
 */
export declare function upsertPoints(points: Array<{
    id: string;
    vector: number[];
    payload: QdrantPayload;
}>): Promise<void>;
|
|
19
|
+
/**
 * Looks up the content hashes already stored for the given chunk ids.
 *
 * @param chunkIds - String chunk ids to look up.
 * @returns A map from `chunk_id` to `content_hash` for the points that were
 *   found; ids without a stored point are simply absent from the map.
 */
export declare function getExistingHashes(chunkIds: string[]): Promise<Map<string, string>>;
|
|
23
|
+
/**
 * Runs a vector similarity search against the configured collection.
 *
 * @param queryVector - Embedding of the query.
 * @param options - Optional settings: `topK` limits the number of hits
 *   (the implementation defaults to 10) and `filter` restricts hits to a
 *   payload `type` and/or `top_heading`.
 * @returns The matching documents with their scores.
 */
export declare function search(queryVector: number[], options?: {
    topK?: number;
    filter?: {
        type?: "markdown_note" | "bookmark_json";
        top_heading?: string;
    };
}): Promise<SearchResult[]>;
|
|
33
|
+
/**
 * Reports basic statistics for the configured collection.
 *
 * @returns The number of stored points and the collection status. When the
 *   collection lookup fails the implementation reports `pointsCount: 0` and
 *   `status: "not_found"`.
 */
export declare function getCollectionStats(): Promise<{
    pointsCount: number;
    status: string;
}>;
|
|
40
|
+
//# sourceMappingURL=qdrant-client.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"qdrant-client.d.ts","sourceRoot":"","sources":["../../src/rag/qdrant-client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EAAE,KAAK,aAAa,EAAc,KAAK,YAAY,EAAE,MAAM,aAAa,CAAC;AAIhF;;GAEG;AACH,wBAAgB,eAAe,IAAI,YAAY,CAgB9C;AAED;;GAEG;AACH,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,IAAI,CAAC,CAmBtD;AAED;;GAEG;AACH,wBAAsB,YAAY,CACjC,MAAM,EAAE;IAAE,EAAE,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAC;IAAC,OAAO,EAAE,aAAa,CAAA;CAAE,EAAE,GAChE,OAAO,CAAC,IAAI,CAAC,CAef;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CACtC,QAAQ,EAAE,MAAM,EAAE,GAChB,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CA4B9B;AAED;;GAEG;AACH,wBAAsB,MAAM,CAC3B,WAAW,EAAE,MAAM,EAAE,EACrB,OAAO,GAAE;IACR,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE;QACR,IAAI,CAAC,EAAE,eAAe,GAAG,eAAe,CAAC;QACzC,WAAW,CAAC,EAAE,MAAM,CAAC;KACrB,CAAC;CACG,GACJ,OAAO,CAAC,YAAY,EAAE,CAAC,CA6CzB;AAED;;GAEG;AACH,wBAAsB,kBAAkB,IAAI,OAAO,CAAC;IACnD,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;CACf,CAAC,CAgBD"}
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import { QdrantClient } from "@qdrant/js-client-rest";
|
|
2
|
+
import { RAG_CONFIG } from "./config.js";
|
|
3
|
+
let client = null;
|
|
4
|
+
/**
 * Return the module-wide Qdrant client, constructing it on the first call.
 *
 * The connection settings are read from the `QDRANT_URL` and
 * `QDRANT_API_KEY` environment variables; the API key is passed through
 * as-is and may be undefined.
 *
 * @returns The cached QdrantClient instance.
 * @throws {Error} When `QDRANT_URL` is not set.
 */
export function getQdrantClient() {
    // Reuse the instance created by an earlier call.
    if (client) {
        return client;
    }
    const { QDRANT_URL: url, QDRANT_API_KEY: apiKey } = process.env;
    if (!url) {
        throw new Error("QDRANT_URL environment variable is required");
    }
    client = new QdrantClient({ url, apiKey });
    return client;
}
|
|
21
|
+
/**
 * Ensure the collection named in `RAG_CONFIG.qdrant` exists.
 *
 * Lists the existing collections and, when the configured name is not among
 * them, creates it with the configured vector size and distance. Progress is
 * reported on stdout.
 *
 * @returns {Promise<void>}
 */
export async function ensureCollection() {
    const qdrant = getQdrantClient();
    const { collectionName, vectorSize, distance } = RAG_CONFIG.qdrant;
    const { collections } = await qdrant.getCollections();
    const alreadyThere = collections.some((entry) => entry.name === collectionName);
    if (alreadyThere) {
        console.log(`Collection ${collectionName} already exists.`);
        return;
    }
    console.log(`Creating collection: ${collectionName}`);
    const vectors = { size: vectorSize, distance };
    await qdrant.createCollection(collectionName, { vectors });
    console.log(`Collection ${collectionName} created successfully.`);
}
|
|
43
|
+
/**
 * Upsert a batch of points into the configured collection and wait for the
 * operation to be applied (`wait: true`).
 *
 * @param points Array of `{ id, vector, payload }` where `id` is the string chunk id.
 * @returns {Promise<void>}
 */
export async function upsertPoints(points) {
    const qdrant = getQdrantClient();
    const { collectionName } = RAG_CONFIG.qdrant;
    // Qdrant requires numeric or UUID IDs, so the string chunk id is hashed
    // into an unsigned integer before being used as the point id.
    const qdrantPoints = [];
    for (const { id, vector, payload } of points) {
        qdrantPoints.push({ id: hashToUint(id), vector, payload });
    }
    const request = { wait: true, points: qdrantPoints };
    await qdrant.upsert(collectionName, request);
}
|
|
60
|
+
/**
 * Get the content hashes already stored for a set of chunk IDs.
 *
 * The chunk IDs are converted to numeric point IDs with `hashToUint` and the
 * matching points are retrieved with only the `chunk_id` and `content_hash`
 * payload fields. The lookup is best-effort: if the request fails, the error
 * is logged as a warning and whatever was collected so far (normally an empty
 * map) is returned instead of throwing.
 *
 * @param chunkIds String chunk IDs to look up.
 * @returns {Promise<Map<string, string>>} Map from `chunk_id` to `content_hash`
 *   for every retrieved point that carries both fields.
 */
export async function getExistingHashes(chunkIds) {
    const qdrant = getQdrantClient();
    const { collectionName } = RAG_CONFIG.qdrant;
    const hashMap = new Map();
    if (chunkIds.length === 0) {
        return hashMap;
    }
    // Convert chunk IDs to the numeric IDs used as Qdrant point IDs.
    const numericIds = chunkIds.map((id) => hashToUint(id));
    try {
        const result = await qdrant.retrieve(collectionName, {
            ids: numericIds,
            with_payload: ["chunk_id", "content_hash"],
        });
        for (const point of result) {
            const payload = point.payload;
            // Skip points whose payload lacks either field.
            if (payload?.chunk_id && payload?.content_hash) {
                hashMap.set(payload.chunk_id, payload.content_hash);
            }
        }
    }
    catch (error) {
        // The collection might not exist or be empty; keep the lookup
        // best-effort, but surface the cause so connection or auth problems
        // are not silently mistaken for "nothing stored yet".
        console.warn(`Failed to retrieve existing content hashes from collection ${collectionName}:`, error);
    }
    return hashMap;
}
|
|
88
|
+
/**
 * Search the configured collection for points similar to a query vector.
 *
 * @param queryVector Embedding of the query.
 * @param options Optional `topK` (defaults to 10) and `filter` with optional
 *   `type` and `top_heading` values; every filter value that is set becomes a
 *   `must` match condition on the payload key of the same name.
 * @returns {Promise<Array>} One entry per hit with the score and the `text`,
 *   `title`, `url`, `heading_path`, `type` and `doc_id` payload fields.
 */
export async function search(queryVector, options = {}) {
    const qdrant = getQdrantClient();
    const { collectionName } = RAG_CONFIG.qdrant;
    const { topK = 10, filter } = options;
    // Build one match condition per filter field that carries a value.
    const must = [];
    for (const key of ["type", "top_heading"]) {
        const value = filter?.[key];
        if (value) {
            must.push({ key, match: { value } });
        }
    }
    const hits = await qdrant.search(collectionName, {
        vector: queryVector,
        limit: topK,
        with_payload: true,
        // Omit the filter entirely when no condition was requested.
        filter: must.length > 0 ? { must } : undefined,
    });
    return hits.map(({ score, payload }) => ({
        score,
        text: payload.text,
        title: payload.title,
        url: payload.url,
        heading_path: payload.heading_path,
        type: payload.type,
        doc_id: payload.doc_id,
    }));
}
|
|
128
|
+
/**
 * Get basic statistics for the configured collection.
 *
 * @returns {Promise<{pointsCount: number, status: string}>} The point count
 *   (0 when the server reports none) and the collection status. If the lookup
 *   throws for any reason, the cause is logged as a warning and
 *   `{ pointsCount: 0, status: "not_found" }` is returned.
 */
export async function getCollectionStats() {
    const qdrant = getQdrantClient();
    const { collectionName } = RAG_CONFIG.qdrant;
    try {
        const info = await qdrant.getCollection(collectionName);
        return {
            pointsCount: info.points_count ?? 0,
            status: info.status,
        };
    }
    catch (error) {
        // Every failure is reported to callers as "not_found"; log the real
        // cause so connection or auth errors can be told apart from a
        // collection that genuinely does not exist.
        console.warn(`Failed to get stats for collection ${collectionName}:`, error);
        return {
            pointsCount: 0,
            status: "not_found",
        };
    }
}
|
|
148
|
+
/**
|
|
149
|
+
* Hash string to unsigned integer (for Qdrant point ID)
|
|
150
|
+
*/
|
|
151
|
+
function hashToUint(str) {
|
|
152
|
+
let hash = 0;
|
|
153
|
+
for (let i = 0; i < str.length; i++) {
|
|
154
|
+
const char = str.charCodeAt(i);
|
|
155
|
+
hash = (hash << 5) - hash + char;
|
|
156
|
+
hash = hash & hash; // Convert to 32bit integer
|
|
157
|
+
}
|
|
158
|
+
return Math.abs(hash);
|
|
159
|
+
}
|
|
160
|
+
//# sourceMappingURL=qdrant-client.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"qdrant-client.js","sourceRoot":"","sources":["../../src/rag/qdrant-client.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EAAsB,UAAU,EAAqB,MAAM,aAAa,CAAC;AAEhF,IAAI,MAAM,GAAwB,IAAI,CAAC;AAEvC;;GAEG;AACH,MAAM,UAAU,eAAe;IAC9B,IAAI,CAAC,MAAM,EAAE,CAAC;QACb,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;QACnC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC;QAE1C,IAAI,CAAC,GAAG,EAAE,CAAC;YACV,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;QAChE,CAAC;QAED,MAAM,GAAG,IAAI,YAAY,CAAC;YACzB,GAAG;YACH,MAAM;SACN,CAAC,CAAC;IACJ,CAAC;IAED,OAAO,MAAM,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB;IACrC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,EAAE,cAAc,EAAE,UAAU,EAAE,QAAQ,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC;IAEnE,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IAClD,MAAM,MAAM,GAAG,WAAW,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,cAAc,CAAC,CAAC;IAE9E,IAAI,CAAC,MAAM,EAAE,CAAC;QACb,OAAO,CAAC,GAAG,CAAC,wBAAwB,cAAc,EAAE,CAAC,CAAC;QACtD,MAAM,MAAM,CAAC,gBAAgB,CAAC,cAAc,EAAE;YAC7C,OAAO,EAAE;gBACR,IAAI,EAAE,UAAU;gBAChB,QAAQ;aACR;SACD,CAAC,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,cAAc,cAAc,wBAAwB,CAAC,CAAC;IACnE,CAAC;SAAM,CAAC;QACP,OAAO,CAAC,GAAG,CAAC,cAAc,cAAc,kBAAkB,CAAC,CAAC;IAC7D,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CACjC,MAAkE;IAElE,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,EAAE,cAAc,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC;IAE7C,+DAA+D;IAC/D,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACvC,EAAE,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC;QACpB,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,OAAO,EAAE,CAAC,CAAC,OAAO;KAClB,CAAC,CAAC,CAAC;IAEJ,MAAM,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE;QACnC,IAAI,EAAE,IAAI;QACV,MAAM,EAAE,YAAY;KACpB,CAAC,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACtC,QAAkB;IAElB,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,EAAE,cAAc,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC;IAE7C,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkB,CAAC;IAE1C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,OAAO,CAAC;IAE1C,mCAAmC;IACnC,MAAM,UAAU,GAAG,QAAQ,CAAC,GAAG,CAAC
,CAAC,EAAE,EAAE,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC;IAExD,IAAI,CAAC;QACJ,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,cAAc,EAAE;YACpD,GAAG,EAAE,UAAU;YACf,YAAY,EAAE,CAAC,UAAU,EAAE,cAAc,CAAC;SAC1C,CAAC,CAAC;QAEH,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC5B,MAAM,OAAO,GAAG,KAAK,CAAC,OAAwB,CAAC;YAC/C,IAAI,OAAO,EAAE,QAAQ,IAAI,OAAO,EAAE,YAAY,EAAE,CAAC;gBAChD,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,YAAY,CAAC,CAAC;YACrD,CAAC;QACF,CAAC;IACF,CAAC;IAAC,MAAM,CAAC;QACR,yCAAyC;IAC1C,CAAC;IAED,OAAO,OAAO,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,MAAM,CAC3B,WAAqB,EACrB,UAMI,EAAE;IAEN,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,EAAE,cAAc,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC;IAC7C,MAAM,EAAE,IAAI,GAAG,EAAE,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC;IAEtC,0BAA0B;IAC1B,MAAM,gBAAgB,GAGjB,EAAE,CAAC;IAER,IAAI,MAAM,EAAE,IAAI,EAAE,CAAC;QAClB,gBAAgB,CAAC,IAAI,CAAC;YACrB,GAAG,EAAE,MAAM;YACX,KAAK,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,IAAI,EAAE;SAC7B,CAAC,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,EAAE,WAAW,EAAE,CAAC;QACzB,gBAAgB,CAAC,IAAI,CAAC;YACrB,GAAG,EAAE,aAAa;YAClB,KAAK,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,WAAW,EAAE;SACpC,CAAC,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE;QAClD,MAAM,EAAE,WAAW;QACnB,KAAK,EAAE,IAAI;QACX,YAAY,EAAE,IAAI;QAClB,MAAM,EACL,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,CAAC,CAAC,CAAC,SAAS;KACrE,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QACvB,MAAM,OAAO,GAAG,CAAC,CAAC,OAAwB,CAAC;QAC3C,OAAO;YACN,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,IAAI,EAAE,OAAO,CAAC,IAAI;YAClB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,YAAY,EAAE,OAAO,CAAC,YAAY;YAClC,IAAI,EAAE,OAAO,CAAC,IAAI;YAClB,MAAM,EAAE,OAAO,CAAC,MAAM;SACtB,CAAC;IACH,CAAC,CAAC,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB;IAIvC,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,EAAE,cAAc,EAAE,GAAG,UAAU,CAAC,MAAM,CAAC;IAE7C,IAAI,CAAC;QACJ,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,cAAc,CAAC,CAAC;QACxD,OAAO;YACN,WAAW,EAAE,IAAI,CAAC,YAAY,IAAI,CAAC;YACnC,MAAM,EAAE,IAAI,CAAC,MAAM;SACnB,CAAC;IACH,C
AAC;IAAC,MAAM,CAAC;QACR,OAAO;YACN,WAAW,EAAE,CAAC;YACd,MAAM,EAAE,WAAW;SACnB,CAAC;IACH,CAAC;AACF,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,GAAW;IAC9B,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,IAAI,GAAG,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC;QACjC,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,2BAA2B;IAChD,CAAC;IACD,OAAO,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;AACvB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../../src/rag/search.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { embed } from "./embedding.js";
|
|
3
|
+
import { getCollectionStats, search } from "./qdrant-client.js";
|
|
4
|
+
/**
 * Run the `rag-search` command: parse the command line, embed the query,
 * search the collection and print the hits.
 *
 * Command line:
 * - positional arguments form the query; several words are joined with a
 *   single space, so the query does not have to be quoted
 * - `--top-k <number>`  number of results, a positive integer (default 5)
 * - `--type <type>`     restrict hits to `markdown_note` or `bookmark_json`
 * - `--heading <name>`  restrict hits to one `top_heading`
 *
 * Prints usage and exits with status 1 when called without arguments, and
 * exits with status 1 on an invalid option value, a missing query, or when
 * the collection is reported as "not_found".
 *
 * @returns {Promise<void>}
 */
async function runSearch() {
    // Parse command line arguments
    const args = process.argv.slice(2);
    if (args.length === 0) {
        console.log("Usage: rag-search <query> [options]");
        console.log("");
        console.log("Options:");
        console.log(" --top-k <number> Number of results (default: 5)");
        console.log(" --type <type> Filter by type: markdown_note | bookmark_json");
        console.log(" --heading <heading> Filter by top_heading");
        console.log("");
        console.log("Examples:");
        console.log(' rag-search "ルネサンス 遠近法"');
        console.log(' rag-search "AI 脆弱性" --type bookmark_json');
        console.log(' rag-search "React" --heading javascript --top-k 10');
        process.exit(1);
    }
    // Parse options
    const validTypes = ["markdown_note", "bookmark_json"];
    const queryParts = [];
    let topK = 5;
    let filterType;
    let filterHeading;
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        const value = args[i + 1];
        if (arg === "--top-k" && value) {
            topK = Number.parseInt(value, 10);
            // Reject NaN, zero and negatives instead of forwarding them as the search limit.
            if (!Number.isInteger(topK) || topK < 1) {
                console.error(`Error: --top-k must be a positive integer (got "${value}")`);
                process.exit(1);
            }
            i++;
        }
        else if (arg === "--type" && value) {
            if (!validTypes.includes(value)) {
                console.error(`Error: --type must be one of: ${validTypes.join(" | ")} (got "${value}")`);
                process.exit(1);
            }
            filterType = value;
            i++;
        }
        else if (arg === "--heading" && value) {
            filterHeading = value;
            i++;
        }
        else if (!arg.startsWith("--")) {
            // Keep every positional word rather than only the last one.
            queryParts.push(arg);
        }
    }
    const query = queryParts.join(" ");
    if (!query) {
        console.error("Error: Query is required");
        process.exit(1);
    }
    // Check collection status
    const stats = await getCollectionStats();
    if (stats.status === "not_found") {
        console.error("Error: Collection not found. Run ingest first.");
        process.exit(1);
    }
    console.log(`Searching for: "${query}"`);
    console.log(`Collection has ${stats.pointsCount} points\n`);
    // Generate query embedding
    console.log("Generating query embedding...");
    const queryVector = await embed(query, true);
    // Search
    console.log("Searching...\n");
    const results = await search(queryVector, {
        topK,
        filter: {
            type: filterType,
            top_heading: filterHeading,
        },
    });
    // Display results
    console.log(`Found ${results.length} results:\n`);
    console.log("=".repeat(80));
    for (const [index, r] of results.entries()) {
        console.log(`\n[${index + 1}] Score: ${r.score.toFixed(4)}`);
        console.log(` Title: ${r.title}`);
        console.log(` Type: ${r.type}`);
        console.log(` Path: ${r.heading_path.join(" > ")}`);
        if (r.url) {
            console.log(` URL: ${r.url}`);
        }
        // Show at most the first 200 characters of the chunk text.
        console.log(` Text: ${r.text.slice(0, 200)}${r.text.length > 200 ? "..." : ""}`);
        console.log("-".repeat(80));
    }
}
|
|
86
|
+
/**
 * CLI entry point: verify that `QDRANT_URL` is configured, then run the
 * search. An error thrown by the search is logged and the process exits with
 * status 1.
 *
 * @throws {Error} When `QDRANT_URL` is not set.
 */
async function main() {
    if (!process.env.QDRANT_URL) {
        throw new Error("QDRANT_URL environment variable is required.");
    }
    try {
        await runSearch();
    }
    catch (err) {
        console.error("❌ エラーが発生しました:", err);
        process.exit(1);
    }
}
|
|
101
|
+
// Start the CLI; anything main() rejects with (e.g. the missing-QDRANT_URL
// error) is printed and turned into exit status 1.
main().catch((err) => {
    console.error(err);
    process.exit(1);
});
|
|
105
|
+
//# sourceMappingURL=search.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.js","sourceRoot":"","sources":["../../src/rag/search.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,KAAK,EAAE,MAAM,gBAAgB,CAAC;AACvC,OAAO,EAAE,kBAAkB,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEhE;;GAEG;AACH,KAAK,UAAU,SAAS;IACvB,+BAA+B;IAC/B,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAEnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC;QACnD,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAChB,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QACxB,OAAO,CAAC,GAAG,CAAC,uDAAuD,CAAC,CAAC;QACrE,OAAO,CAAC,GAAG,CACV,sEAAsE,CACtE,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,8CAA8C,CAAC,CAAC;QAC5D,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAChB,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACzB,OAAO,CAAC,GAAG,CAAC,0BAA0B,CAAC,CAAC;QACxC,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,sDAAsD,CAAC,CAAC;QACpE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;IAED,gBAAgB;IAChB,IAAI,KAAK,GAAG,EAAE,CAAC;IACf,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,IAAI,UAAyD,CAAC;IAC9D,IAAI,aAAiC,CAAC;IAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,SAAS,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC1C,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACxC,CAAC,EAAE,CAAC;QACL,CAAC;aAAM,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,QAAQ,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChD,UAAU,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAsC,CAAC;YAC9D,CAAC,EAAE,CAAC;QACL,CAAC;aAAM,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,WAAW,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YACnD,aAAa,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YAC5B,CAAC,EAAE,CAAC;QACL,CAAC;aAAM,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YACtC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACjB,CAAC;IACF,CAAC;IAED,IAAI,CAAC,KAAK,EAAE,CAAC;QACZ,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;IAED,0BAA0B;IAC1B,MAAM,KAAK,GAAG,MAAM,kBAAkB,EAAE,CAAC;IACzC,IAAI,KAAK,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;QAClC,OAAO,CAAC,KAAK,C
AAC,gDAAgD,CAAC,CAAC;QAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,mBAAmB,KAAK,GAAG,CAAC,CAAC;IACzC,OAAO,CAAC,GAAG,CAAC,kBAAkB,KAAK,CAAC,WAAW,WAAW,CAAC,CAAC;IAE5D,2BAA2B;IAC3B,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,MAAM,KAAK,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAE7C,SAAS;IACT,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;IAC9B,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,WAAW,EAAE;QACzC,IAAI;QACJ,MAAM,EAAE;YACP,IAAI,EAAE,UAAU;YAChB,WAAW,EAAE,aAAa;SAC1B;KACD,CAAC,CAAC;IAEH,kBAAkB;IAClB,OAAO,CAAC,GAAG,CAAC,SAAS,OAAO,CAAC,MAAM,aAAa,CAAC,CAAC;IAClD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzD,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;YACX,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;QAClC,CAAC;QACD,OAAO,CAAC,GAAG,CACV,aAAa,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CACtE,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC7B,CAAC;AACF,CAAC;AAED,KAAK,UAAU,IAAI;IAClB,MAAM,GAAG,GAAG;QACX,UAAU,EAAE,OAAO,CAAC,GAAG,CAAC,UAAU;KACzB,CAAC;IAEX,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,CAAC;QACrB,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;IACjE,CAAC;IAED,IAAI,CAAC;QACJ,MAAM,SAAS,EAAE,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,eAAe,EAAE,KAAK,CAAC,CAAC;QACtC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;AACF,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACtB,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACjB,CAAC,CAAC,CAAC"}
|
|
@@ -4,7 +4,7 @@ import { mkdir, readdir, readFile, writeFile } from "node:fs/promises";
|
|
|
4
4
|
import path from "node:path";
|
|
5
5
|
import { createPushoverService } from "@s-hirano-ist/s-notification";
|
|
6
6
|
import TurndownService from "turndown";
|
|
7
|
-
const FETCHED_URLS_FILE = "
|
|
7
|
+
const FETCHED_URLS_FILE = "fetched_urls.txt";
|
|
8
8
|
const JSON_DIR = "json/article";
|
|
9
9
|
const OUTPUT_DIR = "raw/article";
|
|
10
10
|
async function loadFetchedUrls() {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"update-raw-articles.js","sourceRoot":"","sources":["../src/update-raw-articles.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACvE,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,eAAe,MAAM,UAAU,CAAC;AAEvC,MAAM,iBAAiB,GAAG,
|
|
1
|
+
{"version":3,"file":"update-raw-articles.js","sourceRoot":"","sources":["../src/update-raw-articles.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACvE,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,eAAe,MAAM,UAAU,CAAC;AAEvC,MAAM,iBAAiB,GAAG,kBAAkB,CAAC;AAC7C,MAAM,QAAQ,GAAG,cAAc,CAAC;AAChC,MAAM,UAAU,GAAG,aAAa,CAAC;AAEjC,KAAK,UAAU,eAAe;IAC7B,IAAI,CAAC;QACJ,IAAI,UAAU,CAAC,iBAAiB,CAAC,EAAE,CAAC;YACnC,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC;YAC3D,OAAO,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QACjE,CAAC;IACF,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,6BAA6B,EAAE,KAAK,CAAC,CAAC;IACrD,CAAC;IACD,OAAO,IAAI,GAAG,EAAE,CAAC;AAClB,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,IAAiB;IAC/C,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;IAC3C,MAAM,SAAS,CAAC,iBAAiB,EAAE,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,KAAK,UAAU,oBAAoB,CAAC,GAAW;IAC9C,IAAI,CAAC;QACJ,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YACjC,OAAO,EAAE;gBACR,YAAY,EACX,2HAA2H;aAC5H;SACD,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YAClB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAEnC,MAAM,eAAe,GAAG,IAAI,eAAe,CAAC;YAC3C,YAAY,EAAE,KAAK;YACnB,cAAc,EAAE,QAAQ;YACxB,gBAAgB,EAAE,GAAG;YACrB,eAAe,EAAE,IAAI;YACrB,WAAW,EAAE,GAAG;SAChB,CAAC,CAAC;QAEH,eAAe,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;QAEtE,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE;YACxC,MAAM,EAAE,CAAC,GAAG,CAAC;YACb,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC9B,MAAM,OAAO,GAAG,IAAyB,CAAC;gBAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;gBAC1C,IAAI,CAAC,IAAI;oBAAE,OAAO,OAAO,CAAC;gBAE1B,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;oBACzD,OAAO,IAAI,OAAO,KAAK,IAAI,GAAG
,CAAC;gBAChC,CAAC;gBACD,IAAI,CAAC;oBACJ,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC;oBAC5C,OAAO,IAAI,OAAO,KAAK,WAAW,GAAG,CAAC;gBACvC,CAAC;gBAAC,MAAM,CAAC;oBACR,OAAO,OAAO,CAAC;gBAChB,CAAC;YACF,CAAC;SACD,CAAC,CAAC;QAEH,eAAe,CAAC,OAAO,CAAC,gBAAgB,EAAE;YACzC,MAAM,EAAE,CAAC,KAAK,CAAC;YACf,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;gBAC/B,MAAM,OAAO,GAAG,IAAwB,CAAC;gBACzC,MAAM,GAAG,GAAG,OAAO,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;gBACxC,MAAM,GAAG,GAAG,OAAO,CAAC,YAAY,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAC9C,IAAI,CAAC,GAAG;oBAAE,OAAO,EAAE,CAAC;gBAEpB,IAAI,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;oBACvD,OAAO,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;gBAC5B,CAAC;gBACD,IAAI,CAAC;oBACJ,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC;oBAC3C,OAAO,KAAK,GAAG,KAAK,WAAW,GAAG,CAAC;gBACpC,CAAC;gBAAC,MAAM,CAAC;oBACR,OAAO,EAAE,CAAC;gBACX,CAAC;YACF,CAAC;SACD,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAChD,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;IACxB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,mBAAmB,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;QAChD,OAAO,EAAE,CAAC;IACX,CAAC;AACF,CAAC;AAeD,KAAK,UAAU,cAAc,CAC5B,QAAgB,EAChB,WAAwB;IAExB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAClD,MAAM,IAAI,GAAiB,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAE/C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;IAEjC,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC7B,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC7B,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,IAAI,CAAC;QACzC,IAAI,CAAC;YACJ,IAAI,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC;gBAClC,SAAS;YACV,CAAC;YAED,MAAM,WAAW,GAAG,MAAM,oBAAoB,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,CAAC,WAAW,EAAE,CAAC;gBAClB,SAAS;YACV,CAAC;YAED,MAAM,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;YACpD,MAAM,OAAO,GAAG,kBAAkB,CAAC,WAAW,CAAC,CAAC;YAChD,MAAM,cAAc,GAAG,GAAG,OAAO,KAAK,CAAC;YACvC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,cAAc,CAAC,CAA
C;YAEzD,MAAM,eAAe,GAAG,MAAM,KAAK,KAAK,GAAG;;;;;;EAM5C,KAAK;;;;EAIL,WAAW;CACZ,CAAC;YAEC,MAAM,SAAS,CAAC,UAAU,EAAE,eAAe,EAAE,OAAO,CAAC,CAAC;YACtD,OAAO,CAAC,GAAG,CAAC,aAAa,UAAU,EAAE,CAAC,CAAC;YAEvC,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC;QAAC,OAAO,MAAM,EAAE,CAAC;YACjB,OAAO,CAAC,KAAK,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAC;QACtC,CAAC;IACF,CAAC;AACF,CAAC;AAED,KAAK,UAAU,IAAI;IAClB,MAAM,GAAG,GAAG;QACX,YAAY,EAAE,OAAO,CAAC,GAAG,CAAC,YAAY;QACtC,iBAAiB,EAAE,OAAO,CAAC,GAAG,CAAC,iBAAiB;QAChD,kBAAkB,EAAE,OAAO,CAAC,GAAG,CAAC,kBAAkB;KACzC,CAAC;IAEX,IAAI,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IAChE,CAAC;IAED,MAAM,mBAAmB,GAAG,qBAAqB,CAAC;QACjD,GAAG,EAAE,GAAG,CAAC,YAAY,IAAI,EAAE;QAC3B,OAAO,EAAE,GAAG,CAAC,iBAAiB,IAAI,EAAE;QACpC,QAAQ,EAAE,GAAG,CAAC,kBAAkB,IAAI,EAAE;KACtC,CAAC,CAAC;IAEH,IAAI,CAAC;QACJ,MAAM,WAAW,GAAG,MAAM,eAAe,EAAE,CAAC;QAE5C,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,CAAC;QAC1C,MAAM,aAAa,GAAG,SAAS;aAC7B,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;aACxC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;QAE3C,KAAK,MAAM,QAAQ,IAAI,aAAa,EAAE,CAAC;YACtC,OAAO,CAAC,GAAG,CAAC,eAAe,QAAQ,EAAE,CAAC,CAAC;YACvC,MAAM,cAAc,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,eAAe,CAAC,WAAW,CAAC,CAAC;QACnC,MAAM,mBAAmB,CAAC,UAAU,CAAC,+BAA+B,EAAE;YACrE,MAAM,EAAE,qBAAqB;SAC7B,CAAC,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,wBAAwB,EAAE,KAAK,CAAC,CAAC;QAC/C,MAAM,mBAAmB,CAAC,WAAW,CACpC,+BAA+B,KAAK,EAAE,EACtC;YACC,MAAM,EAAE,qBAAqB;SAC7B,CACD,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjB,CAAC;AACF,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACtB,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACjB,CAAC,CAAC,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@s-hirano-ist/s-scripts",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "CLI scripts for s-private data operations",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -9,6 +9,8 @@
|
|
|
9
9
|
"fetch-images": "./dist/fetch-images.js",
|
|
10
10
|
"fetch-notes": "./dist/fetch-notes.js",
|
|
11
11
|
"find-duplicate-json-articles": "./dist/find-duplicate-json-articles.js",
|
|
12
|
+
"rag-ingest": "./dist/rag/ingest.js",
|
|
13
|
+
"rag-search": "./dist/rag/search.js",
|
|
12
14
|
"reset-articles": "./dist/reset-articles.js",
|
|
13
15
|
"reset-books": "./dist/reset-books.js",
|
|
14
16
|
"reset-images": "./dist/reset-images.js",
|
|
@@ -35,12 +37,15 @@
|
|
|
35
37
|
"access": "public"
|
|
36
38
|
},
|
|
37
39
|
"dependencies": {
|
|
40
|
+
"@huggingface/transformers": "3.5.1",
|
|
41
|
+
"@qdrant/js-client-rest": "1.13.0",
|
|
42
|
+
"glob": "11.0.2",
|
|
38
43
|
"jsdom": "26.0.0",
|
|
39
44
|
"minio": "8.0.5",
|
|
40
45
|
"turndown": "7.2.0",
|
|
41
|
-
"@s-hirano-ist/s-core": "1.
|
|
42
|
-
"@s-hirano-ist/s-database": "1.
|
|
43
|
-
"@s-hirano-ist/s-notification": "1.
|
|
46
|
+
"@s-hirano-ist/s-core": "1.6.0",
|
|
47
|
+
"@s-hirano-ist/s-database": "1.6.0",
|
|
48
|
+
"@s-hirano-ist/s-notification": "1.6.0"
|
|
44
49
|
},
|
|
45
50
|
"devDependencies": {
|
|
46
51
|
"@types/jsdom": "21.1.7",
|
|
@@ -58,6 +63,8 @@
|
|
|
58
63
|
"fetch-images": "tsx src/fetch-images.ts",
|
|
59
64
|
"fetch-notes": "tsx src/fetch-notes.ts",
|
|
60
65
|
"find-duplicate-json-articles": "tsx src/find-duplicate-json-articles.ts",
|
|
66
|
+
"rag-ingest": "tsx src/rag/ingest.ts",
|
|
67
|
+
"rag-search": "tsx src/rag/search.ts",
|
|
61
68
|
"reset-articles": "tsx src/reset-articles.ts",
|
|
62
69
|
"reset-books": "tsx src/reset-books.ts",
|
|
63
70
|
"reset-images": "tsx src/reset-images.ts",
|