@nahisaho/katashiro-rag 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/dist/RAGEngine.d.ts +58 -0
  3. package/dist/RAGEngine.d.ts.map +1 -0
  4. package/dist/RAGEngine.js +97 -0
  5. package/dist/RAGEngine.js.map +1 -0
  6. package/dist/RAGPipeline.d.ts +162 -0
  7. package/dist/RAGPipeline.d.ts.map +1 -0
  8. package/dist/RAGPipeline.js +222 -0
  9. package/dist/RAGPipeline.js.map +1 -0
  10. package/dist/Retriever.d.ts +49 -0
  11. package/dist/Retriever.d.ts.map +1 -0
  12. package/dist/Retriever.js +96 -0
  13. package/dist/Retriever.js.map +1 -0
  14. package/dist/chunking/DocumentChunker.d.ts +47 -0
  15. package/dist/chunking/DocumentChunker.d.ts.map +1 -0
  16. package/dist/chunking/DocumentChunker.js +171 -0
  17. package/dist/chunking/DocumentChunker.js.map +1 -0
  18. package/dist/chunking/index.d.ts +5 -0
  19. package/dist/chunking/index.d.ts.map +1 -0
  20. package/dist/chunking/index.js +5 -0
  21. package/dist/chunking/index.js.map +1 -0
  22. package/dist/embedding/AzureOpenAIEmbeddingProvider.d.ts +63 -0
  23. package/dist/embedding/AzureOpenAIEmbeddingProvider.d.ts.map +1 -0
  24. package/dist/embedding/AzureOpenAIEmbeddingProvider.js +133 -0
  25. package/dist/embedding/AzureOpenAIEmbeddingProvider.js.map +1 -0
  26. package/dist/embedding/BaseEmbeddingProvider.d.ts +43 -0
  27. package/dist/embedding/BaseEmbeddingProvider.d.ts.map +1 -0
  28. package/dist/embedding/BaseEmbeddingProvider.js +98 -0
  29. package/dist/embedding/BaseEmbeddingProvider.js.map +1 -0
  30. package/dist/embedding/EmbeddingFactory.d.ts +75 -0
  31. package/dist/embedding/EmbeddingFactory.d.ts.map +1 -0
  32. package/dist/embedding/EmbeddingFactory.js +153 -0
  33. package/dist/embedding/EmbeddingFactory.js.map +1 -0
  34. package/dist/embedding/EmbeddingManager.d.ts +41 -0
  35. package/dist/embedding/EmbeddingManager.d.ts.map +1 -0
  36. package/dist/embedding/EmbeddingManager.js +93 -0
  37. package/dist/embedding/EmbeddingManager.js.map +1 -0
  38. package/dist/embedding/MockEmbeddingProvider.d.ts +54 -0
  39. package/dist/embedding/MockEmbeddingProvider.d.ts.map +1 -0
  40. package/dist/embedding/MockEmbeddingProvider.js +91 -0
  41. package/dist/embedding/MockEmbeddingProvider.js.map +1 -0
  42. package/dist/embedding/OllamaEmbeddingProvider.d.ts +69 -0
  43. package/dist/embedding/OllamaEmbeddingProvider.d.ts.map +1 -0
  44. package/dist/embedding/OllamaEmbeddingProvider.js +136 -0
  45. package/dist/embedding/OllamaEmbeddingProvider.js.map +1 -0
  46. package/dist/embedding/OpenAIEmbeddingProvider.d.ts +83 -0
  47. package/dist/embedding/OpenAIEmbeddingProvider.d.ts.map +1 -0
  48. package/dist/embedding/OpenAIEmbeddingProvider.js +150 -0
  49. package/dist/embedding/OpenAIEmbeddingProvider.js.map +1 -0
  50. package/dist/embedding/index.d.ts +16 -0
  51. package/dist/embedding/index.d.ts.map +1 -0
  52. package/dist/embedding/index.js +15 -0
  53. package/dist/embedding/index.js.map +1 -0
  54. package/dist/index.d.ts +20 -0
  55. package/dist/index.d.ts.map +1 -0
  56. package/dist/index.js +22 -0
  57. package/dist/index.js.map +1 -0
  58. package/dist/reranking/LLMReranker.d.ts +147 -0
  59. package/dist/reranking/LLMReranker.d.ts.map +1 -0
  60. package/dist/reranking/LLMReranker.js +262 -0
  61. package/dist/reranking/LLMReranker.js.map +1 -0
  62. package/dist/reranking/index.d.ts +7 -0
  63. package/dist/reranking/index.d.ts.map +1 -0
  64. package/dist/reranking/index.js +7 -0
  65. package/dist/reranking/index.js.map +1 -0
  66. package/dist/types.d.ts +144 -0
  67. package/dist/types.d.ts.map +1 -0
  68. package/dist/types.js +8 -0
  69. package/dist/types.js.map +1 -0
  70. package/dist/vectordb/FileVectorStore.d.ts +93 -0
  71. package/dist/vectordb/FileVectorStore.d.ts.map +1 -0
  72. package/dist/vectordb/FileVectorStore.js +218 -0
  73. package/dist/vectordb/FileVectorStore.js.map +1 -0
  74. package/dist/vectordb/InMemoryVectorStore.d.ts +48 -0
  75. package/dist/vectordb/InMemoryVectorStore.d.ts.map +1 -0
  76. package/dist/vectordb/InMemoryVectorStore.js +86 -0
  77. package/dist/vectordb/InMemoryVectorStore.js.map +1 -0
  78. package/dist/vectordb/index.d.ts +8 -0
  79. package/dist/vectordb/index.d.ts.map +1 -0
  80. package/dist/vectordb/index.js +6 -0
  81. package/dist/vectordb/index.js.map +1 -0
  82. package/package.json +37 -0
  83. package/src/RAGEngine.ts +127 -0
  84. package/src/RAGPipeline.ts +357 -0
  85. package/src/Retriever.ts +121 -0
  86. package/src/chunking/DocumentChunker.ts +207 -0
  87. package/src/chunking/index.ts +5 -0
  88. package/src/embedding/AzureOpenAIEmbeddingProvider.ts +208 -0
  89. package/src/embedding/BaseEmbeddingProvider.ts +133 -0
  90. package/src/embedding/EmbeddingFactory.ts +225 -0
  91. package/src/embedding/EmbeddingManager.ts +110 -0
  92. package/src/embedding/MockEmbeddingProvider.ts +123 -0
  93. package/src/embedding/OllamaEmbeddingProvider.ts +197 -0
  94. package/src/embedding/OpenAIEmbeddingProvider.ts +226 -0
  95. package/src/embedding/index.ts +33 -0
  96. package/src/index.ts +55 -0
  97. package/src/reranking/LLMReranker.ts +401 -0
  98. package/src/reranking/index.ts +15 -0
  99. package/src/types.ts +157 -0
  100. package/src/vectordb/FileVectorStore.ts +289 -0
  101. package/src/vectordb/InMemoryVectorStore.ts +121 -0
  102. package/src/vectordb/index.ts +9 -0
@@ -0,0 +1,357 @@
1
+ /**
2
+ * RAG Pipeline - End-to-End RAG処理パイプライン
3
+ *
4
+ * 検索→コンテキスト構築→回答生成を統合したパイプライン
5
+ *
6
+ * @requirement REQ-RAG-101
7
+ * @design DES-KATASHIRO-003-RAG §3.6
8
+ */
9
+
10
+ import { RAGEngine } from './RAGEngine.js';
11
+ import type {
12
+ Document,
13
+ Chunk,
14
+ SearchResult,
15
+ RAGEngineConfig,
16
+ EmbeddingProvider,
17
+ VectorStore,
18
+ } from './types.js';
19
+
20
+ // Structural type for the LLM client used by the pipeline; declared locally (instead of importing the real LLMProvider) to avoid a circular dependency. Any object with a compatible `generate` method satisfies it.
21
+ interface LLMProviderLike {
22
+ generate(request: {
23
+ messages: Array<{ role: string; content: string }>;
24
+ temperature?: number;
25
+ maxTokens?: number;
26
+ }): Promise<{
27
+ content: string;
28
+ usage?: {
29
+ promptTokens: number;
30
+ completionTokens: number;
31
+ totalTokens: number;
32
+ };
33
+ }>;
34
+ }
35
+
36
+ /**
37
+ * RAG pipeline configuration: the RAG engine settings (inherited) plus answer-generation options. Every field is optional; RAGPipeline's constructor fills in defaults.
38
+ */
39
+ export interface RAGPipelineConfig extends RAGEngineConfig {
40
+ /** System prompt sent as the `system` message (defaults to DEFAULT_RAG_SYSTEM_PROMPT). */
41
+ systemPrompt?: string;
42
+ /** User-prompt template containing the `{{contexts}}` and `{{query}}` placeholders (defaults to DEFAULT_CONTEXT_TEMPLATE). */
43
+ contextTemplate?: string;
44
+ /** Maximum length of the joined context text, in characters (default 8000). */
45
+ maxContextLength?: number;
46
+ /** Sampling temperature for answer generation (default 0.3). */
47
+ temperature?: number;
48
+ /** Maximum number of tokens requested for the answer (default 2048). */
49
+ maxAnswerTokens?: number;
50
+ /** Number of chunks to retrieve when a query does not specify topK (default 5). */
51
+ defaultTopK?: number;
52
+ /** Minimum score a search result needs when a query does not specify minScore (default 0.3). */
53
+ minSearchScore?: number;
54
+ /** Streaming flag (default false). NOTE(review): stored by RAGPipeline but not read by any of its methods in this file. */
55
+ enableStreaming?: boolean;
56
+ }
57
+
58
+ /**
59
+ * Result of one RAG pipeline query: the generated answer plus the retrieval data and statistics behind it.
60
+ */
61
+ export interface RAGPipelineResult {
62
+ /** Answer text returned by the LLM provider. */
63
+ answer: string;
64
+ /** Formatted context entries, one per retrieved chunk (not truncated to the context length limit). */
65
+ contexts: string[];
66
+ /** Search results the contexts were built from. */
67
+ retrievedChunks: SearchResult[];
68
+ /** The query string that was asked. */
69
+ query: string;
70
+ /** Token usage as reported by the LLM provider; absent when the provider reports none. */
71
+ tokenUsage?: {
72
+ promptTokens: number;
73
+ completionTokens: number;
74
+ totalTokens: number;
75
+ };
76
+ /** Wall-clock duration of the whole query, in milliseconds. */
77
+ processingTimeMs: number;
78
+ /** Extra statistics; RAGPipeline.query fills in topK, minScore, temperature, searchResultCount and contextLength. */
79
+ metadata?: Record<string, unknown>;
80
+ }
81
+
82
+ /**
83
+ * Per-query options for the RAG pipeline; each one overrides the corresponding pipeline-level setting for that call.
84
+ */
85
+ export interface RAGQueryOptions {
86
+ /** Number of chunks to retrieve (falls back to the pipeline's defaultTopK). */
87
+ topK?: number;
88
+ /** Minimum search score (falls back to the pipeline's minSearchScore). */
89
+ minScore?: number;
90
+ /** Sampling temperature (falls back to the pipeline's temperature). */
91
+ temperature?: number;
92
+ /** Extra context placed ahead of the rendered template under a `## 追加情報` heading. */
93
+ additionalContext?: string;
94
+ /** System prompt to use for this call instead of the configured one. */
95
+ systemPromptOverride?: string;
96
+ /** Earlier conversation turns, inserted between the system message and the new user message. */
97
+ conversationHistory?: Array<{ role: 'user' | 'assistant'; content: string }>;
98
+ }
99
+
100
+ /**
101
+ * Default system prompt (Japanese text). It tells the model to answer only from the supplied context, to say so when the context lacks the information, to be concise and accurate, and to avoid guesses and assumptions.
102
+ */
103
+ export const DEFAULT_RAG_SYSTEM_PROMPT = `あなたは与えられたコンテキスト情報に基づいて質問に回答するアシスタントです。
104
+ 以下のルールに従ってください:
105
+ 1. コンテキストに含まれる情報のみを使用して回答してください
106
+ 2. コンテキストに情報がない場合は、その旨を正直に伝えてください
107
+ 3. 回答は簡潔かつ正確にしてください
108
+ 4. 推測や仮定を避け、事実に基づいた回答をしてください`;
109
+
110
+ /**
111
+ * Default user-prompt template (Japanese text). `{{contexts}}` is replaced with the retrieved context text and `{{query}}` with the user's question.
112
+ */
113
+ export const DEFAULT_CONTEXT_TEMPLATE = `## 関連情報
114
+ {{contexts}}
115
+
116
+ ## 質問
117
+ {{query}}
118
+
119
+ 上記の情報に基づいて質問に回答してください。`;
120
+
121
/**
 * End-to-end RAG pipeline: retrieval, context construction and answer generation.
 *
 * Wraps a RAGEngine (ingestion and retrieval) together with an LLM provider so
 * that a question can be turned into a context-grounded answer in one call.
 */
export class RAGPipeline {
  private ragEngine: RAGEngine;
  private llmProvider: LLMProviderLike;
  /** Pipeline-level options with every default resolved. */
  private config: Required<Omit<RAGPipelineConfig, keyof RAGEngineConfig>>;
  /** Engine-level options as supplied by the caller; reported back by getConfig(). */
  private ragEngineConfig: RAGEngineConfig;

  /**
   * @param embeddingProvider - Embedding backend handed to the RAG engine.
   * @param vectorStore - Vector store handed to the RAG engine.
   * @param llmProvider - LLM client used to generate answers.
   * @param config - Pipeline options; every key that is not a pipeline option is forwarded to the RAG engine.
   */
  constructor(
    embeddingProvider: EmbeddingProvider,
    vectorStore: VectorStore,
    llmProvider: LLMProviderLike,
    config: RAGPipelineConfig = {},
  ) {
    const {
      systemPrompt,
      contextTemplate,
      maxContextLength,
      temperature,
      maxAnswerTokens,
      defaultTopK,
      minSearchScore,
      enableStreaming,
      ...ragEngineConfig
    } = config;

    this.ragEngine = new RAGEngine(embeddingProvider, vectorStore, ragEngineConfig);
    this.llmProvider = llmProvider;
    this.ragEngineConfig = ragEngineConfig;

    this.config = {
      systemPrompt: systemPrompt ?? DEFAULT_RAG_SYSTEM_PROMPT,
      contextTemplate: contextTemplate ?? DEFAULT_CONTEXT_TEMPLATE,
      maxContextLength: maxContextLength ?? 8000,
      temperature: temperature ?? 0.3,
      maxAnswerTokens: maxAnswerTokens ?? 2048,
      defaultTopK: defaultTopK ?? 5,
      minSearchScore: minSearchScore ?? 0.3,
      enableStreaming: enableStreaming ?? false,
    };
  }

  /**
   * Ingests a single document through the RAG engine.
   *
   * @returns The chunks the engine produced for the document.
   */
  async ingest(document: Document): Promise<Chunk[]> {
    return this.ragEngine.ingest(document);
  }

  /**
   * Ingests several documents through the RAG engine.
   *
   * @returns The chunks the engine produced for the documents.
   */
  async ingestBatch(documents: Document[]): Promise<Chunk[]> {
    return this.ragEngine.ingestBatch(documents);
  }

  /**
   * Runs a full RAG query: search, build the context, then generate an answer.
   *
   * The per-call `topK`/`minScore` are written into the engine's retriever
   * configuration before searching.
   * NOTE(review): that setting presumably persists after the call and may be
   * shared by concurrent queries — confirm against RAGEngine.
   *
   * @param query - The user's question.
   * @param options - Per-call overrides of the pipeline settings.
   * @returns The answer together with the retrieved chunks and statistics.
   */
  async query(query: string, options: RAGQueryOptions = {}): Promise<RAGPipelineResult> {
    const startTime = Date.now();

    // 1. Retrieval
    const topK = options.topK ?? this.config.defaultTopK;
    const minScore = options.minScore ?? this.config.minSearchScore;

    this.ragEngine.updateRetrieverConfig({ topK, minScore });
    const searchResults = await this.ragEngine.query(query);

    // 2. Context construction (only the joined text is length-limited)
    const contexts = this.buildContexts(searchResults);
    const contextText = this.truncateContext(contexts.join('\n\n'));

    // 3. Prompts
    const systemPrompt = options.systemPromptOverride ?? this.config.systemPrompt;
    const userPrompt = this.buildUserPrompt(query, contextText, options.additionalContext);

    // 4. Messages: system prompt, prior conversation turns, then the new question
    const messages: Array<{ role: string; content: string }> = [
      { role: 'system', content: systemPrompt },
      ...(options.conversationHistory ?? []).map(({ role, content }) => ({ role, content })),
      { role: 'user', content: userPrompt },
    ];

    // 5. Answer generation
    const temperature = options.temperature ?? this.config.temperature;
    const response = await this.llmProvider.generate({
      messages,
      temperature,
      maxTokens: this.config.maxAnswerTokens,
    });

    const processingTimeMs = Date.now() - startTime;

    return {
      answer: response.content,
      contexts,
      retrievedChunks: searchResults,
      query,
      tokenUsage: response.usage,
      processingTimeMs,
      metadata: {
        topK,
        minScore,
        temperature,
        searchResultCount: searchResults.length,
        contextLength: contextText.length,
      },
    };
  }

  /**
   * Conversational query: same as {@link RAGPipeline.query} with the supplied
   * history inserted before the new question. The history is not stored or
   * updated by the pipeline; the caller owns it.
   */
  async chat(
    query: string,
    conversationHistory: Array<{ role: 'user' | 'assistant'; content: string }>,
    options: Omit<RAGQueryOptions, 'conversationHistory'> = {},
  ): Promise<RAGPipelineResult> {
    return this.query(query, { ...options, conversationHistory });
  }

  /**
   * Retrieval only (no answer generation).
   *
   * @param query - Search text.
   * @param topK - Number of results; when omitted (or 0) the retriever's
   *   current setting is left untouched.
   */
  async search(query: string, topK?: number): Promise<SearchResult[]> {
    if (topK) {
      this.ragEngine.updateRetrieverConfig({ topK });
    }
    return this.ragEngine.query(query);
  }

  /**
   * Formats each search result as a labelled context entry
   * (`[情報源 n] (score, source)` header followed by the chunk text).
   */
  private buildContexts(searchResults: SearchResult[]): string[] {
    return searchResults.map((result, index) => {
      const metadata = result.chunk.metadata;
      const source = metadata.source ?? metadata.documentId ?? 'unknown';
      return `[情報源 ${index + 1}] (score: ${result.score.toFixed(2)}, source: ${source})\n${result.chunk.content}`;
    });
  }

  /**
   * Renders the context template into the user prompt.
   *
   * Placeholders are substituted in a single pass with a replacer function, so
   * that (a) `$&`, `$1`, `$'` etc. inside the retrieved text or the question
   * are inserted literally instead of being treated as replacement patterns,
   * and (b) a `{{query}}`/`{{contexts}}` that happens to appear inside the
   * inserted text is never substituted itself. Every occurrence of a
   * placeholder in the template is replaced.
   */
  private buildUserPrompt(query: string, contextText: string, additionalContext?: string): string {
    const prompt = this.config.contextTemplate.replace(
      /\{\{(contexts|query)\}\}/g,
      (_placeholder: string, key: string) => (key === 'contexts' ? contextText : query),
    );

    return additionalContext ? `## 追加情報\n${additionalContext}\n\n${prompt}` : prompt;
  }

  /**
   * Cuts the context down to `maxContextLength` characters, preferring to end
   * on a sentence or line boundary, and appends an omission marker.
   */
  private truncateContext(context: string): string {
    const limit = this.config.maxContextLength;
    if (context.length <= limit) {
      return context;
    }

    let truncated = context.slice(0, limit);

    // Do not leave half of a surrogate pair (emoji, rare CJK) at the cut point.
    const lastCode = truncated.charCodeAt(truncated.length - 1);
    if (lastCode >= 0xd800 && lastCode <= 0xdbff) {
      truncated = truncated.slice(0, -1);
    }

    const lastSentenceEnd = Math.max(
      truncated.lastIndexOf('。'),
      truncated.lastIndexOf('.'),
      truncated.lastIndexOf('\n'),
    );

    // Only snap back to a boundary when that keeps more than 80% of the budget.
    if (lastSentenceEnd > limit * 0.8) {
      return truncated.slice(0, lastSentenceEnd + 1) + '\n[...以下省略...]';
    }

    return truncated + '...\n[...以下省略...]';
  }

  /**
   * Deletes a single chunk via the RAG engine.
   *
   * @returns The engine's result for the deletion.
   */
  async deleteChunk(chunkId: string): Promise<boolean> {
    return this.ragEngine.deleteChunk(chunkId);
  }

  /**
   * Deletes a document's chunks via the RAG engine.
   *
   * @param documentId - Id of the document.
   * @param chunkCount - Chunk count passed through to the engine.
   * @returns The number reported by the engine.
   */
  async deleteDocument(documentId: string, chunkCount: number): Promise<number> {
    return this.ragEngine.deleteDocument(documentId, chunkCount);
  }

  /**
   * Applies a partial configuration update. Fields that are `undefined` are
   * left unchanged.
   */
  updateConfig(config: Partial<RAGPipelineConfig>): void {
    if (config.systemPrompt !== undefined) {
      this.config.systemPrompt = config.systemPrompt;
    }
    if (config.contextTemplate !== undefined) {
      this.config.contextTemplate = config.contextTemplate;
    }
    if (config.maxContextLength !== undefined) {
      this.config.maxContextLength = config.maxContextLength;
    }
    if (config.temperature !== undefined) {
      this.config.temperature = config.temperature;
    }
    if (config.maxAnswerTokens !== undefined) {
      this.config.maxAnswerTokens = config.maxAnswerTokens;
    }
    if (config.defaultTopK !== undefined) {
      this.config.defaultTopK = config.defaultTopK;
    }
    if (config.minSearchScore !== undefined) {
      this.config.minSearchScore = config.minSearchScore;
    }
    if (config.enableStreaming !== undefined) {
      this.config.enableStreaming = config.enableStreaming;
    }

    // Engine-level settings: forward them and keep the local snapshot in step
    // so getConfig() does not report stale values.
    // NOTE(review): the snapshot is shallow-merged; this assumes the engine
    // merges partial updates as well — confirm against RAGEngine.
    const { chunking, retriever } = config;
    if (chunking) {
      this.ragEngine.updateChunkingConfig(chunking);
      this.ragEngineConfig = {
        ...this.ragEngineConfig,
        chunking: { ...this.ragEngineConfig.chunking, ...chunking },
      };
    }
    if (retriever) {
      this.ragEngine.updateRetrieverConfig(retriever);
      this.ragEngineConfig = {
        ...this.ragEngineConfig,
        retriever: { ...this.ragEngineConfig.retriever, ...retriever },
      };
    }
  }

  /**
   * Returns a shallow copy of the current pipeline and engine configuration.
   */
  getConfig(): Readonly<RAGPipelineConfig> {
    return { ...this.config, ...this.ragEngineConfig };
  }
}
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Retriever - 検索エンジン
3
+ *
4
+ * @requirement REQ-RAG-004
5
+ * @design DES-KATASHIRO-003-RAG §3.4
6
+ */
7
+
8
+ import { EmbeddingManager } from './embedding/EmbeddingManager.js';
9
+ import type { Chunk, Document, EmbeddingProvider, RetrieverConfig, SearchResult, VectorStore } from './types.js';
10
+
11
+ /**
12
+ * Default retrieval settings, used for any field the caller's RetrieverConfig leaves out: return up to 5 results and drop results scoring below 0.5.
13
+ */
14
+ const DEFAULT_CONFIG: Required<RetrieverConfig> = {
15
+ topK: 5,
16
+ minScore: 0.5,
17
+ };
18
+
19
/**
 * Retriever: combines embedding generation with vector-store search.
 *
 * Chunks are embedded and written to the vector store on indexing; queries are
 * embedded, searched with `topK`, and then filtered by `minScore`.
 */
export class Retriever {
  private embeddingManager: EmbeddingManager;
  private vectorStore: VectorStore;
  private config: Required<RetrieverConfig>;

  /**
   * @param embeddingProvider - Embedding backend, wrapped in an EmbeddingManager.
   * @param vectorStore - Store that holds the chunk vectors.
   * @param config - Retrieval settings; missing or `undefined` fields fall back to the defaults.
   */
  constructor(
    embeddingProvider: EmbeddingProvider,
    vectorStore: VectorStore,
    config: RetrieverConfig = {},
  ) {
    this.embeddingManager = new EmbeddingManager(embeddingProvider);
    this.vectorStore = vectorStore;
    // Resolve field by field so an explicit `undefined` cannot erase a default.
    this.config = {
      topK: config.topK ?? DEFAULT_CONFIG.topK,
      minScore: config.minScore ?? DEFAULT_CONFIG.minScore,
    };
  }

  /**
   * Embeds a document's chunks and adds them to the vector store in one batch.
   *
   * @param _document - The source document (currently unused).
   * @param chunks - Chunks to index; an empty list is a no-op.
   */
  async addDocument(_document: Document, chunks: Chunk[]): Promise<void> {
    if (chunks.length === 0) {
      return;
    }

    const items = await Promise.all(
      chunks.map(async (chunk) => ({
        chunk,
        vector: await this.embeddingManager.embed(chunk.content),
      })),
    );

    await this.vectorStore.addBatch(items);
  }

  /**
   * Indexes several documents, one after another (one store batch per document).
   */
  async addDocuments(documents: Array<{ document: Document; chunks: Chunk[] }>): Promise<void> {
    for (const { document, chunks } of documents) {
      await this.addDocument(document, chunks);
    }
  }

  /**
   * Searches the vector store for the query text.
   *
   * @returns Up to `topK` results whose score is at least `minScore`.
   */
  async search(query: string): Promise<SearchResult[]> {
    // Snapshot the settings before the first await: updateConfig() may be
    // called while the embedding is in flight and must not alter this search.
    const { topK, minScore } = this.config;

    const queryVector = await this.embeddingManager.embed(query);
    const results = await this.vectorStore.search(queryVector, topK);

    return results.filter((r) => r.score >= minScore);
  }

  /**
   * Runs several queries and merges the results.
   *
   * A chunk returned by more than one query appears once, with the highest
   * score it achieved. Results are sorted by descending score and limited to
   * `topK`.
   */
  async searchMultiple(queries: string[]): Promise<SearchResult[]> {
    const { topK } = this.config;
    const bestByChunk = new Map<string, SearchResult>();

    for (const query of queries) {
      for (const result of await this.search(query)) {
        const existing = bestByChunk.get(result.chunk.id);
        if (existing === undefined || result.score > existing.score) {
          bestByChunk.set(result.chunk.id, result);
        }
      }
    }

    return [...bestByChunk.values()].sort((a, b) => b.score - a.score).slice(0, topK);
  }

  /**
   * Deletes a chunk from the vector store.
   *
   * @returns The store's result for the deletion.
   */
  async deleteChunk(chunkId: string): Promise<boolean> {
    return this.vectorStore.delete(chunkId);
  }

  /**
   * Returns a copy of the current retrieval settings.
   */
  getConfig(): Required<RetrieverConfig> {
    return { ...this.config };
  }

  /**
   * Merges a partial update into the retrieval settings. Fields that are
   * missing or `undefined` keep their current value.
   */
  updateConfig(config: Partial<RetrieverConfig>): void {
    this.config = {
      topK: config.topK ?? this.config.topK,
      minScore: config.minScore ?? this.config.minScore,
    };
  }
}
@@ -0,0 +1,207 @@
1
+ /**
2
+ * Document Chunker - ドキュメント分割
3
+ *
4
+ * @requirement REQ-RAG-003
5
+ * @design DES-KATASHIRO-003-RAG §3.3
6
+ */
7
+
8
+ import type { Chunk, ChunkingConfig, Document } from '../types.js';
9
+
10
+ /**
11
+ * Default chunking settings, used for any field the caller's ChunkingConfig leaves out: fixed-size strategy, 512-character chunks with a 64-character overlap.
12
+ */
13
+ const DEFAULT_CONFIG: Required<ChunkingConfig> = {
14
+ strategy: 'fixed',
15
+ chunkSize: 512,
16
+ chunkOverlap: 64,
17
+ separators: ['\n\n', '\n', '. ', ' '],
18
+ };
19
+
20
/**
 * Splits documents into chunks.
 *
 * Three strategies are supported: `fixed` (sliding character window),
 * `sentence` and `paragraph` (whole sentences/paragraphs packed up to
 * `chunkSize` characters, with the tail of each chunk carried over as overlap).
 * The `separators` setting is stored but not consulted by any strategy here.
 */
export class DocumentChunker {
  /**
   * Matches one sentence: optional text, one or more terminators (Japanese
   * full stop, ASCII `.!?`, full-width `!?`) and trailing whitespace — or the
   * unterminated tail of the text. The matches tile the input without gaps.
   */
  private static readonly SENTENCE_PATTERN =
    /[^。.!?\uFF01\uFF1F]*[。.!?\uFF01\uFF1F]+\s*|[^。.!?\uFF01\uFF1F]+$/g;

  private config: Required<ChunkingConfig>;

  /**
   * @param config - Chunking settings; missing or `undefined` fields fall back to the defaults.
   */
  constructor(config: ChunkingConfig = {}) {
    // Resolve field by field so an explicit `undefined` cannot erase a default.
    this.config = {
      strategy: config.strategy ?? DEFAULT_CONFIG.strategy,
      chunkSize: config.chunkSize ?? DEFAULT_CONFIG.chunkSize,
      chunkOverlap: config.chunkOverlap ?? DEFAULT_CONFIG.chunkOverlap,
      separators: config.separators ?? DEFAULT_CONFIG.separators,
    };
  }

  /**
   * Splits one document into chunks using the configured strategy
   * (anything other than `sentence`/`paragraph` is treated as `fixed`).
   */
  chunk(document: Document): Chunk[] {
    const text = document.content;

    switch (this.config.strategy) {
      case 'sentence':
        return this.chunkBySentence(document, text);
      case 'paragraph':
        return this.chunkByParagraph(document, text);
      case 'fixed':
      default:
        return this.chunkByFixed(document, text);
    }
  }

  /**
   * Splits several documents and returns all chunks in document order.
   */
  chunkBatch(documents: Document[]): Chunk[] {
    return documents.flatMap((doc) => this.chunk(doc));
  }

  /**
   * Fixed-size strategy: a window of `chunkSize` characters advanced by
   * `chunkSize - chunkOverlap`. Whitespace-only windows are skipped.
   */
  private chunkByFixed(document: Document, text: string): Chunk[] {
    const chunks: Chunk[] = [];
    const { chunkSize, chunkOverlap } = this.config;
    // Keep the stride at 1 or more so the loop always advances.
    const stride = Math.max(1, chunkSize - chunkOverlap);

    for (let position = 0; position < text.length; position += stride) {
      const content = text.slice(position, position + chunkSize);

      if (content.trim().length > 0) {
        chunks.push(this.createChunk(document, content, chunks.length, position));
      }

      // This window already reached the end of the text; any further window
      // would only repeat part of it.
      if (position + chunkSize >= text.length) {
        break;
      }
    }

    return chunks;
  }

  /**
   * Sentence strategy: packs whole sentences into chunks of roughly
   * `chunkSize` characters.
   */
  private chunkBySentence(document: Document, text: string): Chunk[] {
    return this.packSegments(document, this.splitBySentence(text));
  }

  /**
   * Paragraph strategy: packs whole paragraphs (separated by blank lines)
   * into chunks of roughly `chunkSize` characters. Paragraphs are re-joined
   * with `\n\n`, so positions are relative to that normalized text and are
   * only approximate with respect to the original document.
   */
  private chunkByParagraph(document: Document, text: string): Chunk[] {
    const paragraphs = text
      .split(/\n\s*\n/)
      .filter((p) => p.trim().length > 0)
      .map((p) => p + '\n\n');

    return this.packSegments(document, paragraphs);
  }

  /**
   * Greedily packs segments into chunks. A chunk is closed when adding the
   * next segment would exceed `chunkSize` (a single oversized segment still
   * becomes one chunk); the last `chunkOverlap` characters of the closed chunk
   * start the next one. Chunk content is trimmed and empty chunks are skipped.
   */
  private packSegments(document: Document, segments: string[]): Chunk[] {
    const { chunkSize, chunkOverlap } = this.config;
    const chunks: Chunk[] = [];
    let current = '';
    let currentPosition = 0;

    const flush = (): void => {
      const content = current.trim();
      if (content.length === 0) {
        return;
      }
      // Shift the start past leading whitespace removed by trim().
      const leading = current.length - current.trimStart().length;
      chunks.push(this.createChunk(document, content, chunks.length, currentPosition + leading));
    };

    for (const segment of segments) {
      if (current.length > 0 && current.length + segment.length > chunkSize) {
        flush();

        const overlap = this.getOverlapText(current, chunkOverlap);
        currentPosition += current.length - overlap.length;
        current = overlap;
      }

      current += segment;
    }

    flush();
    return chunks;
  }

  /**
   * Splits text into sentences. Concatenating the returned pieces reproduces
   * the input exactly; text without any terminator is returned as one piece.
   */
  private splitBySentence(text: string): string[] {
    return text.match(DocumentChunker.SENTENCE_PATTERN) ?? [text];
  }

  /**
   * Returns the last `overlapSize` characters of `text` (all of it when it is
   * shorter), or an empty string when `overlapSize` is zero or negative.
   */
  private getOverlapText(text: string, overlapSize: number): string {
    // `slice(-0)` is `slice(0)` and would return the whole text.
    if (overlapSize <= 0) {
      return '';
    }
    if (text.length <= overlapSize) {
      return text;
    }
    return text.slice(-overlapSize);
  }

  /**
   * Builds a chunk record. The id is `<documentId>_chunk_<index>`; the
   * document's metadata is copied and extended with the chunk index and its
   * start/end positions.
   */
  private createChunk(document: Document, content: string, index: number, position: number): Chunk {
    return {
      id: `${document.id}_chunk_${index}`,
      documentId: document.id,
      content,
      metadata: {
        ...document.metadata,
        chunkIndex: index,
        startPosition: position,
        endPosition: position + content.length,
      },
    };
  }
}
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Chunking module exports
3
+ */
4
+
5
+ export { DocumentChunker } from './DocumentChunker.js';