@robthepcguy/rag-vault 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +24 -0
- package/README.md +421 -0
- package/dist/bin/install-skills.d.ts +20 -0
- package/dist/bin/install-skills.d.ts.map +1 -0
- package/dist/bin/install-skills.js +196 -0
- package/dist/bin/install-skills.js.map +1 -0
- package/dist/chunker/index.d.ts +11 -0
- package/dist/chunker/index.d.ts.map +1 -0
- package/dist/chunker/index.js +6 -0
- package/dist/chunker/index.js.map +1 -0
- package/dist/chunker/semantic-chunker.d.ts +96 -0
- package/dist/chunker/semantic-chunker.d.ts.map +1 -0
- package/dist/chunker/semantic-chunker.js +267 -0
- package/dist/chunker/semantic-chunker.js.map +1 -0
- package/dist/chunker/sentence-splitter.d.ts +16 -0
- package/dist/chunker/sentence-splitter.d.ts.map +1 -0
- package/dist/chunker/sentence-splitter.js +114 -0
- package/dist/chunker/sentence-splitter.js.map +1 -0
- package/dist/embedder/index.d.ts +55 -0
- package/dist/embedder/index.d.ts.map +1 -0
- package/dist/embedder/index.js +146 -0
- package/dist/embedder/index.js.map +1 -0
- package/dist/errors/index.d.ts +73 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +170 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +91 -0
- package/dist/index.js.map +1 -0
- package/dist/parser/html-parser.d.ts +14 -0
- package/dist/parser/html-parser.d.ts.map +1 -0
- package/dist/parser/html-parser.js +99 -0
- package/dist/parser/html-parser.js.map +1 -0
- package/dist/parser/index.d.ts +144 -0
- package/dist/parser/index.d.ts.map +1 -0
- package/dist/parser/index.js +446 -0
- package/dist/parser/index.js.map +1 -0
- package/dist/parser/pdf-filter.d.ts +89 -0
- package/dist/parser/pdf-filter.d.ts.map +1 -0
- package/dist/parser/pdf-filter.js +304 -0
- package/dist/parser/pdf-filter.js.map +1 -0
- package/dist/server/index.d.ts +144 -0
- package/dist/server/index.d.ts.map +1 -0
- package/dist/server/index.js +518 -0
- package/dist/server/index.js.map +1 -0
- package/dist/server/raw-data-utils.d.ts +81 -0
- package/dist/server/raw-data-utils.d.ts.map +1 -0
- package/dist/server/raw-data-utils.js +196 -0
- package/dist/server/raw-data-utils.js.map +1 -0
- package/dist/server/schemas.d.ts +186 -0
- package/dist/server/schemas.d.ts.map +1 -0
- package/dist/server/schemas.js +99 -0
- package/dist/server/schemas.js.map +1 -0
- package/dist/utils/config-parsers.d.ts +14 -0
- package/dist/utils/config-parsers.d.ts.map +1 -0
- package/dist/utils/config-parsers.js +47 -0
- package/dist/utils/config-parsers.js.map +1 -0
- package/dist/utils/config.d.ts +37 -0
- package/dist/utils/config.d.ts.map +1 -0
- package/dist/utils/config.js +52 -0
- package/dist/utils/config.js.map +1 -0
- package/dist/utils/logger.d.ts +36 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +64 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/math.d.ts +34 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +73 -0
- package/dist/utils/math.js.map +1 -0
- package/dist/utils/process-handlers.d.ts +26 -0
- package/dist/utils/process-handlers.d.ts.map +1 -0
- package/dist/utils/process-handlers.js +69 -0
- package/dist/utils/process-handlers.js.map +1 -0
- package/dist/vectordb/index.d.ts +210 -0
- package/dist/vectordb/index.d.ts.map +1 -0
- package/dist/vectordb/index.js +613 -0
- package/dist/vectordb/index.js.map +1 -0
- package/dist/web/api-routes.d.ts +9 -0
- package/dist/web/api-routes.d.ts.map +1 -0
- package/dist/web/api-routes.js +127 -0
- package/dist/web/api-routes.js.map +1 -0
- package/dist/web/config-routes.d.ts +7 -0
- package/dist/web/config-routes.d.ts.map +1 -0
- package/dist/web/config-routes.js +54 -0
- package/dist/web/config-routes.js.map +1 -0
- package/dist/web/database-manager.d.ts +130 -0
- package/dist/web/database-manager.d.ts.map +1 -0
- package/dist/web/database-manager.js +382 -0
- package/dist/web/database-manager.js.map +1 -0
- package/dist/web/http-server.d.ts +28 -0
- package/dist/web/http-server.d.ts.map +1 -0
- package/dist/web/http-server.js +311 -0
- package/dist/web/http-server.js.map +1 -0
- package/dist/web/index.d.ts +3 -0
- package/dist/web/index.d.ts.map +1 -0
- package/dist/web/index.js +114 -0
- package/dist/web/index.js.map +1 -0
- package/dist/web/middleware/async-handler.d.ts +17 -0
- package/dist/web/middleware/async-handler.d.ts.map +1 -0
- package/dist/web/middleware/async-handler.js +26 -0
- package/dist/web/middleware/async-handler.js.map +1 -0
- package/dist/web/middleware/auth.d.ts +22 -0
- package/dist/web/middleware/auth.d.ts.map +1 -0
- package/dist/web/middleware/auth.js +81 -0
- package/dist/web/middleware/auth.js.map +1 -0
- package/dist/web/middleware/error-handler.d.ts +36 -0
- package/dist/web/middleware/error-handler.d.ts.map +1 -0
- package/dist/web/middleware/error-handler.js +68 -0
- package/dist/web/middleware/error-handler.js.map +1 -0
- package/dist/web/middleware/index.d.ts +6 -0
- package/dist/web/middleware/index.d.ts.map +1 -0
- package/dist/web/middleware/index.js +19 -0
- package/dist/web/middleware/index.js.map +1 -0
- package/dist/web/middleware/rate-limit.d.ts +38 -0
- package/dist/web/middleware/rate-limit.d.ts.map +1 -0
- package/dist/web/middleware/rate-limit.js +116 -0
- package/dist/web/middleware/rate-limit.js.map +1 -0
- package/dist/web/middleware/request-logger.d.ts +52 -0
- package/dist/web/middleware/request-logger.d.ts.map +1 -0
- package/dist/web/middleware/request-logger.js +74 -0
- package/dist/web/middleware/request-logger.js.map +1 -0
- package/dist/web/types.d.ts +6 -0
- package/dist/web/types.d.ts.map +1 -0
- package/dist/web/types.js +4 -0
- package/dist/web/types.js.map +1 -0
- package/package.json +135 -0
- package/skills/rag-vault/SKILL.md +111 -0
- package/skills/rag-vault/references/html-ingestion.md +73 -0
- package/skills/rag-vault/references/query-optimization.md +57 -0
- package/skills/rag-vault/references/result-refinement.md +54 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import type { EmbedderInterface } from '../chunker/semantic-chunker.js';
|
|
2
|
+
export type { EmbedderInterface };
|
|
3
|
+
/**
|
|
4
|
+
* Text item with position information from PDF
|
|
5
|
+
*/
|
|
6
|
+
export interface TextItemWithPosition {
|
|
7
|
+
/** Text content of the item; joined with neighbouring items when page lines are rebuilt. */
text: string;
|
|
8
|
+
/** Horizontal position; items on the same line are ordered by ascending x (left to right). */
x: number;
|
|
9
|
+
/** Vertical position; lines are ordered by descending y (top to bottom). */
y: number;
|
|
10
|
+
/** NOTE(review): presumably the glyph size reported by the PDF extractor — confirm against the parser. */
fontSize: number;
|
|
11
|
+
/** NOTE(review): presumably flags an item that ends a line in the extractor output — confirm against the parser. */
hasEOL: boolean;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Page data containing positioned text items
|
|
15
|
+
*/
|
|
16
|
+
export interface PageData {
|
|
17
|
+
/** NOTE(review): presumably the 1-based page number from the PDF — confirm against the parser. */
pageNum: number;
|
|
18
|
+
/** Positioned text items of the page; the filter re-sorts them by y and x, so input order is not significant. */
items: TextItemWithPosition[];
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Join filtered pages into text
|
|
22
|
+
*
|
|
23
|
+
* @param pages - Filtered page data
|
|
24
|
+
* @returns Joined text with proper line breaks
|
|
25
|
+
*/
|
|
26
|
+
export declare function joinFilteredPages(pages: PageData[]): string;
|
|
27
|
+
/**
|
|
28
|
+
* Configuration for sentence-level pattern detection
|
|
29
|
+
*/
|
|
30
|
+
export interface SentencePatternConfig {
|
|
31
|
+
/** Similarity threshold for pattern detection (default: 0.85); a median similarity at or above this value marks a header/footer */
|
|
32
|
+
similarityThreshold: number;
|
|
33
|
+
/** Minimum pages required for pattern detection (default: 3); also the minimum number of sampled first/last sentences needed */
|
|
34
|
+
minPages: number;
|
|
35
|
+
/** Number of pages to sample from center for pattern detection (default: 5) */
|
|
36
|
+
samplePages: number;
|
|
37
|
+
}
|
|
38
|
+
/** Default configuration for sentence-level pattern detection */
|
|
39
|
+
export declare const DEFAULT_SENTENCE_PATTERN_CONFIG: SentencePatternConfig;
|
|
40
|
+
/**
|
|
41
|
+
* Result of sentence-level pattern detection
|
|
42
|
+
*/
|
|
43
|
+
export interface SentencePatternResult {
|
|
44
|
+
/** Whether first sentences should be removed (detected as header) */
|
|
45
|
+
removeFirstSentence: boolean;
|
|
46
|
+
/** Whether last sentences should be removed (detected as footer) */
|
|
47
|
+
removeLastSentence: boolean;
|
|
48
|
+
/** Median pairwise similarity of the sampled first sentences; stays 0 when too few were sampled to run detection */
|
|
49
|
+
headerSimilarity: number;
|
|
50
|
+
/** Median pairwise similarity of the sampled last sentences; stays 0 when too few were sampled to run detection */
|
|
51
|
+
footerSimilarity: number;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Detect header/footer patterns at sentence level
|
|
55
|
+
*
|
|
56
|
+
* Algorithm:
|
|
57
|
+
* 1. Sample pages from the CENTER of the document (guaranteed to be content pages)
|
|
58
|
+
* 2. Split each page into sentences with Y coordinate
|
|
59
|
+
* 3. Collect first/last sentences from sampled pages
|
|
60
|
+
* 4. Embed and calculate median pairwise similarity
|
|
61
|
+
* 5. If similarity >= threshold, mark as header/footer
|
|
62
|
+
*
|
|
63
|
+
* Key insight: Middle pages are always content pages (cover, TOC, index are at edges).
|
|
64
|
+
* Using median instead of mean provides robustness against outliers.
|
|
65
|
+
*
|
|
66
|
+
* This approach handles variable content like page numbers ("7 of 75")
|
|
67
|
+
* by using semantic similarity instead of exact text matching.
|
|
68
|
+
*
|
|
69
|
+
* @param pages - Array of page data
|
|
70
|
+
* @param embedder - Embedder for generating embeddings
|
|
71
|
+
* @param config - Configuration options
|
|
72
|
+
* @returns Detection result
|
|
73
|
+
*/
|
|
74
|
+
export declare function detectSentencePatterns(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<SentencePatternResult>;
|
|
75
|
+
/**
|
|
76
|
+
* Filter page boundary sentences and join into text
|
|
77
|
+
*
|
|
78
|
+
* This is the main entry point for sentence-level header/footer filtering.
|
|
79
|
+
* It detects and removes repeating sentence patterns at page boundaries.
|
|
80
|
+
*
|
|
81
|
+
* Use this instead of joinFilteredPages when embedder is available.
|
|
82
|
+
*
|
|
83
|
+
* @param pages - Array of page data
|
|
84
|
+
* @param embedder - Embedder for generating embeddings
|
|
85
|
+
* @param config - Configuration options
|
|
86
|
+
* @returns Filtered text with header/footer sentences removed
|
|
87
|
+
*/
|
|
88
|
+
export declare function filterPageBoundarySentences(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<string>;
|
|
89
|
+
//# sourceMappingURL=pdf-filter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-filter.d.ts","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAA;AAIvE,YAAY,EAAE,iBAAiB,EAAE,CAAA;AAMjC;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,CAAC,EAAE,MAAM,CAAA;IACT,CAAC,EAAE,MAAM,CAAA;IACT,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,OAAO,CAAA;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,oBAAoB,EAAE,CAAA;CAC9B;AAoCD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,MAAM,CAK3D;AA6JD;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,iEAAiE;IACjE,mBAAmB,EAAE,MAAM,CAAA;IAC3B,gEAAgE;IAChE,QAAQ,EAAE,MAAM,CAAA;IAChB,+EAA+E;IAC/E,WAAW,EAAE,MAAM,CAAA;CACpB;AAED,iEAAiE;AACjE,eAAO,MAAM,+BAA+B,EAAE,qBAI7C,CAAA;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,qEAAqE;IACrE,mBAAmB,EAAE,OAAO,CAAA;IAC5B,oEAAoE;IACpE,kBAAkB,EAAE,OAAO,CAAA;IAC3B,2CAA2C;IAC3C,gBAAgB,EAAE,MAAM,CAAA;IACxB,0CAA0C;IAC1C,gBAAgB,EAAE,MAAM,CAAA;CACzB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,sBAAsB,CAC1C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,qBAAqB,CAAC,CAsEhC;AAED;;;;;;;;;;;;GAYG;AACH,wBAAsB,2BAA2B,CAC/C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,MAAM,CAAC,CAyCjB"}
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// PDF Header/Footer Filter
|
|
3
|
+
// - Detects and removes repeating patterns across pages
|
|
4
|
+
// - Semantic similarity-based header/footer detection (sentence-level)
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.DEFAULT_SENTENCE_PATTERN_CONFIG = void 0;
|
|
7
|
+
exports.joinFilteredPages = joinFilteredPages;
|
|
8
|
+
exports.detectSentencePatterns = detectSentencePatterns;
|
|
9
|
+
exports.filterPageBoundarySentences = filterPageBoundarySentences;
|
|
10
|
+
const sentence_splitter_js_1 = require("../chunker/sentence-splitter.js");
|
|
11
|
+
// ============================================
|
|
12
|
+
// Text Joining
|
|
13
|
+
// ============================================
|
|
14
|
+
/**
 * Rebuild the text of one page from its positioned items.
 *
 * Items whose Y coordinate rounds to the same integer form one line.
 * Lines are emitted from the largest Y to the smallest (top to bottom),
 * and within a line items are ordered by ascending X (left to right)
 * and separated by a single space.
 *
 * @param items - Text items with position
 * @returns Page text, one line per Y group, trimmed at both ends
 */
function joinPageItems(items) {
    // Bucket items by rounded Y so tiny vertical jitter stays on one line
    const lines = new Map();
    for (const entry of items) {
        const lineKey = Math.round(entry.y);
        if (lines.has(lineKey)) {
            lines.get(lineKey).push(entry);
        }
        else {
            lines.set(lineKey, [entry]);
        }
    }
    // Largest Y first: top of the page comes first
    const orderedKeys = [...lines.keys()].sort((upper, lower) => lower - upper);
    const renderedLines = [];
    for (const key of orderedKeys) {
        const row = lines.get(key);
        row.sort((left, right) => left.x - right.x);
        renderedLines.push(row.map((entry) => entry.text).join(' '));
    }
    return renderedLines.join('\n').trim();
}
|
|
40
|
+
/**
 * Concatenate the text of several pages.
 *
 * Each page is rendered with joinPageItems; pages that render to an
 * empty string are skipped and the rest are separated by a blank line.
 *
 * @param pages - Filtered page data
 * @returns Joined text with proper line breaks
 */
function joinFilteredPages(pages) {
    const pageTexts = [];
    for (const page of pages) {
        const rendered = joinPageItems(page.items);
        if (rendered.length > 0) {
            pageTexts.push(rendered);
        }
    }
    return pageTexts.join('\n\n');
}
|
|
52
|
+
/**
 * Split page items into sentences, each tagged with a Y coordinate.
 *
 * 1. Sort items into reading order and join them into one string
 *    (space between items on a line, newline between lines)
 * 2. Split that string into sentences using splitIntoSentences
 * 3. Tag each sentence with the rounded Y of the item it starts in
 * 4. Merge consecutive sentences that share a Y coordinate
 *
 * A sentence that cannot be located verbatim in the joined text (for
 * example because the splitter normalised its whitespace) is kept and
 * inherits the Y of the previous sentence instead of being dropped, so
 * no page content is lost.
 *
 * @param items - Text items with position
 * @returns Sentences with Y coordinate (merged by Y)
 */
function splitItemsIntoSentencesWithY(items) {
    if (items.length === 0)
        return [];
    // Reading order: Y descending (top to bottom); items whose Y differs by
    // at most 1 are treated as the same line and ordered by X ascending.
    const sortedItems = [...items].sort((a, b) => {
        const yDiff = b.y - a.y;
        return Math.abs(yDiff) > 1 ? yDiff : a.x - b.x;
    });
    // Build the page text while remembering where each item starts in it
    const charToItem = [];
    let fullText = '';
    let prevY = null;
    for (const item of sortedItems) {
        // Same line = space separator, different line = newline
        // (mirrors the joinPageItems output format)
        if (prevY !== null && Math.abs(prevY - item.y) > 1) {
            fullText = `${fullText.trimEnd()}\n`;
        }
        charToItem.push({ start: fullText.length, item });
        fullText += `${item.text} `;
        prevY = item.y;
    }
    const sentences = (0, sentence_splitter_js_1.splitIntoSentences)(fullText);
    const sentencesWithY = [];
    let searchStart = 0;
    // Y of the most recently located sentence; starts at the first item's line
    let lastY = Math.round(sortedItems[0].y);
    for (const sentence of sentences) {
        const needle = sentence.trim();
        const sentenceStart = fullText.indexOf(needle, searchStart);
        if (sentenceStart === -1) {
            // Not found verbatim: keep the text and attach it to the last known line
            sentencesWithY.push({ text: sentence, y: lastY });
            continue;
        }
        // Last item starting at or before the sentence start owns the sentence
        for (let i = charToItem.length - 1; i >= 0; i--) {
            const entry = charToItem[i];
            if (entry.start <= sentenceStart) {
                lastY = Math.round(entry.item.y);
                break;
            }
        }
        sentencesWithY.push({ text: sentence, y: lastY });
        // Advance by the length that was actually matched (the trimmed text),
        // otherwise surrounding whitespace pushes the cursor past the next sentence
        searchStart = sentenceStart + needle.length;
    }
    // Merge sentences with same Y coordinate
    return mergeSentencesByY(sentencesWithY);
}
|
|
112
|
+
/**
 * Collapse runs of consecutive sentences that share a Y coordinate.
 *
 * Only neighbours are compared: a sentence joins the group before it when
 * both have strictly equal Y, and its text is appended after a single space.
 * The input objects are copied, never modified.
 *
 * @param sentences - Sentences with Y coordinate
 * @returns Merged sentences (same Y = one sentence)
 */
function mergeSentencesByY(sentences) {
    const merged = [];
    for (const sentence of sentences) {
        const tail = merged[merged.length - 1];
        if (tail !== undefined && tail.y === sentence.y) {
            // Same line as the open group: extend it
            tail.text += ` ${sentence.text}`;
        }
        else {
            // New line: start a fresh group from a copy
            merged.push({ ...sentence });
        }
    }
    return merged;
}
|
|
142
|
+
// ============================================
|
|
143
|
+
// Sentence-Level Header/Footer Detection
|
|
144
|
+
// ============================================
|
|
145
|
+
// Use shared cosine similarity function
|
|
146
|
+
const math_js_1 = require("../utils/math.js");
|
|
147
|
+
/**
 * Median of the cosine similarities over every unordered pair of embeddings.
 *
 * The median (rather than the mean) keeps a few dissimilar pages, such as
 * a page whose header carries a new chapter title, from pulling the score down.
 *
 * Returns 1.0 when fewer than two embeddings are given and 0 when no pair
 * could be compared (missing entries are skipped).
 */
function medianPairwiseSimilarity(embeddings) {
    const count = embeddings.length;
    if (count < 2)
        return 1.0;
    const scores = [];
    for (let a = 0; a < count; a += 1) {
        const left = embeddings[a];
        if (!left)
            continue;
        for (let b = a + 1; b < count; b += 1) {
            const right = embeddings[b];
            if (!right)
                continue;
            scores.push((0, math_js_1.cosineSimilarity)(left, right));
        }
    }
    if (scores.length === 0)
        return 0;
    scores.sort((x, y) => x - y);
    const upper = Math.trunc(scores.length / 2);
    const isOdd = scores.length % 2 !== 0;
    // Odd count: the middle value; even count: mean of the two middle values
    return isOdd
        ? scores[upper] ?? 0
        : ((scores[upper - 1] ?? 0) + (scores[upper] ?? 0)) / 2;
}
|
|
179
|
+
/** Default configuration for sentence-level pattern detection */
|
|
180
|
+
exports.DEFAULT_SENTENCE_PATTERN_CONFIG = {
|
|
181
|
+
// A median pairwise similarity at or above this value marks a header/footer.
similarityThreshold: 0.85,
|
|
182
|
+
// Minimum page count, and minimum number of sampled boundary sentences, before detection runs.
minPages: 3,
|
|
183
|
+
// Size of the page window sampled around the middle of the document.
samplePages: 5,
|
|
184
|
+
};
|
|
185
|
+
/**
 * Detect header/footer patterns at sentence level
 *
 * Algorithm:
 * 1. Take a window of pages around the middle of the document; these are
 *    assumed to be content pages (cover, TOC, index sit at the edges)
 * 2. Split each sampled page into sentences with Y coordinate
 * 3. Collect the first sentence of every non-empty page, and the last
 *    sentence of every page that has more than one sentence
 * 4. Embed each collection and compute its median pairwise similarity
 * 5. If the median similarity >= threshold, mark as header/footer
 *
 * Each collection is only evaluated when it holds at least `minPages`
 * sentences. Semantic similarity lets variable text such as page numbers
 * ("7 of 75") still count as the same header or footer.
 *
 * @param pages - Array of page data
 * @param embedder - Embedder for generating embeddings
 * @param config - Configuration options
 * @returns Detection result
 */
async function detectSentencePatterns(pages, embedder, config = {}) {
    const { similarityThreshold, minPages, samplePages: sampleSize } = {
        ...exports.DEFAULT_SENTENCE_PATTERN_CONFIG,
        ...config,
    };
    const outcome = {
        removeFirstSentence: false,
        removeLastSentence: false,
        headerSimilarity: 0,
        footerSimilarity: 0,
    };
    // Too few pages to tell a repeating pattern from coincidence
    if (pages.length < minPages) {
        return outcome;
    }
    // Sampling window centred on the middle page
    const windowStart = Math.max(0, Math.floor(pages.length / 2) - Math.floor(sampleSize / 2));
    const windowEnd = Math.min(pages.length, windowStart + sampleSize);
    // Boundary sentences of the sampled pages
    const openers = [];
    const closers = [];
    for (const page of pages.slice(windowStart, windowEnd)) {
        const sentences = splitItemsIntoSentencesWithY(page.items);
        if (sentences.length === 0) {
            continue;
        }
        openers.push(sentences[0].text);
        // A single-sentence page contributes a first sentence only
        if (sentences.length > 1) {
            closers.push(sentences[sentences.length - 1].text);
        }
    }
    // Header: first sentences of the sampled pages resemble each other
    if (openers.length >= minPages) {
        const similarity = medianPairwiseSimilarity(await embedder.embedBatch(openers));
        outcome.headerSimilarity = similarity;
        if (similarity >= similarityThreshold) {
            outcome.removeFirstSentence = true;
            console.error(`Sentence header detected: sampled ${openers.length} center pages (${windowStart + 1}-${windowEnd}), median similarity: ${similarity.toFixed(3)}`);
        }
    }
    // Footer: last sentences of the sampled pages resemble each other
    if (closers.length >= minPages) {
        const similarity = medianPairwiseSimilarity(await embedder.embedBatch(closers));
        outcome.footerSimilarity = similarity;
        if (similarity >= similarityThreshold) {
            outcome.removeLastSentence = true;
            console.error(`Sentence footer detected: sampled ${closers.length} center pages (${windowStart + 1}-${windowEnd}), median similarity: ${similarity.toFixed(3)}`);
        }
    }
    return outcome;
}
|
|
260
|
+
/**
 * Filter page boundary sentences and join into text
 *
 * Main entry point for sentence-level header/footer filtering: runs
 * detectSentencePatterns and, when a header and/or footer pattern is
 * reported, drops the first and/or last sentence of every page before
 * joining. Falls back to joinFilteredPages when there are fewer than
 * `minPages` pages or no pattern is detected.
 *
 * In the filtered output, sentences of a page are joined with a single
 * space and pages with a blank line; pages left empty are omitted.
 *
 * NOTE(review): detection never counts the only sentence of a page as a
 * "last" sentence, but removal here drops it when only the footer flag is
 * set — confirm this asymmetry is intended.
 *
 * @param pages - Array of page data
 * @param embedder - Embedder for generating embeddings
 * @param config - Configuration options
 * @returns Filtered text with header/footer sentences removed
 */
async function filterPageBoundarySentences(pages, embedder, config = {}) {
    const settings = { ...exports.DEFAULT_SENTENCE_PATTERN_CONFIG, ...config };
    // Too few pages for pattern detection: plain join
    if (pages.length < settings.minPages) {
        return joinFilteredPages(pages);
    }
    const { removeFirstSentence, removeLastSentence } = await detectSentencePatterns(pages, embedder, settings);
    // Nothing to strip: plain join
    if (!(removeFirstSentence || removeLastSentence)) {
        return joinFilteredPages(pages);
    }
    const skipFromStart = removeFirstSentence ? 1 : 0;
    const pageTexts = [];
    for (const page of pages) {
        const sentences = splitItemsIntoSentencesWithY(page.items);
        // A negative end index trims the final sentence; slice clamps on short pages
        const sliceEnd = removeLastSentence ? -1 : sentences.length;
        const body = sentences
            .slice(skipFromStart, sliceEnd)
            .map((sentence) => sentence.text)
            .join(' ');
        if (body.length > 0) {
            pageTexts.push(body);
        }
    }
    return pageTexts.join('\n\n');
}
|
|
304
|
+
//# sourceMappingURL=pdf-filter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-filter.js","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":";AAAA,2BAA2B;AAC3B,wDAAwD;AACxD,uEAAuE;;;AAuEvE,8CAKC;AAmND,wDA0EC;AAeD,kEA6CC;AAlaD,0EAAoE;AA4BpE,+CAA+C;AAC/C,eAAe;AACf,+CAA+C;AAE/C;;;;;;GAMG;AACH,SAAS,aAAa,CAAC,KAA6B;IAClD,6DAA6D;IAC7D,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkC,CAAA;IACzD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QAClC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAChB,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAA;IACvB,CAAC;IAED,oFAAoF;IACpF,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAClB,KAAK;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SAClB,IAAI,CAAC,GAAG,CAAC,CACb;SACA,IAAI,CAAC,IAAI,CAAC;SACV,IAAI,EAAE,CAAA;AACX,CAAC;AAED;;;;;GAKG;AACH,SAAgB,iBAAiB,CAAC,KAAiB;IACjD,OAAO,KAAK;SACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;SACxC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;SACjC,IAAI,CAAC,MAAM,CAAC,CAAA;AACjB,CAAC;AAcD;;;;;;;;;;GAUG;AACH,SAAS,4BAA4B,CAAC,KAA6B;IACjE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAEjC,+DAA+D;IAC/D,MAAM,WAAW,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC3C,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QACvB,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAA;QACrC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;IAClB,CAAC,CAAC,CAAA;IAEF,2DAA2D;IAC3D,MAAM,UAAU,GAAyD,EAAE,CAAA;IAC3E,IAAI,QAAQ,GAAG,EAAE,CAAA;IACjB,IAAI,KAAK,GAAkB,IAAI,CAAA;IAE/B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC/B,4DAA4D;QAC5D,6EAA6E;QAC7E,IAAI,KAAK,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,CA
AC,EAAE,CAAC;YACnD,QAAQ,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAA;QACtC,CAAC;QAED,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAA;QACjD,QAAQ,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,CAAA;QAC3B,KAAK,GAAG,IAAI,CAAC,CAAC,CAAA;IAChB,CAAC;IAED,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,QAAQ,CAAC,CAAA;IAE9C,kEAAkE;IAClE,MAAM,cAAc,GAAoB,EAAE,CAAA;IAC1C,IAAI,WAAW,GAAG,CAAC,CAAA;IAEnB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,8CAA8C;QAC9C,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACpE,IAAI,aAAa,KAAK,CAAC,CAAC;YAAE,SAAQ;QAElC,4CAA4C;QAC5C,IAAI,UAAU,GAAG,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC3B,IAAI,KAAK,IAAI,KAAK,CAAC,KAAK,IAAI,aAAa,EAAE,CAAC;gBAC1C,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;gBACrC,MAAK;YACP,CAAC;QACH,CAAC;QAED,cAAc,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAA;QACtD,WAAW,GAAG,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAA;IAC/C,CAAC;IAED,yCAAyC;IACzC,OAAO,iBAAiB,CAAC,cAAc,CAAC,CAAA;AAC1C,CAAC;AAED;;;;;GAKG;AACH,SAAS,iBAAiB,CAAC,SAA0B;IACnD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAErC,MAAM,MAAM,GAAoB,EAAE,CAAA;IAClC,IAAI,OAAO,GAAyB,IAAI,CAAA;IAExC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACrB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;aAAM,IAAI,OAAO,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,EAAE,CAAC;YACpC,qBAAqB;YACrB,OAAO,CAAC,IAAI,IAAI,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAA;QACrC,CAAC;aAAM,CAAC;YACN,0CAA0C;YAC1C,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACpB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;QACrB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACtB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,yCAAyC;AACzC,+CAA+C;AAE/C,wCAAwC;AACxC,8CAAmD;AAEnD;;;;;;GAMG;AACH,SAAS,wBAAwB,CAAC,UAAsB;IACtD,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,GAAG,CAAA;IAErC,MAAM,YAAY,GAAa,EAA
E,CAAA;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,IAAI,IAAI,IAAI,IAAI,EAAE,CAAC;gBACjB,YAAY,CAAC,IAAI,CAAC,IAAA,0BAAgB,EAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;YACjD,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAEvC,uBAAuB;IACvB,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAE/C,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QAClC,qCAAqC;QACrC,OAAO,CAAC,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACtE,CAAC;IACD,oBAAoB;IACpB,OAAO,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;AAC/B,CAAC;AAcD,iEAAiE;AACpD,QAAA,+BAA+B,GAA0B;IACpE,mBAAmB,EAAE,IAAI;IACzB,QAAQ,EAAE,CAAC;IACX,WAAW,EAAE,CAAC;CACf,CAAA;AAgBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACI,KAAK,UAAU,sBAAsB,CAC1C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,uCAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,MAAM,MAAM,GAA0B;QACpC,mBAAmB,EAAE,KAAK;QAC1B,kBAAkB,EAAE,KAAK;QACzB,gBAAgB,EAAE,CAAC;QACnB,gBAAgB,EAAE,CAAC;KACpB,CAAA;IAED,iDAAiD;IACjD,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,MAAM,CAAA;IACf,CAAC;IAED,kDAAkD;IAClD,uEAAuE;IACvE,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAChD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,CAAA;IAClD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC,CAAA;IACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,GAAG,GAAG,CAAC,WAAW,CAAC,CAAA;IACrE,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAA;IAErD,oEAAoE;IACpE,MAAM,aAAa,GAAsB,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAChE,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,yDAAyD;IACzD,MAAM,cAAc,GAAa,EAAE,
CAAA;IACnC,MAAM,aAAa,GAAa,EAAE,CAAA;IAElC,KAAK,MAAM,SAAS,IAAI,aAAa,EAAE,CAAC;QACtC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YACvC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,aAAa,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC3D,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,IAAI,cAAc,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,cAAc,CAAC,CAAA;QAC5D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,mBAAmB,GAAG,IAAI,CAAA;YACjC,OAAO,CAAC,KAAK,CACX,qCAAqC,cAAc,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACtJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,6EAA6E;IAC7E,IAAI,aAAa,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;QAC3D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,kBAAkB,GAAG,IAAI,CAAA;YAChC,OAAO,CAAC,KAAK,CACX,qCAAqC,aAAa,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACrJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;;;;;;GAYG;AACI,KAAK,UAAU,2BAA2B,CAC/C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,uCAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,iBAAiB,CAAC,KAAK,CAAC,CAAA;IACjC,CAAC;IAED,kBAAkB;IAClB,MAAM,QAAQ,GAAG,MAAM,sBAAsB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAA;IAEnE,uDAAuD;IACvD,IAAI,CAAC,QAAQ,CAAC,mBAAmB,IAAI,CAAC,QAAQ,CAAC,kBAAkB,EAAE,CAAC;QAClE,OAAO,iBAAiB,CAAC,KAAK,CAAC,CAAA;IACjC,CAAC;IAED,iEAAiE;IACjE,MAAM,aAAa,GAAsB,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC1D,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,+CAA+C;IAC/C,MAAM,oBAAoB,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;QAC3D,IAAI,OAAO,GAAG,CAAC,GAA
G,SAAS,CAAC,CAAA;QAE5B,IAAI,QAAQ,CAAC,mBAAmB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;QAC5B,CAAC;QAED,IAAI,QAAQ,CAAC,kBAAkB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAChC,CAAC;QAED,OAAO,OAAO,CAAA;IAChB,CAAC,CAAC,CAAA;IAEF,4BAA4B;IAC5B,OAAO,oBAAoB;SACxB,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;SAC1D,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;SACjC,IAAI,CAAC,MAAM,CAAC,CAAA;AACjB,CAAC"}
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import { type GroupingMode } from '../vectordb/index.js';
|
|
2
|
+
import { type DeleteFileInput, type IngestDataInput, type IngestFileInput, type QueryDocumentsInput } from './schemas.js';
|
|
3
|
+
/**
|
|
4
|
+
* RAGServer configuration
|
|
5
|
+
*/
|
|
6
|
+
export interface RAGServerConfig {
|
|
7
|
+
/** LanceDB database path */
|
|
8
|
+
dbPath: string;
|
|
9
|
+
/** Transformers.js model path */
|
|
10
|
+
modelName: string;
|
|
11
|
+
/** Model cache directory */
|
|
12
|
+
cacheDir: string;
|
|
13
|
+
/** Document base directory */
|
|
14
|
+
baseDir: string;
|
|
15
|
+
/** Maximum file size (100MB) */
|
|
16
|
+
maxFileSize: number;
|
|
17
|
+
/** Maximum distance threshold for quality filtering (optional) */
|
|
18
|
+
maxDistance?: number;
|
|
19
|
+
/** Grouping mode for quality filtering (optional) */
|
|
20
|
+
grouping?: GroupingMode;
|
|
21
|
+
/** Hybrid search weight for BM25 (0.0 = vector only, 1.0 = BM25 only, default 0.6) */
|
|
22
|
+
hybridWeight?: number;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* RAG server compliant with MCP Protocol
|
|
26
|
+
*
|
|
27
|
+
* Responsibilities:
|
|
28
|
+
* - MCP tool integration (6 tools)
|
|
29
|
+
* - Tool handler implementation with Zod validation
|
|
30
|
+
* - Error handling
|
|
31
|
+
* - Initialization (LanceDB, Transformers.js)
|
|
32
|
+
*/
|
|
33
|
+
export declare class RAGServer {
|
|
34
|
+
private readonly server;
|
|
35
|
+
private readonly vectorStore;
|
|
36
|
+
private readonly embedder;
|
|
37
|
+
private readonly chunker;
|
|
38
|
+
private readonly parser;
|
|
39
|
+
private readonly dbPath;
|
|
40
|
+
constructor(config: RAGServerConfig);
|
|
41
|
+
/**
|
|
42
|
+
* Set up MCP handlers using tool() API
|
|
43
|
+
* Note: Type casts are used to work around Zod version compatibility between project and SDK
|
|
44
|
+
*/
|
|
45
|
+
private setupHandlers;
|
|
46
|
+
/**
|
|
47
|
+
* Initialization
|
|
48
|
+
*/
|
|
49
|
+
initialize(): Promise<void>;
|
|
50
|
+
/**
|
|
51
|
+
* Close the server and release resources
|
|
52
|
+
*/
|
|
53
|
+
close(): Promise<void>;
|
|
54
|
+
/**
|
|
55
|
+
* Get the current database configuration
|
|
56
|
+
*/
|
|
57
|
+
getConfig(): {
|
|
58
|
+
dbPath: string;
|
|
59
|
+
modelName: string;
|
|
60
|
+
};
|
|
61
|
+
/**
|
|
62
|
+
* Execute query_documents logic (returns plain data)
|
|
63
|
+
*/
|
|
64
|
+
private executeQueryDocuments;
|
|
65
|
+
/**
|
|
66
|
+
* query_documents tool handler (for test compatibility)
|
|
67
|
+
*/
|
|
68
|
+
handleQueryDocuments(args: QueryDocumentsInput): Promise<{
|
|
69
|
+
content: [{
|
|
70
|
+
type: 'text';
|
|
71
|
+
text: string;
|
|
72
|
+
}];
|
|
73
|
+
}>;
|
|
74
|
+
/**
|
|
75
|
+
* Execute ingest_file logic (returns plain data)
|
|
76
|
+
*/
|
|
77
|
+
private executeIngestFile;
|
|
78
|
+
/**
|
|
79
|
+
* ingest_file tool handler (for test compatibility)
|
|
80
|
+
*/
|
|
81
|
+
handleIngestFile(args: IngestFileInput): Promise<{
|
|
82
|
+
content: [{
|
|
83
|
+
type: 'text';
|
|
84
|
+
text: string;
|
|
85
|
+
}];
|
|
86
|
+
}>;
|
|
87
|
+
/**
|
|
88
|
+
* Execute ingest_data logic (returns plain data)
|
|
89
|
+
*/
|
|
90
|
+
private executeIngestData;
|
|
91
|
+
/**
|
|
92
|
+
* ingest_data tool handler (for test compatibility)
|
|
93
|
+
*/
|
|
94
|
+
handleIngestData(args: IngestDataInput): Promise<{
|
|
95
|
+
content: [{
|
|
96
|
+
type: 'text';
|
|
97
|
+
text: string;
|
|
98
|
+
}];
|
|
99
|
+
}>;
|
|
100
|
+
/**
|
|
101
|
+
* Execute list_files logic (returns plain data)
|
|
102
|
+
*/
|
|
103
|
+
private executeListFiles;
|
|
104
|
+
/**
|
|
105
|
+
* list_files tool handler (for test compatibility)
|
|
106
|
+
*/
|
|
107
|
+
handleListFiles(): Promise<{
|
|
108
|
+
content: [{
|
|
109
|
+
type: 'text';
|
|
110
|
+
text: string;
|
|
111
|
+
}];
|
|
112
|
+
}>;
|
|
113
|
+
/**
|
|
114
|
+
* Execute status logic (returns plain data)
|
|
115
|
+
*/
|
|
116
|
+
private executeStatus;
|
|
117
|
+
/**
|
|
118
|
+
* status tool handler (for test compatibility)
|
|
119
|
+
*/
|
|
120
|
+
handleStatus(): Promise<{
|
|
121
|
+
content: [{
|
|
122
|
+
type: 'text';
|
|
123
|
+
text: string;
|
|
124
|
+
}];
|
|
125
|
+
}>;
|
|
126
|
+
/**
|
|
127
|
+
* Execute delete_file logic (returns plain data)
|
|
128
|
+
*/
|
|
129
|
+
private executeDeleteFile;
|
|
130
|
+
/**
|
|
131
|
+
* delete_file tool handler (for test compatibility)
|
|
132
|
+
*/
|
|
133
|
+
handleDeleteFile(args: DeleteFileInput): Promise<{
|
|
134
|
+
content: [{
|
|
135
|
+
type: 'text';
|
|
136
|
+
text: string;
|
|
137
|
+
}];
|
|
138
|
+
}>;
|
|
139
|
+
/**
|
|
140
|
+
* Start the server
|
|
141
|
+
*/
|
|
142
|
+
run(): Promise<void>;
|
|
143
|
+
}
|
|
144
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/server/index.ts"],"names":[],"mappings":"AAYA,OAAO,EAAE,KAAK,YAAY,EAAiC,MAAM,sBAAsB,CAAA;AAQvF,OAAO,EACL,KAAK,eAAe,EAIpB,KAAK,eAAe,EACpB,KAAK,eAAe,EAEpB,KAAK,mBAAmB,EAGzB,MAAM,cAAc,CAAA;AAMrB;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAA;IACd,iCAAiC;IACjC,SAAS,EAAE,MAAM,CAAA;IACjB,4BAA4B;IAC5B,QAAQ,EAAE,MAAM,CAAA;IAChB,8BAA8B;IAC9B,OAAO,EAAE,MAAM,CAAA;IACf,gCAAgC;IAChC,WAAW,EAAE,MAAM,CAAA;IACnB,kEAAkE;IAClE,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,qDAAqD;IACrD,QAAQ,CAAC,EAAE,YAAY,CAAA;IACvB,sFAAsF;IACtF,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAMD;;;;;;;;GAQG;AACH,qBAAa,SAAS;IACpB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAW;IAClC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAa;IACzC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAU;IACnC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAiB;IACzC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAgB;IACvC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAQ;gBAEnB,MAAM,EAAE,eAAe;IAqCnC;;;OAGG;IACH,OAAO,CAAC,aAAa;IAuGrB;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAKjC;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAK5B;;OAEG;IACH,SAAS,IAAI;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE;IAOlD;;OAEG;YACW,qBAAqB;IA4BnC;;OAEG;IACG,oBAAoB,CACxB,IAAI,EAAE,mBAAmB,GACxB,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAiBzD;;OAEG;YACW,iBAAiB;IA6G/B;;OAEG;IACG,gBAAgB,CACpB,IAAI,EAAE,eAAe,GACpB,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAkBzD;;OAEG;YACW,iBAAiB;IA6C/B;;OAEG;IACG,gBAAgB,CACpB,IAAI,EAAE,eAAe,GACpB,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAkBzD;;OAEG;YACW,gBAAgB;IAe9B;;OAEG;IACG,eAAe,IAAI,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAiB/E;;OAEG;YACW,aAAa;IAI3B;;OAEG;IACG,YAAY,IAAI,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAiB5E;;OAEG;YACW,iBAAiB;IAyC/B;;OAEG
;IACG,gBAAgB,CACpB,IAAI,EAAE,eAAe,GACpB,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAkBzD;;OAEG;IACG,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC;CAK3B"}
|