mcp-local-rag 0.5.6 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/parser/html-parser.d.ts +6 -2
- package/dist/parser/html-parser.d.ts.map +1 -1
- package/dist/parser/html-parser.js +19 -10
- package/dist/parser/html-parser.js.map +1 -1
- package/dist/parser/index.d.ts +18 -7
- package/dist/parser/index.d.ts.map +1 -1
- package/dist/parser/index.js +52 -13
- package/dist/parser/index.js.map +1 -1
- package/dist/parser/pdf-filter.d.ts +4 -3
- package/dist/parser/pdf-filter.d.ts.map +1 -1
- package/dist/parser/pdf-filter.js +8 -10
- package/dist/parser/pdf-filter.js.map +1 -1
- package/dist/parser/title-extractor.d.ts +64 -0
- package/dist/parser/title-extractor.d.ts.map +1 -0
- package/dist/parser/title-extractor.js +139 -0
- package/dist/parser/title-extractor.js.map +1 -0
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +45 -6
- package/dist/server/index.js.map +1 -1
- package/dist/server/raw-data-utils.d.ts +32 -0
- package/dist/server/raw-data-utils.d.ts.map +1 -1
- package/dist/server/raw-data-utils.js +46 -0
- package/dist/server/raw-data-utils.js.map +1 -1
- package/dist/server/types.d.ts +6 -0
- package/dist/server/types.d.ts.map +1 -1
- package/dist/server-main.d.ts.map +1 -1
- package/dist/server-main.js +17 -0
- package/dist/server-main.js.map +1 -1
- package/dist/vectordb/index.d.ts +9 -94
- package/dist/vectordb/index.d.ts.map +1 -1
- package/dist/vectordb/index.js +55 -185
- package/dist/vectordb/index.js.map +1 -1
- package/dist/vectordb/search-filters.d.ts +45 -0
- package/dist/vectordb/search-filters.d.ts.map +1 -0
- package/dist/vectordb/search-filters.js +142 -0
- package/dist/vectordb/search-filters.js.map +1 -0
- package/dist/vectordb/types.d.ts +112 -0
- package/dist/vectordb/types.d.ts.map +1 -0
- package/dist/vectordb/types.js +74 -0
- package/dist/vectordb/types.js.map +1 -0
- package/package.json +1 -1
- package/skills/mcp-local-rag/SKILL.md +10 -0
- package/skills/mcp-local-rag/references/html-ingestion.md +2 -1
- package/skills/mcp-local-rag/references/result-refinement.md +1 -0
package/README.md
CHANGED
|
@@ -129,7 +129,7 @@ HTML is automatically cleaned—you get the article content, not the boilerplate
|
|
|
129
129
|
|
|
130
130
|
Search uses semantic similarity with keyword boost. This means `useEffect` finds documents containing that exact term, not just semantically similar React concepts.
|
|
131
131
|
|
|
132
|
-
Results include text content, source file, and relevance score. Adjust result count with `limit` (1-20, default 10).
|
|
132
|
+
Results include text content, source file, document title, and relevance score. The document title provides context for each chunk, helping identify which document a result belongs to. Adjust result count with `limit` (1-20, default 10).
|
|
133
133
|
|
|
134
134
|
### Managing Files
|
|
135
135
|
|
|
@@ -148,6 +148,7 @@ Adjust these for your use case:
|
|
|
148
148
|
| `RAG_HYBRID_WEIGHT` | `0.6` | Keyword boost factor. 0 = semantic only, higher = stronger keyword boost. |
|
|
149
149
|
| `RAG_GROUPING` | (not set) | `similar` for top group only, `related` for top 2 groups. |
|
|
150
150
|
| `RAG_MAX_DISTANCE` | (not set) | Filter out low-relevance results (e.g., `0.5`). |
|
|
151
|
+
| `RAG_MAX_FILES` | (not set) | Limit results to top N files (e.g., `1` for single best file). |
|
|
151
152
|
|
|
152
153
|
### Code-focused tuning
|
|
153
154
|
|
|
@@ -5,10 +5,14 @@
|
|
|
5
5
|
* 1. HTML string → JSDOM (DOM creation)
|
|
6
6
|
* 2. JSDOM → Readability (main content extraction, noise removal)
|
|
7
7
|
* 3. Readability result → Turndown (Markdown conversion)
|
|
8
|
+
* 4. Title extracted separately via extractHtmlTitle (NOT prepended to content)
|
|
8
9
|
*
|
|
9
10
|
* @param html - Raw HTML string
|
|
10
11
|
* @param url - Source URL (used for resolving relative links)
|
|
11
|
-
* @returns
|
|
12
|
+
* @returns Object with content (markdown) and title (extracted separately)
|
|
12
13
|
*/
|
|
13
|
-
export declare function parseHtml(html: string, url: string): Promise<
|
|
14
|
+
export declare function parseHtml(html: string, url: string): Promise<{
|
|
15
|
+
content: string;
|
|
16
|
+
title: string;
|
|
17
|
+
}>;
|
|
14
18
|
//# sourceMappingURL=html-parser.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":"AAuDA;;;;;;;;;;;;GAYG;AACH,wBAAsB,SAAS,CAC7B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CA0D7C"}
|
|
@@ -9,6 +9,7 @@ exports.parseHtml = parseHtml;
|
|
|
9
9
|
const readability_1 = require("@mozilla/readability");
|
|
10
10
|
const jsdom_1 = require("jsdom");
|
|
11
11
|
const turndown_1 = __importDefault(require("turndown"));
|
|
12
|
+
const title_extractor_js_1 = require("./title-extractor.js");
|
|
12
13
|
// ============================================
|
|
13
14
|
// Turndown Service Configuration
|
|
14
15
|
// ============================================
|
|
@@ -46,15 +47,16 @@ function createTurndownService() {
|
|
|
46
47
|
* 1. HTML string → JSDOM (DOM creation)
|
|
47
48
|
* 2. JSDOM → Readability (main content extraction, noise removal)
|
|
48
49
|
* 3. Readability result → Turndown (Markdown conversion)
|
|
50
|
+
* 4. Title extracted separately via extractHtmlTitle (NOT prepended to content)
|
|
49
51
|
*
|
|
50
52
|
* @param html - Raw HTML string
|
|
51
53
|
* @param url - Source URL (used for resolving relative links)
|
|
52
|
-
* @returns
|
|
54
|
+
* @returns Object with content (markdown) and title (extracted separately)
|
|
53
55
|
*/
|
|
54
56
|
async function parseHtml(html, url) {
|
|
55
57
|
// Handle empty or whitespace-only HTML
|
|
56
58
|
if (!html || html.trim().length === 0) {
|
|
57
|
-
return '';
|
|
59
|
+
return { content: '', title: '' };
|
|
58
60
|
}
|
|
59
61
|
try {
|
|
60
62
|
// Create DOM from HTML string
|
|
@@ -75,25 +77,32 @@ async function parseHtml(html, url) {
|
|
|
75
77
|
// Try to get body content directly
|
|
76
78
|
const bodyContent = document.body?.innerHTML || '';
|
|
77
79
|
if (!bodyContent.trim()) {
|
|
78
|
-
return '';
|
|
80
|
+
return { content: '', title: '' };
|
|
79
81
|
}
|
|
80
82
|
// Convert raw body HTML to Markdown
|
|
81
83
|
const turndownService = createTurndownService();
|
|
82
|
-
return turndownService.turndown(bodyContent).trim();
|
|
84
|
+
return { content: turndownService.turndown(bodyContent).trim(), title: '' };
|
|
83
85
|
}
|
|
84
86
|
// Convert extracted HTML content to Markdown
|
|
85
87
|
const turndownService = createTurndownService();
|
|
86
88
|
const markdown = turndownService.turndown(article.content);
|
|
87
|
-
//
|
|
88
|
-
|
|
89
|
-
|
|
89
|
+
// Extract title separately (NOT prepended to markdown content)
|
|
90
|
+
// Use URL-derived filename as fallback when Readability has no title
|
|
91
|
+
let urlFileName = '';
|
|
92
|
+
try {
|
|
93
|
+
urlFileName = new URL(url).pathname.split('/').filter(Boolean).pop() || '';
|
|
90
94
|
}
|
|
91
|
-
|
|
95
|
+
catch {
|
|
96
|
+
// Non-URL string, empty fallback
|
|
97
|
+
}
|
|
98
|
+
const titleResult = (0, title_extractor_js_1.extractHtmlTitle)(article.title || '', urlFileName);
|
|
99
|
+
const title = titleResult.title;
|
|
100
|
+
return { content: markdown.trim(), title };
|
|
92
101
|
}
|
|
93
102
|
catch (error) {
|
|
94
|
-
// Log error but don't throw - return empty
|
|
103
|
+
// Log error but don't throw - return empty values for graceful degradation
|
|
95
104
|
console.error('Failed to parse HTML:', error);
|
|
96
|
-
return '';
|
|
105
|
+
return { content: '', title: '' };
|
|
97
106
|
}
|
|
98
107
|
}
|
|
99
108
|
//# sourceMappingURL=html-parser.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":";AAAA,6CAA6C;AAC7C,2DAA2D;;;;;
|
|
1
|
+
{"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":";AAAA,6CAA6C;AAC7C,2DAA2D;;;;;AAmE3D,8BA6DC;AA9HD,sDAAkD;AAClD,iCAA6B;AAC7B,wDAAsC;AACtC,6DAAuD;AAcvD,+CAA+C;AAC/C,iCAAiC;AACjC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,qBAAqB;IAC5B,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK,EAAE,uBAAuB;QAC5C,cAAc,EAAE,QAAQ,EAAE,0BAA0B;QACpD,gBAAgB,EAAE,GAAG,EAAE,yBAAyB;QAChD,WAAW,EAAE,GAAG,EAAE,qBAAqB;QACvC,eAAe,EAAE,IAAI,EAAE,kBAAkB;KAC1C,CAAC,CAAA;IAEF,0BAA0B;IAC1B,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE;QACpC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;YAC9B,MAAM,OAAO,GAAG,IAAe,CAAA;YAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAA;YACjD,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAA;YACxE,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,IAAI,EAAE,CAAA;YACvE,OAAO,WAAW,QAAQ,KAAK,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,CAAA;QAC/D,CAAC;KACF,CAAC,CAAA;IAEF,OAAO,eAAe,CAAA;AACxB,CAAC;AAED,+CAA+C;AAC/C,cAAc;AACd,+CAA+C;AAE/C;;;;;;;;;;;;GAYG;AACI,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,GAAW;IAEX,uCAAuC;IACvC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;IACnC,CAAC;IAED,IAAI,CAAC;QACH,8BAA8B;QAC9B,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE;YAC1B,GAAG;YACH,yCAAyC;YACzC,UAAU,EAAE,cAAc;SAC3B,CAAC,CAAA;QAEF,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAA;QAEpC,0CAA0C;QAC1C,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,QAAQ,EAAE;YACvC,WAAW,EAAE,KAAK;YAClB,KAAK,EAAE,KAAK;SACb,CAAC,CAAA;QAEF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAA8B,CAAA;QAE1D,kEAAkE;QAClE,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YACjC,mCAAmC;YACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,IAAI,EAAE,CAAA;YAClD,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;gBACxB,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;YACnC,CAAC;YAED,oCAAoC;YACpC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;YAC/C,OAAO,EAAE,OAAO,EAAE,eAAe,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QAC7E,CAAC;QAED,6CAA6C;QAC7C,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;QAC/C,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;QAE1D,+DAA+D;QAC/D,qEAAqE;QACrE,IAAI,WAAW,GAAG,EAAE,CAAA;QACpB,IAAI,CAAC;YACH,WAAW,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAA;QAC5E,CAAC;QAAC,MAAM,CAAC;YACP,iCAAiC;QACnC,CAAC;QACD,MAAM,WAAW,GAAG,IAAA,qCAAgB,EAAC,OAAO,CAAC,KAAK,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACtE,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,CAAA;QAE/B,OAAO,EAAE,OAAO,EAAE,QAAQ,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAA;IAC5C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,2EAA2E;QAC3E,OAAO,CAAC,KAAK,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;IACnC,CAAC;AACH,CAAC"}
|
package/dist/parser/index.d.ts
CHANGED
|
@@ -1,4 +1,12 @@
|
|
|
1
1
|
import { type EmbedderInterface } from './pdf-filter.js';
|
|
2
|
+
/**
|
|
3
|
+
* Result from parsing a document, containing both content and extracted title.
|
|
4
|
+
* Title is display-only metadata (NOT used for search scoring).
|
|
5
|
+
*/
|
|
6
|
+
export interface ParseResult {
|
|
7
|
+
content: string;
|
|
8
|
+
title: string;
|
|
9
|
+
}
|
|
2
10
|
/**
|
|
3
11
|
* DocumentParser configuration
|
|
4
12
|
*/
|
|
@@ -54,11 +62,11 @@ export declare class DocumentParser {
|
|
|
54
62
|
* File parsing (auto format detection)
|
|
55
63
|
*
|
|
56
64
|
* @param filePath - File path to parse
|
|
57
|
-
* @returns
|
|
65
|
+
* @returns ParseResult with content and extracted title
|
|
58
66
|
* @throws ValidationError - Path traversal, size exceeded, unsupported format
|
|
59
67
|
* @throws FileOperationError - File read failed, parse failed
|
|
60
68
|
*/
|
|
61
|
-
parseFile(filePath: string): Promise<
|
|
69
|
+
parseFile(filePath: string): Promise<ParseResult>;
|
|
62
70
|
/**
|
|
63
71
|
* PDF parsing with header/footer filtering
|
|
64
72
|
*
|
|
@@ -66,18 +74,21 @@ export declare class DocumentParser {
|
|
|
66
74
|
* - Extracts text with position information (x, y, fontSize)
|
|
67
75
|
* - Semantic header/footer detection using embedding similarity
|
|
68
76
|
* - Uses hasEOL for proper line break handling
|
|
77
|
+
* - Extracts document title from PDF metadata and first page font heuristic
|
|
69
78
|
*
|
|
70
79
|
* @param filePath - PDF file path
|
|
71
80
|
* @param embedder - Embedder for semantic header/footer detection
|
|
72
|
-
* @returns
|
|
81
|
+
* @returns ParseResult with content and extracted title
|
|
73
82
|
* @throws FileOperationError - File read failed, parse failed
|
|
74
83
|
*/
|
|
75
|
-
parsePdf(filePath: string, embedder: EmbedderInterface): Promise<
|
|
84
|
+
parsePdf(filePath: string, embedder: EmbedderInterface): Promise<ParseResult>;
|
|
76
85
|
/**
|
|
77
86
|
* DOCX parsing (using mammoth)
|
|
78
87
|
*
|
|
88
|
+
* Uses extractRawText for content and convertToHtml additionally for title detection.
|
|
89
|
+
*
|
|
79
90
|
* @param filePath - DOCX file path
|
|
80
|
-
* @returns
|
|
91
|
+
* @returns ParseResult with content and extracted title
|
|
81
92
|
* @throws FileOperationError - File read failed, parse failed
|
|
82
93
|
*/
|
|
83
94
|
private parseDocx;
|
|
@@ -85,7 +96,7 @@ export declare class DocumentParser {
|
|
|
85
96
|
* TXT parsing (using fs.readFile)
|
|
86
97
|
*
|
|
87
98
|
* @param filePath - TXT file path
|
|
88
|
-
* @returns
|
|
99
|
+
* @returns ParseResult with content and extracted title
|
|
89
100
|
* @throws FileOperationError - File read failed
|
|
90
101
|
*/
|
|
91
102
|
private parseTxt;
|
|
@@ -93,7 +104,7 @@ export declare class DocumentParser {
|
|
|
93
104
|
* MD parsing (using fs.readFile)
|
|
94
105
|
*
|
|
95
106
|
* @param filePath - MD file path
|
|
96
|
-
* @returns
|
|
107
|
+
* @returns ParseResult with content and extracted title
|
|
97
108
|
* @throws FileOperationError - File read failed
|
|
98
109
|
*/
|
|
99
110
|
private parseMd;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,KAAK,iBAAiB,EAA8C,MAAM,iBAAiB,CAAA;AAYpG;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;GAEG;AACH,UAAU,YAAY;IACpB,uCAAuC;IACvC,OAAO,EAAE,MAAM,CAAA;IACf,gCAAgC;IAChC,WAAW,EAAE,MAAM,CAAA;CACpB;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,KAAK;aAGb,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAED;;GAEG;AACH,qBAAa,kBAAmB,SAAQ,KAAK;aAGhB,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAMD;;;;;;;GAOG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,6FAA6F;IAC7F,OAAO,CAAC,eAAe,CAAsB;gBAEjC,MAAM,EAAE,YAAY;IAIhC;;;;;OAKG;IACG,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAiDvD;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI;IAgBxC;;;;;;;OAOG;IACG,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAmBvD;;;;;;;;;;;;;OAaG;IACG,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;IAqEnF;;;;;;;;OAQG;YACW,SAAS;IAqBvB;;;;;;OAMG;YACW,QAAQ;IAYtB;;;;;;OAMG;YACW,OAAO;CAWtB"}
|
package/dist/parser/index.js
CHANGED
|
@@ -10,7 +10,9 @@ const promises_1 = require("node:fs/promises");
|
|
|
10
10
|
const node_path_1 = require("node:path");
|
|
11
11
|
const mammoth_1 = __importDefault(require("mammoth"));
|
|
12
12
|
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
13
|
+
const index_js_1 = require("../chunker/index.js");
|
|
13
14
|
const pdf_filter_js_1 = require("./pdf-filter.js");
|
|
15
|
+
const title_extractor_js_1 = require("./title-extractor.js");
|
|
14
16
|
/**
|
|
15
17
|
* Validation error (equivalent to 400)
|
|
16
18
|
*/
|
|
@@ -118,7 +120,7 @@ class DocumentParser {
|
|
|
118
120
|
* File parsing (auto format detection)
|
|
119
121
|
*
|
|
120
122
|
* @param filePath - File path to parse
|
|
121
|
-
* @returns
|
|
123
|
+
* @returns ParseResult with content and extracted title
|
|
122
124
|
* @throws ValidationError - Path traversal, size exceeded, unsupported format
|
|
123
125
|
* @throws FileOperationError - File read failed, parse failed
|
|
124
126
|
*/
|
|
@@ -146,10 +148,11 @@ class DocumentParser {
|
|
|
146
148
|
* - Extracts text with position information (x, y, fontSize)
|
|
147
149
|
* - Semantic header/footer detection using embedding similarity
|
|
148
150
|
* - Uses hasEOL for proper line break handling
|
|
151
|
+
* - Extracts document title from PDF metadata and first page font heuristic
|
|
149
152
|
*
|
|
150
153
|
* @param filePath - PDF file path
|
|
151
154
|
* @param embedder - Embedder for semantic header/footer detection
|
|
152
|
-
* @returns
|
|
155
|
+
* @returns ParseResult with content and extracted title
|
|
153
156
|
* @throws FileOperationError - File read failed, parse failed
|
|
154
157
|
*/
|
|
155
158
|
async parsePdf(filePath, embedder) {
|
|
@@ -163,6 +166,9 @@ class DocumentParser {
|
|
|
163
166
|
useSystemFonts: true,
|
|
164
167
|
isEvalSupported: false,
|
|
165
168
|
}).promise;
|
|
169
|
+
// Extract metadata for title extraction
|
|
170
|
+
const metadata = await pdf.getMetadata();
|
|
171
|
+
const metadataTitle = metadata?.info?.['Title'];
|
|
166
172
|
// Extract text with position information from each page
|
|
167
173
|
const pages = [];
|
|
168
174
|
for (let i = 1; i <= pdf.numPages; i++) {
|
|
@@ -179,11 +185,30 @@ class DocumentParser {
|
|
|
179
185
|
}));
|
|
180
186
|
pages.push({ pageNum: i, items });
|
|
181
187
|
}
|
|
182
|
-
// Apply sentence-level header/footer filtering
|
|
188
|
+
// Apply sentence-level header/footer filtering (returns per-page filtered text)
|
|
183
189
|
// This handles variable content like page numbers ("7 of 75") using semantic similarity
|
|
184
|
-
const
|
|
190
|
+
const filteredPages = await (0, pdf_filter_js_1.filterPageBoundarySentences)(pages, embedder);
|
|
191
|
+
const text = filteredPages.filter((t) => t.length > 0).join('\n\n');
|
|
192
|
+
// Extract title from filtered page 1 via semantic chunking
|
|
193
|
+
// Isolated try-catch: title extraction failure should not abort PDF ingestion
|
|
194
|
+
const fileName = (0, node_path_1.basename)(filePath);
|
|
195
|
+
let firstPageChunkText;
|
|
196
|
+
try {
|
|
197
|
+
const filteredPage1 = filteredPages[0];
|
|
198
|
+
if (filteredPage1 && filteredPage1.trim().length > 0) {
|
|
199
|
+
const chunker = new index_js_1.SemanticChunker();
|
|
200
|
+
const page1Chunks = await chunker.chunkText(filteredPage1, embedder);
|
|
201
|
+
if (page1Chunks.length > 0) {
|
|
202
|
+
firstPageChunkText = page1Chunks[0].text;
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
catch (titleError) {
|
|
207
|
+
console.error(`Title extraction failed, falling back to filename: ${titleError}`);
|
|
208
|
+
}
|
|
209
|
+
const titleResult = (0, title_extractor_js_1.extractPdfTitle)(metadataTitle, firstPageChunkText, fileName);
|
|
185
210
|
console.error(`Parsed PDF: ${filePath} (${text.length} characters, ${pdf.numPages} pages)`);
|
|
186
|
-
return text;
|
|
211
|
+
return { content: text, title: titleResult.title };
|
|
187
212
|
}
|
|
188
213
|
catch (error) {
|
|
189
214
|
throw new FileOperationError(`Failed to parse PDF: ${filePath}`, error);
|
|
@@ -192,15 +217,25 @@ class DocumentParser {
|
|
|
192
217
|
/**
|
|
193
218
|
* DOCX parsing (using mammoth)
|
|
194
219
|
*
|
|
220
|
+
* Uses extractRawText for content and convertToHtml additionally for title detection.
|
|
221
|
+
*
|
|
195
222
|
* @param filePath - DOCX file path
|
|
196
|
-
* @returns
|
|
223
|
+
* @returns ParseResult with content and extracted title
|
|
197
224
|
* @throws FileOperationError - File read failed, parse failed
|
|
198
225
|
*/
|
|
199
226
|
async parseDocx(filePath) {
|
|
200
227
|
try {
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
228
|
+
// Read file once and pass buffer to both mammoth calls
|
|
229
|
+
const buffer = await (0, promises_1.readFile)(filePath);
|
|
230
|
+
// Use extractRawText for content (unchanged behavior)
|
|
231
|
+
const result = await mammoth_1.default.extractRawText({ buffer });
|
|
232
|
+
const rawText = result.value;
|
|
233
|
+
// Use convertToHtml additionally for title extraction (first <h1>)
|
|
234
|
+
const htmlResult = await mammoth_1.default.convertToHtml({ buffer });
|
|
235
|
+
const fileName = (0, node_path_1.basename)(filePath);
|
|
236
|
+
const titleResult = (0, title_extractor_js_1.extractDocxTitle)(htmlResult.value, fileName);
|
|
237
|
+
console.error(`Parsed DOCX: ${filePath} (${rawText.length} characters)`);
|
|
238
|
+
return { content: rawText, title: titleResult.title };
|
|
204
239
|
}
|
|
205
240
|
catch (error) {
|
|
206
241
|
throw new FileOperationError(`Failed to parse DOCX: ${filePath}`, error);
|
|
@@ -210,14 +245,16 @@ class DocumentParser {
|
|
|
210
245
|
* TXT parsing (using fs.readFile)
|
|
211
246
|
*
|
|
212
247
|
* @param filePath - TXT file path
|
|
213
|
-
* @returns
|
|
248
|
+
* @returns ParseResult with content and extracted title
|
|
214
249
|
* @throws FileOperationError - File read failed
|
|
215
250
|
*/
|
|
216
251
|
async parseTxt(filePath) {
|
|
217
252
|
try {
|
|
218
253
|
const text = await (0, promises_1.readFile)(filePath, 'utf-8');
|
|
254
|
+
const fileName = (0, node_path_1.basename)(filePath);
|
|
255
|
+
const titleResult = (0, title_extractor_js_1.extractTxtTitle)(text, fileName);
|
|
219
256
|
console.error(`Parsed TXT: ${filePath} (${text.length} characters)`);
|
|
220
|
-
return text;
|
|
257
|
+
return { content: text, title: titleResult.title };
|
|
221
258
|
}
|
|
222
259
|
catch (error) {
|
|
223
260
|
throw new FileOperationError(`Failed to parse TXT: ${filePath}`, error);
|
|
@@ -227,14 +264,16 @@ class DocumentParser {
|
|
|
227
264
|
* MD parsing (using fs.readFile)
|
|
228
265
|
*
|
|
229
266
|
* @param filePath - MD file path
|
|
230
|
-
* @returns
|
|
267
|
+
* @returns ParseResult with content and extracted title
|
|
231
268
|
* @throws FileOperationError - File read failed
|
|
232
269
|
*/
|
|
233
270
|
async parseMd(filePath) {
|
|
234
271
|
try {
|
|
235
272
|
const text = await (0, promises_1.readFile)(filePath, 'utf-8');
|
|
273
|
+
const fileName = (0, node_path_1.basename)(filePath);
|
|
274
|
+
const titleResult = (0, title_extractor_js_1.extractMarkdownTitle)(text, fileName);
|
|
236
275
|
console.error(`Parsed MD: ${filePath} (${text.length} characters)`);
|
|
237
|
-
return text;
|
|
276
|
+
return { content: text, title: titleResult.title };
|
|
238
277
|
}
|
|
239
278
|
catch (error) {
|
|
240
279
|
throw new FileOperationError(`Failed to parse MD: ${filePath}`, error);
|
package/dist/parser/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":";AAAA,6DAA6D;;;;;;AAE7D,qCAAkC;AAClC,+CAA4D;AAC5D,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":";AAAA,6DAA6D;;;;;;AAE7D,qCAAkC;AAClC,+CAA4D;AAC5D,yCAAuE;AACvE,sDAA6B;AAC7B,6DAA6D;AAE7D,kDAAqD;AACrD,mDAAoG;AACpG,6DAK6B;AAyB7B;;GAEG;AACH,MAAa,eAAgB,SAAQ,KAAK;IACxC,YACE,OAAe,EACU,KAAa;QAEtC,KAAK,CAAC,OAAO,CAAC,CAAA;QAFW,UAAK,GAAL,KAAK,CAAQ;QAGtC,IAAI,CAAC,IAAI,GAAG,iBAAiB,CAAA;IAC/B,CAAC;CACF;AARD,0CAQC;AAED;;GAEG;AACH,MAAa,kBAAmB,SAAQ,KAAK;IAC3C,YACE,OAAe,EACU,KAAa;QAEtC,KAAK,CAAC,OAAO,CAAC,CAAA;QAFW,UAAK,GAAL,KAAK,CAAQ;QAGtC,IAAI,CAAC,IAAI,GAAG,oBAAoB,CAAA;IAClC,CAAC;CACF;AARD,gDAQC;AAED,+CAA+C;AAC/C,uBAAuB;AACvB,+CAA+C;AAE/C;;;;;;;GAOG;AACH,MAAa,cAAc;IAKzB,YAAY,MAAoB;QAHhC,6FAA6F;QACrF,oBAAe,GAAkB,IAAI,CAAA;QAG3C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAA;IACtB,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,gBAAgB,CAAC,QAAgB;QACrC,wDAAwD;QACxD,IAAI,CAAC,IAAA,sBAAU,EAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,eAAe,CACvB,8CAA8C,QAAQ,qDAAqD,CAC5G,CAAA;QACH,CAAC;QAED,oEAAoE;QACpE,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,MAAM,IAAA,mBAAQ,EAAC,IAAA,mBAAO,EAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAA;YAC7D,uDAAuD;YACvD,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC,QAAQ,CAAC,eAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,GAAG,eAAG,CAAA;QAC3E,CAAC;QAED,uDAAuD;QACvD,IAAI,YAAoB,CAAA;QACxB,IAAI,CAAC;YACH,YAAY,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;QACzC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,sDAAsD;YACtD,iEAAiE;YACjE,oEAAoE;YACpE,4EAA4E;YAC5E,MAAM,SAAS,GAAG,MAAM,IAAA,gBAAK,EAAC,QAAQ,CAAC;iBACpC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,cAAc,EAAE,CAAC;iBACvC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAA;YAErB,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,IAAI,eAAe,CACvB,6BAA6B,QAAQ,kDAAkD,EACvF,KAAc,CACf,CAAA;YACH,CAAC;YAED,0EAA0E;YAC1E,kFAAkF;YAClF,2FAA2F;YAC3F,YAAY,GAAG,IAAA,mBAAO,EAAC,QAAQ,CAAC,CAAA;QAClC,CAAC;QAED,4CAA4C;QAC5C,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,eAAe,CACvB,sCAAsC,IAAI,CAAC,eAAe,sCAAsC,QAAQ,EAAE,CAC3G,CAAA;QACH,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAgB;QAC/B,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,IAAA,kBAAQ,EAAC,QAAQ,CAAC,CAAA;YAChC,IAAI,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;gBACzC,MAAM,IAAI,eAAe,CACvB,4BAA4B,KAAK,CAAC,IAAI,MAAM,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CACtE,CAAA;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,KAAK,YAAY,eAAe,EAAE,CAAC;gBACrC,MAAM,KAAK,CAAA;YACb,CAAC;YACD,MAAM,IAAI,kBAAkB,CAAC,8BAA8B,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACxF,CAAC;IACH,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,QAAgB;QAC9B,aAAa;QACb,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QACrC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QAE/B,gDAAgD;QAChD,MAAM,GAAG,GAAG,IAAA,mBAAO,EAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAA;QAC3C,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,OAAO;gBACV,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAA;YACvC,KAAK,MAAM;gBACT,OAAO,MAAM,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAA;YACtC,KAAK,KAAK;gBACR,OAAO,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAA;YACrC;gBACE,MAAM,IAAI,eAAe,CAAC,4BAA4B,GAAG,EAAE,CAAC,CAAA;QAChE,CAAC;IACH,CAAC;IAED;;;;;;;;;;;;;OAaG;IACH,KAAK,CAAC,QAAQ,CAAC,QAAgB,EAAE,QAA2B;QAC1D,aAAa;QACb,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QACrC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QAE/B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;YACvC,MAAM,GAAG,GAAG,MAAM,IAAA,qBAAW,EAAC;gBAC5B,IAAI,EAAE,IAAI,UAAU,CAAC,MAAM,CAAC;gBAC5B,cAAc,EAAE,IAAI;gBACpB,eAAe,EAAE,KAAK;aACvB,CAAC,CAAC,OAAO,CAAA;YAEV,wCAAwC;YACxC,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAA;YACxC,MAAM,aAAa,GAAI,QAAQ,EAAE,IAAgC,EAAE,CAAC,OAAO,CAE9D,CAAA;YAEb,wDAAwD;YACxD,MAAM,KAAK,GAAe,EAAE,CAAA;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;gBACjC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAA;gBAE/C,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK;qBAC5B,MAAM,CAAC,CAAC,IAAI,EAAoB,EAAE,CAAC,KAAK,IAAI,IAAI,CAAC;qBACjD,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;oBACd,IAAI,EAAE,IAAI,CAAC,GAAG;oBACd,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;oBACpB,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;oBACpB,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBACrC,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,KAAK;iBAC7B,CAAC,CAAC,CAAA;gBAEL,KAAK,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;YACnC,CAAC;YAED,gFAAgF;YAChF,wFAAwF;YACxF,MAAM,aAAa,GAAG,MAAM,IAAA,2CAA2B,EAAC,KAAK,EAAE,QAAQ,CAAC,CAAA;YACxE,MAAM,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;YAEnE,2DAA2D;YAC3D,8EAA8E;YAC9E,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,IAAI,kBAAsC,CAAA;YAC1C,IAAI,CAAC;gBACH,MAAM,aAAa,GAAG,aAAa,CAAC,CAAC,CAAC,CAAA;gBACtC,IAAI,aAAa,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACrD,MAAM,OAAO,GAAG,IAAI,0BAAe,EAAE,CAAA;oBACrC,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,SAAS,CAAC,aAAa,EAAE,QAAQ,CAAC,CAAA;oBACpE,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC3B,kBAAkB,GAAI,WAAW,CAAC,CAAC,CAAsB,CAAC,IAAI,CAAA;oBAChE,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,UAAU,EAAE,CAAC;gBACpB,OAAO,CAAC,KAAK,CAAC,sDAAsD,UAAU,EAAE,CAAC,CAAA;YACnF,CAAC;YACD,MAAM,WAAW,GAAG,IAAA,oCAAe,EAAC,aAAa,EAAE,kBAAkB,EAAE,QAAQ,CAAC,CAAA;YAEhF,OAAO,CAAC,KAAK,CAAC,eAAe,QAAQ,KAAK,IAAI,CAAC,MAAM,gBAAgB,GAAG,CAAC,QAAQ,SAAS,CAAC,CAAA;YAE3F,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACpD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QAClF,CAAC;IACH,CAAC;IAED;;;;;;;;OAQG;IACK,KAAK,CAAC,SAAS,CAAC,QAAgB;QACtC,IAAI,CAAC;YACH,uDAAuD;YACvD,MAAM,MAAM,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;YAEvC,sDAAsD;YACtD,MAAM,MAAM,GAAG,MAAM,iBAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC,CAAA;YACvD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAA;YAE5B,mEAAmE;YACnE,MAAM,UAAU,GAAG,MAAM,iBAAO,CAAC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC,CAAA;YAC1D,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,MAAM,WAAW,GAAG,IAAA,qCAAgB,EAAC,UAAU,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAA;YAEhE,OAAO,CAAC,KAAK,CAAC,gBAAgB,QAAQ,KAAK,OAAO,CAAC,MAAM,cAAc,CAAC,CAAA;YACxE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,yBAAyB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACnF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,QAAQ,CAAC,QAAgB;QACrC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;YAC9C,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,MAAM,WAAW,GAAG,IAAA,oCAAe,EAAC,IAAI,EAAE,QAAQ,CAAC,CAAA;YACnD,OAAO,CAAC,KAAK,CAAC,eAAe,QAAQ,KAAK,IAAI,CAAC,MAAM,cAAc,CAAC,CAAA;YACpE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACpD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QAClF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,OAAO,CAAC,QAAgB;QACpC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;YAC9C,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,MAAM,WAAW,GAAG,IAAA,yCAAoB,EAAC,IAAI,EAAE,QAAQ,CAAC,CAAA;YACxD,OAAO,CAAC,KAAK,CAAC,cAAc,QAAQ,KAAK,IAAI,CAAC,MAAM,cAAc,CAAC,CAAA;YACnE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACpD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,uBAAuB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACjF,CAAC;IACH,CAAC;CACF;AAxQD,wCAwQC"}
|
|
@@ -71,17 +71,18 @@ interface SentencePatternResult {
|
|
|
71
71
|
*/
|
|
72
72
|
export declare function detectSentencePatterns(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<SentencePatternResult>;
|
|
73
73
|
/**
|
|
74
|
-
* Filter page boundary sentences and
|
|
74
|
+
* Filter page boundary sentences and return per-page filtered text
|
|
75
75
|
*
|
|
76
76
|
* This is the main entry point for sentence-level header/footer filtering.
|
|
77
77
|
* It detects and removes repeating sentence patterns at page boundaries.
|
|
78
|
+
* Returns an array of filtered text per page, preserving page boundaries.
|
|
78
79
|
*
|
|
79
80
|
* Use this instead of joinFilteredPages when embedder is available.
|
|
80
81
|
*
|
|
81
82
|
* @param pages - Array of page data
|
|
82
83
|
* @param embedder - Embedder for generating embeddings
|
|
83
84
|
* @param config - Configuration options
|
|
84
|
-
* @returns
|
|
85
|
+
* @returns Array of filtered text strings, one per page
|
|
85
86
|
*/
|
|
86
|
-
export declare function filterPageBoundarySentences(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<string>;
|
|
87
|
+
export declare function filterPageBoundarySentences(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<string[]>;
|
|
87
88
|
//# sourceMappingURL=pdf-filter.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pdf-filter.d.ts","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAA;AAIvE,YAAY,EAAE,iBAAiB,EAAE,CAAA;AAMjC;;GAEG;AACH,UAAU,oBAAoB;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,CAAC,EAAE,MAAM,CAAA;IACT,CAAC,EAAE,MAAM,CAAA;IACT,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,OAAO,CAAA;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,oBAAoB,EAAE,CAAA;CAC9B;AAoCD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,MAAM,CAK3D;AAoLD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,iEAAiE;IACjE,mBAAmB,EAAE,MAAM,CAAA;IAC3B,gEAAgE;IAChE,QAAQ,EAAE,MAAM,CAAA;IAChB,+EAA+E;IAC/E,WAAW,EAAE,MAAM,CAAA;CACpB;AASD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,qEAAqE;IACrE,mBAAmB,EAAE,OAAO,CAAA;IAC5B,oEAAoE;IACpE,kBAAkB,EAAE,OAAO,CAAA;IAC3B,2CAA2C;IAC3C,gBAAgB,EAAE,MAAM,CAAA;IACxB,0CAA0C;IAC1C,gBAAgB,EAAE,MAAM,CAAA;CACzB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,sBAAsB,CAC1C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,qBAAqB,CAAC,CAsEhC;AAED
|
|
1
|
+
{"version":3,"file":"pdf-filter.d.ts","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAA;AAIvE,YAAY,EAAE,iBAAiB,EAAE,CAAA;AAMjC;;GAEG;AACH,UAAU,oBAAoB;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,CAAC,EAAE,MAAM,CAAA;IACT,CAAC,EAAE,MAAM,CAAA;IACT,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,OAAO,CAAA;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,oBAAoB,EAAE,CAAA;CAC9B;AAoCD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,MAAM,CAK3D;AAoLD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,iEAAiE;IACjE,mBAAmB,EAAE,MAAM,CAAA;IAC3B,gEAAgE;IAChE,QAAQ,EAAE,MAAM,CAAA;IAChB,+EAA+E;IAC/E,WAAW,EAAE,MAAM,CAAA;CACpB;AASD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,qEAAqE;IACrE,mBAAmB,EAAE,OAAO,CAAA;IAC5B,oEAAoE;IACpE,kBAAkB,EAAE,OAAO,CAAA;IAC3B,2CAA2C;IAC3C,gBAAgB,EAAE,MAAM,CAAA;IACxB,0CAA0C;IAC1C,gBAAgB,EAAE,MAAM,CAAA;CACzB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,sBAAsB,CAC1C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,qBAAqB,CAAC,CAsEhC;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,2BAA2B,CAC/C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,MAAM,EAAE,CAAC,CAsCnB"}
|
|
@@ -277,29 +277,30 @@ async function detectSentencePatterns(pages, embedder, config = {}) {
|
|
|
277
277
|
return result;
|
|
278
278
|
}
|
|
279
279
|
/**
|
|
280
|
-
* Filter page boundary sentences and
|
|
280
|
+
* Filter page boundary sentences and return per-page filtered text
|
|
281
281
|
*
|
|
282
282
|
* This is the main entry point for sentence-level header/footer filtering.
|
|
283
283
|
* It detects and removes repeating sentence patterns at page boundaries.
|
|
284
|
+
* Returns an array of filtered text per page, preserving page boundaries.
|
|
284
285
|
*
|
|
285
286
|
* Use this instead of joinFilteredPages when embedder is available.
|
|
286
287
|
*
|
|
287
288
|
* @param pages - Array of page data
|
|
288
289
|
* @param embedder - Embedder for generating embeddings
|
|
289
290
|
* @param config - Configuration options
|
|
290
|
-
* @returns
|
|
291
|
+
* @returns Array of filtered text strings, one per page
|
|
291
292
|
*/
|
|
292
293
|
async function filterPageBoundarySentences(pages, embedder, config = {}) {
|
|
293
294
|
const cfg = { ...DEFAULT_SENTENCE_PATTERN_CONFIG, ...config };
|
|
294
295
|
// Need minimum pages to detect patterns
|
|
295
296
|
if (pages.length < cfg.minPages) {
|
|
296
|
-
return joinFilteredPages(
|
|
297
|
+
return pages.map((page) => joinFilteredPages([page]));
|
|
297
298
|
}
|
|
298
299
|
// Detect patterns
|
|
299
300
|
const patterns = await detectSentencePatterns(pages, embedder, cfg);
|
|
300
|
-
// If no patterns detected, return normally joined text
|
|
301
|
+
// If no patterns detected, return normally joined text per page
|
|
301
302
|
if (!patterns.removeFirstSentence && !patterns.removeLastSentence) {
|
|
302
|
-
return joinFilteredPages(
|
|
303
|
+
return pages.map((page) => joinFilteredPages([page]));
|
|
303
304
|
}
|
|
304
305
|
// Split each page into sentences with Y coordinate (merged by Y)
|
|
305
306
|
const pageSentences = pages.map((page) => splitItemsIntoSentencesWithY(page.items));
|
|
@@ -314,10 +315,7 @@ async function filterPageBoundarySentences(pages, embedder, config = {}) {
|
|
|
314
315
|
}
|
|
315
316
|
return cleaned;
|
|
316
317
|
});
|
|
317
|
-
//
|
|
318
|
-
return cleanedPageSentences
|
|
319
|
-
.map((sentences) => sentences.map((s) => s.text).join(' '))
|
|
320
|
-
.filter((text) => text.length > 0)
|
|
321
|
-
.join('\n\n');
|
|
318
|
+
// Return per-page filtered text
|
|
319
|
+
return cleanedPageSentences.map((sentences) => sentences.map((s) => s.text).join(' '));
|
|
322
320
|
}
|
|
323
321
|
//# sourceMappingURL=pdf-filter.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pdf-filter.js","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":";AAAA,2BAA2B;AAC3B,wDAAwD;AACxD,uEAAuE;;AAuEvE,8CAKC;AA0OD,wDA0EC;
|
|
1
|
+
{"version":3,"file":"pdf-filter.js","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":";AAAA,2BAA2B;AAC3B,wDAAwD;AACxD,uEAAuE;;AAuEvE,8CAKC;AA0OD,wDA0EC;AAgBD,kEA0CC;AAvbD,0EAAoE;AA4BpE,+CAA+C;AAC/C,eAAe;AACf,+CAA+C;AAE/C;;;;;;GAMG;AACH,SAAS,aAAa,CAAC,KAA6B;IAClD,6DAA6D;IAC7D,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkC,CAAA;IACzD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QAClC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAChB,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAA;IACvB,CAAC;IAED,oFAAoF;IACpF,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAClB,KAAK;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SAClB,IAAI,CAAC,GAAG,CAAC,CACb;SACA,IAAI,CAAC,IAAI,CAAC;SACV,IAAI,EAAE,CAAA;AACX,CAAC;AAED;;;;;GAKG;AACH,SAAgB,iBAAiB,CAAC,KAAiB;IACjD,OAAO,KAAK;SACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;SACxC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;SACjC,IAAI,CAAC,MAAM,CAAC,CAAA;AACjB,CAAC;AAcD;;;;;;;;;;GAUG;AACH,SAAS,4BAA4B,CAAC,KAA6B;IACjE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAEjC,+DAA+D;IAC/D,MAAM,WAAW,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC3C,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QACvB,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAA;QACrC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;IAClB,CAAC,CAAC,CAAA;IAEF,2DAA2D;IAC3D,MAAM,UAAU,GAAyD,EAAE,CAAA;IAC3E,IAAI,QAAQ,GAAG,EAAE,CAAA;IACjB,IAAI,KAAK,GAAkB,IAAI,CAAA;IAE/B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC/B,4DAA4D;QAC5D,6EAA6E;QAC7E,IAAI,KAAK,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACnD,QAAQ,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAA;QACtC,CAAC;QAED,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAA;QACjD,QAAQ,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,CAAA;QAC3B,KAAK,GAAG,IAAI,CAAC,CAAC,CAAA;IAChB,CAAC;IAED,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,QAAQ,CAAC,CAAA;IAE9C,kEAAkE;IAClE,MAAM,cAAc,GAAoB,EAAE,CAAA;IAC1C,IAAI,WAAW,GAAG,CAAC,CAAA;IAEnB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,8CAA8C;QAC9C,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACpE,IAAI,aAAa,KAAK,CAAC,CAAC;YAAE,SAAQ;QAElC,4CAA4C;QAC5C,IAAI,UAAU,GAAG,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC3B,IAAI,KAAK,IAAI,KAAK,CAAC,KAAK,IAAI,aAAa,EAAE,CAAC;gBAC1C,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;gBACrC,MAAK;YACP,CAAC;QACH,CAAC;QAED,cAAc,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAA;QACtD,WAAW,GAAG,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAA;IAC/C,CAAC;IAED,yCAAyC;IACzC,OAAO,iBAAiB,CAAC,cAAc,CAAC,CAAA;AAC1C,CAAC;AAED;;;;;GAKG;AACH,SAAS,iBAAiB,CAAC,SAA0B;IACnD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAErC,MAAM,MAAM,GAAoB,EAAE,CAAA;IAClC,IAAI,OAAO,GAAyB,IAAI,CAAA;IAExC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACrB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;aAAM,IAAI,OAAO,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,EAAE,CAAC;YACpC,qBAAqB;YACrB,OAAO,CAAC,IAAI,IAAI,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAA;QACrC,CAAC;aAAM,CAAC;YACN,0CAA0C;YAC1C,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACpB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;QACrB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACtB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,yCAAyC;AACzC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAc,EAAE,IAAc;IACtD,IAAI,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,CAAC,CAAA;IACV,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;QACvB,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;QACvB,UAAU,IAAI,EAAE,GAAG,EAAE,CAAA;QACrB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;QAChB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;IAClB,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACvD,IAAI,WAAW,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAE/B,OAAO,UAAU,GAAG,WAAW,CAAA;AACjC,CAAC;AAED;;;;;;GAMG;AACH,SAAS,wBAAwB,CAAC,UAAsB;IACtD,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,GAAG,CAAA;IAErC,MAAM,YAAY,GAAa,EAAE,CAAA;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,IAAI,IAAI,IAAI,IAAI,EAAE,CAAC;gBACjB,YAAY,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;YACjD,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAEvC,uBAAuB;IACvB,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAE/C,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QAClC,qCAAqC;QACrC,OAAO,CAAC,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACtE,CAAC;IACD,oBAAoB;IACpB,OAAO,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;AAC/B,CAAC;AAcD,iEAAiE;AACjE,MAAM,+BAA+B,GAA0B;IAC7D,mBAAmB,EAAE,IAAI;IACzB,QAAQ,EAAE,CAAC;IACX,WAAW,EAAE,CAAC;CACf,CAAA;AAgBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACI,KAAK,UAAU,sBAAsB,CAC1C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,MAAM,MAAM,GAA0B;QACpC,mBAAmB,EAAE,KAAK;QAC1B,kBAAkB,EAAE,KAAK;QACzB,gBAAgB,EAAE,CAAC;QACnB,gBAAgB,EAAE,CAAC;KACpB,CAAA;IAED,iDAAiD;IACjD,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,MAAM,CAAA;IACf,CAAC;IAED,kDAAkD;IAClD,uEAAuE;IACvE,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAChD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,CAAA;IAClD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC,CAAA;IACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,GAAG,GAAG,CAAC,WAAW,CAAC,CAAA;IACrE,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAA;IAErD,oEAAoE;IACpE,MAAM,aAAa,GAAsB,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAChE,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,yDAAyD;IACzD,MAAM,cAAc,GAAa,EAAE,CAAA;IACnC,MAAM,aAAa,GAAa,EAAE,CAAA;IAElC,KAAK,MAAM,SAAS,IAAI,aAAa,EAAE,CAAC;QACtC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YACvC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,aAAa,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC3D,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,IAAI,cAAc,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,cAAc,CAAC,CAAA;QAC5D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,mBAAmB,GAAG,IAAI,CAAA;YACjC,OAAO,CAAC,KAAK,CACX,qCAAqC,cAAc,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACtJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,6EAA6E;IAC7E,IAAI,aAAa,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;QAC3D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,kBAAkB,GAAG,IAAI,CAAA;YAChC,OAAO,CAAC,KAAK,CACX,qCAAqC,aAAa,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACrJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;;;;;;;GAaG;AACI,KAAK,UAAU,2BAA2B,CAC/C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACvD,CAAC;IAED,kBAAkB;IAClB,MAAM,QAAQ,GAAG,MAAM,sBAAsB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAA;IAEnE,gEAAgE;IAChE,IAAI,CAAC,QAAQ,CAAC,mBAAmB,IAAI,CAAC,QAAQ,CAAC,kBAAkB,EAAE,CAAC;QAClE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACvD,CAAC;IAED,iEAAiE;IACjE,MAAM,aAAa,GAAsB,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC1D,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,+CAA+C;IAC/C,MAAM,oBAAoB,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;QAC3D,IAAI,OAAO,GAAG,CAAC,GAAG,SAAS,CAAC,CAAA;QAE5B,IAAI,QAAQ,CAAC,mBAAmB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;QAC5B,CAAC;QAED,IAAI,QAAQ,CAAC,kBAAkB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAChC,CAAC;QAED,OAAO,OAAO,CAAA;IAChB,CAAC,CAAC,CAAA;IAEF,gCAAgC;IAChC,OAAO,oBAAoB,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;AACxF,CAAC"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Result of title extraction, including how the title was determined
|
|
3
|
+
*/
|
|
4
|
+
export interface TitleExtractionResult {
|
|
5
|
+
title: string;
|
|
6
|
+
source: 'metadata' | 'content' | 'filename';
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Convert a file name to a human-readable title
|
|
10
|
+
* Strips the extension and replaces hyphens/underscores with spaces
|
|
11
|
+
*
|
|
12
|
+
* @param fileName - File name (e.g., "2024-annual-report.pdf")
|
|
13
|
+
* @returns Human-readable title (e.g., "2024 annual report")
|
|
14
|
+
*/
|
|
15
|
+
export declare function fileNameToTitle(fileName: string): string;
|
|
16
|
+
/**
|
|
17
|
+
* Extract title from Markdown content
|
|
18
|
+
* Priority: YAML frontmatter title -> first # H1 -> file name
|
|
19
|
+
*
|
|
20
|
+
* @param text - Markdown content
|
|
21
|
+
* @param fileName - File name for fallback
|
|
22
|
+
* @returns Title extraction result
|
|
23
|
+
*/
|
|
24
|
+
export declare function extractMarkdownTitle(text: string, fileName: string): TitleExtractionResult;
|
|
25
|
+
/**
|
|
26
|
+
* Extract title from plain text content
|
|
27
|
+
* Priority: first line followed by empty line -> file name
|
|
28
|
+
*
|
|
29
|
+
* @param text - Plain text content
|
|
30
|
+
* @param fileName - File name for fallback
|
|
31
|
+
* @returns Title extraction result
|
|
32
|
+
*/
|
|
33
|
+
export declare function extractTxtTitle(text: string, fileName: string): TitleExtractionResult;
|
|
34
|
+
/**
|
|
35
|
+
* Extract title from HTML content (using Readability title)
|
|
36
|
+
* Priority: readability title -> file name
|
|
37
|
+
*
|
|
38
|
+
* @param readabilityTitle - Title extracted by Readability
|
|
39
|
+
* @param fileName - File name for fallback
|
|
40
|
+
* @returns Title extraction result
|
|
41
|
+
*/
|
|
42
|
+
export declare function extractHtmlTitle(readabilityTitle: string, fileName: string): TitleExtractionResult;
|
|
43
|
+
/**
|
|
44
|
+
* Extract title from PDF metadata or first page chunk text
|
|
45
|
+
* Priority: PDF metadata /Title -> first page chunk 0 text -> file name
|
|
46
|
+
*
|
|
47
|
+
* Rejects metadata titles that look like file paths (contain / or \) or are empty/whitespace-only.
|
|
48
|
+
*
|
|
49
|
+
* @param metadataTitle - PDF metadata /Title value (may be undefined)
|
|
50
|
+
* @param firstPageChunkText - Text of chunk 0 from semantic chunking of page 1 (may be undefined)
|
|
51
|
+
* @param fileName - File name for fallback
|
|
52
|
+
* @returns Title extraction result
|
|
53
|
+
*/
|
|
54
|
+
export declare function extractPdfTitle(metadataTitle: string | undefined, firstPageChunkText: string | undefined, fileName: string): TitleExtractionResult;
|
|
55
|
+
/**
|
|
56
|
+
* Extract title from DOCX mammoth HTML output
|
|
57
|
+
* Priority: first <h1> from mammoth HTML -> file name
|
|
58
|
+
*
|
|
59
|
+
* @param htmlContent - HTML content generated by mammoth.convertToHtml()
|
|
60
|
+
* @param fileName - File name for fallback
|
|
61
|
+
* @returns Title extraction result
|
|
62
|
+
*/
|
|
63
|
+
export declare function extractDocxTitle(htmlContent: string, fileName: string): TitleExtractionResult;
|
|
64
|
+
//# sourceMappingURL=title-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"title-extractor.d.ts","sourceRoot":"","sources":["../../src/parser/title-extractor.ts"],"names":[],"mappings":"AAOA;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE,MAAM,CAAA;IACb,MAAM,EAAE,UAAU,GAAG,SAAS,GAAG,UAAU,CAAA;CAC5C;AAMD;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAMxD;AAMD;;;;;;;GAOG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,qBAAqB,CAe1F;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,qBAAqB,CAkBrF;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAC9B,gBAAgB,EAAE,MAAM,EACxB,QAAQ,EAAE,MAAM,GACf,qBAAqB,CAOvB;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,eAAe,CAC7B,aAAa,EAAE,MAAM,GAAG,SAAS,EACjC,kBAAkB,EAAE,MAAM,GAAG,SAAS,EACtC,QAAQ,EAAE,MAAM,GACf,qBAAqB,CAiBvB;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,qBAAqB,CAY7F"}
|