mcp-local-rag 0.5.6 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +3 -2
  2. package/dist/parser/html-parser.d.ts +6 -2
  3. package/dist/parser/html-parser.d.ts.map +1 -1
  4. package/dist/parser/html-parser.js +19 -10
  5. package/dist/parser/html-parser.js.map +1 -1
  6. package/dist/parser/index.d.ts +24 -7
  7. package/dist/parser/index.d.ts.map +1 -1
  8. package/dist/parser/index.js +62 -14
  9. package/dist/parser/index.js.map +1 -1
  10. package/dist/parser/pdf-filter.d.ts +4 -3
  11. package/dist/parser/pdf-filter.d.ts.map +1 -1
  12. package/dist/parser/pdf-filter.js +8 -10
  13. package/dist/parser/pdf-filter.js.map +1 -1
  14. package/dist/parser/title-extractor.d.ts +64 -0
  15. package/dist/parser/title-extractor.d.ts.map +1 -0
  16. package/dist/parser/title-extractor.js +139 -0
  17. package/dist/parser/title-extractor.js.map +1 -0
  18. package/dist/server/index.d.ts +3 -1
  19. package/dist/server/index.d.ts.map +1 -1
  20. package/dist/server/index.js +86 -22
  21. package/dist/server/index.js.map +1 -1
  22. package/dist/server/raw-data-utils.d.ts +32 -0
  23. package/dist/server/raw-data-utils.d.ts.map +1 -1
  24. package/dist/server/raw-data-utils.js +46 -0
  25. package/dist/server/raw-data-utils.js.map +1 -1
  26. package/dist/server/tool-definitions.js +1 -1
  27. package/dist/server/tool-definitions.js.map +1 -1
  28. package/dist/server/types.d.ts +39 -0
  29. package/dist/server/types.d.ts.map +1 -1
  30. package/dist/server-main.d.ts.map +1 -1
  31. package/dist/server-main.js +17 -0
  32. package/dist/server-main.js.map +1 -1
  33. package/dist/vectordb/index.d.ts +9 -94
  34. package/dist/vectordb/index.d.ts.map +1 -1
  35. package/dist/vectordb/index.js +55 -185
  36. package/dist/vectordb/index.js.map +1 -1
  37. package/dist/vectordb/search-filters.d.ts +45 -0
  38. package/dist/vectordb/search-filters.d.ts.map +1 -0
  39. package/dist/vectordb/search-filters.js +142 -0
  40. package/dist/vectordb/search-filters.js.map +1 -0
  41. package/dist/vectordb/types.d.ts +112 -0
  42. package/dist/vectordb/types.d.ts.map +1 -0
  43. package/dist/vectordb/types.js +74 -0
  44. package/dist/vectordb/types.js.map +1 -0
  45. package/package.json +1 -1
  46. package/skills/mcp-local-rag/SKILL.md +10 -0
  47. package/skills/mcp-local-rag/references/html-ingestion.md +2 -1
  48. package/skills/mcp-local-rag/references/result-refinement.md +1 -0
package/README.md CHANGED
@@ -129,12 +129,12 @@ HTML is automatically cleaned—you get the article content, not the boilerplate
129
129
 
130
130
  Search uses semantic similarity with keyword boost. This means `useEffect` finds documents containing that exact term, not just semantically similar React concepts.
131
131
 
132
- Results include text content, source file, and relevance score. Adjust result count with `limit` (1-20, default 10).
132
+ Results include text content, source file, document title, and relevance score. The document title provides context for each chunk, helping identify which document a result belongs to. Adjust result count with `limit` (1-20, default 10).
133
133
 
134
134
  ### Managing Files
135
135
 
136
136
  ```
137
- "List all ingested files" # See what's indexed
137
+ "List all files in BASE_DIR and their ingested status" # See what's indexed
138
138
  "Delete old-spec.pdf from RAG" # Remove a file
139
139
  "Show RAG server status" # Check system health
140
140
  ```
@@ -148,6 +148,7 @@ Adjust these for your use case:
148
148
  | `RAG_HYBRID_WEIGHT` | `0.6` | Keyword boost factor. 0 = semantic only, higher = stronger keyword boost. |
149
149
  | `RAG_GROUPING` | (not set) | `similar` for top group only, `related` for top 2 groups. |
150
150
  | `RAG_MAX_DISTANCE` | (not set) | Filter out low-relevance results (e.g., `0.5`). |
151
+ | `RAG_MAX_FILES` | (not set) | Limit results to top N files (e.g., `1` for single best file). |
151
152
 
152
153
  ### Code-focused tuning
153
154
 
@@ -5,10 +5,14 @@
5
5
  * 1. HTML string → JSDOM (DOM creation)
6
6
  * 2. JSDOM → Readability (main content extraction, noise removal)
7
7
  * 3. Readability result → Turndown (Markdown conversion)
8
+ * 4. Title extracted separately via extractHtmlTitle (NOT prepended to content)
8
9
  *
9
10
  * @param html - Raw HTML string
10
11
  * @param url - Source URL (used for resolving relative links)
11
- * @returns Markdown string of extracted content
12
+ * @returns Object with content (markdown) and title (extracted separately)
12
13
  */
13
- export declare function parseHtml(html: string, url: string): Promise<string>;
14
+ export declare function parseHtml(html: string, url: string): Promise<{
15
+ content: string;
16
+ title: string;
17
+ }>;
14
18
  //# sourceMappingURL=html-parser.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":"AAsDA;;;;;;;;;;;GAWG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAoD1E"}
1
+ {"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":"AAuDA;;;;;;;;;;;;GAYG;AACH,wBAAsB,SAAS,CAC7B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CA0D7C"}
@@ -9,6 +9,7 @@ exports.parseHtml = parseHtml;
9
9
  const readability_1 = require("@mozilla/readability");
10
10
  const jsdom_1 = require("jsdom");
11
11
  const turndown_1 = __importDefault(require("turndown"));
12
+ const title_extractor_js_1 = require("./title-extractor.js");
12
13
  // ============================================
13
14
  // Turndown Service Configuration
14
15
  // ============================================
@@ -46,15 +47,16 @@ function createTurndownService() {
46
47
  * 1. HTML string → JSDOM (DOM creation)
47
48
  * 2. JSDOM → Readability (main content extraction, noise removal)
48
49
  * 3. Readability result → Turndown (Markdown conversion)
50
+ * 4. Title extracted separately via extractHtmlTitle (NOT prepended to content)
49
51
  *
50
52
  * @param html - Raw HTML string
51
53
  * @param url - Source URL (used for resolving relative links)
52
- * @returns Markdown string of extracted content
54
+ * @returns Object with content (markdown) and title (extracted separately)
53
55
  */
54
56
  async function parseHtml(html, url) {
55
57
  // Handle empty or whitespace-only HTML
56
58
  if (!html || html.trim().length === 0) {
57
- return '';
59
+ return { content: '', title: '' };
58
60
  }
59
61
  try {
60
62
  // Create DOM from HTML string
@@ -75,25 +77,32 @@ async function parseHtml(html, url) {
75
77
  // Try to get body content directly
76
78
  const bodyContent = document.body?.innerHTML || '';
77
79
  if (!bodyContent.trim()) {
78
- return '';
80
+ return { content: '', title: '' };
79
81
  }
80
82
  // Convert raw body HTML to Markdown
81
83
  const turndownService = createTurndownService();
82
- return turndownService.turndown(bodyContent).trim();
84
+ return { content: turndownService.turndown(bodyContent).trim(), title: '' };
83
85
  }
84
86
  // Convert extracted HTML content to Markdown
85
87
  const turndownService = createTurndownService();
86
88
  const markdown = turndownService.turndown(article.content);
87
- // Add title if available
88
- if (article.title) {
89
- return `# ${article.title}\n\n${markdown}`.trim();
89
+ // Extract title separately (NOT prepended to markdown content)
90
+ // Use URL-derived filename as fallback when Readability has no title
91
+ let urlFileName = '';
92
+ try {
93
+ urlFileName = new URL(url).pathname.split('/').filter(Boolean).pop() || '';
90
94
  }
91
- return markdown.trim();
95
+ catch {
96
+ // Non-URL string, empty fallback
97
+ }
98
+ const titleResult = (0, title_extractor_js_1.extractHtmlTitle)(article.title || '', urlFileName);
99
+ const title = titleResult.title;
100
+ return { content: markdown.trim(), title };
92
101
  }
93
102
  catch (error) {
94
- // Log error but don't throw - return empty string for graceful degradation
103
+ // Log error but don't throw - return empty values for graceful degradation
95
104
  console.error('Failed to parse HTML:', error);
96
- return '';
105
+ return { content: '', title: '' };
97
106
  }
98
107
  }
99
108
  //# sourceMappingURL=html-parser.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":";AAAA,6CAA6C;AAC7C,2DAA2D;;;;;AAiE3D,8BAoDC;AAnHD,sDAAkD;AAClD,iCAA6B;AAC7B,wDAAsC;AActC,+CAA+C;AAC/C,iCAAiC;AACjC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,qBAAqB;IAC5B,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK,EAAE,uBAAuB;QAC5C,cAAc,EAAE,QAAQ,EAAE,0BAA0B;QACpD,gBAAgB,EAAE,GAAG,EAAE,yBAAyB;QAChD,WAAW,EAAE,GAAG,EAAE,qBAAqB;QACvC,eAAe,EAAE,IAAI,EAAE,kBAAkB;KAC1C,CAAC,CAAA;IAEF,0BAA0B;IAC1B,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE;QACpC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;YAC9B,MAAM,OAAO,GAAG,IAAe,CAAA;YAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAA;YACjD,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAA;YACxE,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,IAAI,EAAE,CAAA;YACvE,OAAO,WAAW,QAAQ,KAAK,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,CAAA;QAC/D,CAAC;KACF,CAAC,CAAA;IAEF,OAAO,eAAe,CAAA;AACxB,CAAC;AAED,+CAA+C;AAC/C,cAAc;AACd,+CAA+C;AAE/C;;;;;;;;;;;GAWG;AACI,KAAK,UAAU,SAAS,CAAC,IAAY,EAAE,GAAW;IACvD,uCAAuC;IACvC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,CAAA;IACX,CAAC;IAED,IAAI,CAAC;QACH,8BAA8B;QAC9B,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE;YAC1B,GAAG;YACH,yCAAyC;YACzC,UAAU,EAAE,cAAc;SAC3B,CAAC,CAAA;QAEF,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAA;QAEpC,0CAA0C;QAC1C,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,QAAQ,EAAE;YACvC,WAAW,EAAE,KAAK;YAClB,KAAK,EAAE,KAAK;SACb,CAAC,CAAA;QAEF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAA8B,CAAA;QAE1D,kEAAkE;QAClE,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YACjC,mCAAmC;YACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,IAAI,EAAE,CAAA;YAClD,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;gBACxB,OAAO,EAAE,CAAA;YACX,CAAC;YAED,oCAAoC;YACpC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;YAC/C,OAAO,eAAe,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,CAAA;QACrD,CAAC;QAED,6CAA6C;QAC7C,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;QAC/C,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;QAE1D,yBAAyB;QACzB,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,OAAO,KAAK,OAAO,CAAC,KAAK,OAAO,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;QACnD,CAAC;QAED,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAA;IACxB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,2EAA2E;QAC3E,OAAO,CAAC,KAAK,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,OAAO,EAAE,CAAA;IACX,CAAC;AACH,CAAC"}
1
+ {"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":";AAAA,6CAA6C;AAC7C,2DAA2D;;;;;AAmE3D,8BA6DC;AA9HD,sDAAkD;AAClD,iCAA6B;AAC7B,wDAAsC;AACtC,6DAAuD;AAcvD,+CAA+C;AAC/C,iCAAiC;AACjC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,qBAAqB;IAC5B,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK,EAAE,uBAAuB;QAC5C,cAAc,EAAE,QAAQ,EAAE,0BAA0B;QACpD,gBAAgB,EAAE,GAAG,EAAE,yBAAyB;QAChD,WAAW,EAAE,GAAG,EAAE,qBAAqB;QACvC,eAAe,EAAE,IAAI,EAAE,kBAAkB;KAC1C,CAAC,CAAA;IAEF,0BAA0B;IAC1B,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE;QACpC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;YAC9B,MAAM,OAAO,GAAG,IAAe,CAAA;YAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAA;YACjD,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAA;YACxE,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,IAAI,EAAE,CAAA;YACvE,OAAO,WAAW,QAAQ,KAAK,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,CAAA;QAC/D,CAAC;KACF,CAAC,CAAA;IAEF,OAAO,eAAe,CAAA;AACxB,CAAC;AAED,+CAA+C;AAC/C,cAAc;AACd,+CAA+C;AAE/C;;;;;;;;;;;;GAYG;AACI,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,GAAW;IAEX,uCAAuC;IACvC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;IACnC,CAAC;IAED,IAAI,CAAC;QACH,8BAA8B;QAC9B,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE;YAC1B,GAAG;YACH,yCAAyC;YACzC,UAAU,EAAE,cAAc;SAC3B,CAAC,CAAA;QAEF,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAA;QAEpC,0CAA0C;QAC1C,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,QAAQ,EAAE;YACvC,WAAW,EAAE,KAAK;YAClB,KAAK,EAAE,KAAK;SACb,CAAC,CAAA;QAEF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAA8B,CAAA;QAE1D,kEAAkE;QAClE,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YACjC,mCAAmC;YACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,IAAI,EAAE,CAAA;YAClD,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;gBACxB,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;YACnC,CAAC;YAED,oCAAoC;YACpC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;YAC/C,OAAO,EAAE,OAAO,EAAE,eAAe,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QAC7E,CAAC;QAED,6CAA6C;QAC7C,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;QAC/C,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;QAE1D,+DAA+D;QAC/D,qEAAqE;QACrE,IAAI,WAAW,GAAG,EAAE,CAAA;QACpB,IAAI,CAAC;YACH,WAAW,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAA;QAC5E,CAAC;QAAC,MAAM,CAAC;YACP,iCAAiC;QACnC,CAAC;QACD,MAAM,WAAW,GAAG,IAAA,qCAAgB,EAAC,OAAO,CAAC,KAAK,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACtE,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,CAAA;QAE/B,OAAO,EAAE,OAAO,EAAE,QAAQ,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAA;IAC5C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,2EAA2E;QAC3E,OAAO,CAAC,KAAK,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;IACnC,CAAC;AACH,CAAC"}
@@ -1,4 +1,18 @@
1
1
  import { type EmbedderInterface } from './pdf-filter.js';
2
+ /**
3
+ * File extensions supported by the parser module (parseFile + parsePdf).
4
+ * Exported so other modules (e.g. list_files) stay in sync automatically
5
+ * when new formats are added here.
6
+ */
7
+ export declare const SUPPORTED_EXTENSIONS: Set<string>;
8
+ /**
9
+ * Result from parsing a document, containing both content and extracted title.
10
+ * Title is display-only metadata (NOT used for search scoring).
11
+ */
12
+ export interface ParseResult {
13
+ content: string;
14
+ title: string;
15
+ }
2
16
  /**
3
17
  * DocumentParser configuration
4
18
  */
@@ -54,11 +68,11 @@ export declare class DocumentParser {
54
68
  * File parsing (auto format detection)
55
69
  *
56
70
  * @param filePath - File path to parse
57
- * @returns Parsed text
71
+ * @returns ParseResult with content and extracted title
58
72
  * @throws ValidationError - Path traversal, size exceeded, unsupported format
59
73
  * @throws FileOperationError - File read failed, parse failed
60
74
  */
61
- parseFile(filePath: string): Promise<string>;
75
+ parseFile(filePath: string): Promise<ParseResult>;
62
76
  /**
63
77
  * PDF parsing with header/footer filtering
64
78
  *
@@ -66,18 +80,21 @@ export declare class DocumentParser {
66
80
  * - Extracts text with position information (x, y, fontSize)
67
81
  * - Semantic header/footer detection using embedding similarity
68
82
  * - Uses hasEOL for proper line break handling
83
+ * - Extracts document title from PDF metadata and first page font heuristic
69
84
  *
70
85
  * @param filePath - PDF file path
71
86
  * @param embedder - Embedder for semantic header/footer detection
72
- * @returns Parsed text with header/footer removed
87
+ * @returns ParseResult with content and extracted title
73
88
  * @throws FileOperationError - File read failed, parse failed
74
89
  */
75
- parsePdf(filePath: string, embedder: EmbedderInterface): Promise<string>;
90
+ parsePdf(filePath: string, embedder: EmbedderInterface): Promise<ParseResult>;
76
91
  /**
77
92
  * DOCX parsing (using mammoth)
78
93
  *
94
+ * Uses extractRawText for content and convertToHtml additionally for title detection.
95
+ *
79
96
  * @param filePath - DOCX file path
80
- * @returns Parsed text
97
+ * @returns ParseResult with content and extracted title
81
98
  * @throws FileOperationError - File read failed, parse failed
82
99
  */
83
100
  private parseDocx;
@@ -85,7 +102,7 @@ export declare class DocumentParser {
85
102
  * TXT parsing (using fs.readFile)
86
103
  *
87
104
  * @param filePath - TXT file path
88
- * @returns Parsed text
105
+ * @returns ParseResult with content and extracted title
89
106
  * @throws FileOperationError - File read failed
90
107
  */
91
108
  private parseTxt;
@@ -93,7 +110,7 @@ export declare class DocumentParser {
93
110
  * MD parsing (using fs.readFile)
94
111
  *
95
112
  * @param filePath - MD file path
96
- * @returns Parsed text
113
+ * @returns ParseResult with content and extracted title
97
114
  * @throws FileOperationError - File read failed
98
115
  */
99
116
  private parseMd;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,KAAK,iBAAiB,EAA8C,MAAM,iBAAiB,CAAA;AAMpG;;GAEG;AACH,UAAU,YAAY;IACpB,uCAAuC;IACvC,OAAO,EAAE,MAAM,CAAA;IACf,gCAAgC;IAChC,WAAW,EAAE,MAAM,CAAA;CACpB;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,KAAK;aAGb,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAED;;GAEG;AACH,qBAAa,kBAAmB,SAAQ,KAAK;aAGhB,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAMD;;;;;;;GAOG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,6FAA6F;IAC7F,OAAO,CAAC,eAAe,CAAsB;gBAEjC,MAAM,EAAE,YAAY;IAIhC;;;;;OAKG;IACG,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAiDvD;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI;IAgBxC;;;;;;;OAOG;IACG,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAmBlD;;;;;;;;;;;;OAYG;IACG,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,MAAM,CAAC;IA4C9E;;;;;;OAMG;YACW,SAAS;IAUvB;;;;;;OAMG;YACW,QAAQ;IAUtB;;;;;;OAMG;YACW,OAAO;CAStB"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,KAAK,iBAAiB,EAA8C,MAAM,iBAAiB,CAAA;AAYpG;;;;GAIG;AACH,eAAO,MAAM,oBAAoB,aAA4C,CAAA;AAM7E;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;GAEG;AACH,UAAU,YAAY;IACpB,uCAAuC;IACvC,OAAO,EAAE,MAAM,CAAA;IACf,gCAAgC;IAChC,WAAW,EAAE,MAAM,CAAA;CACpB;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,KAAK;aAGb,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAED;;GAEG;AACH,qBAAa,kBAAmB,SAAQ,KAAK;aAGhB,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAMD;;;;;;;GAOG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,6FAA6F;IAC7F,OAAO,CAAC,eAAe,CAAsB;gBAEjC,MAAM,EAAE,YAAY;IAIhC;;;;;OAKG;IACG,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAiDvD;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI;IAgBxC;;;;;;;OAOG;IACG,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAmBvD;;;;;;;;;;;;;OAaG;IACG,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;IAqEnF;;;;;;;;OAQG;YACW,SAAS;IAqBvB;;;;;;OAMG;YACW,QAAQ;IAYtB;;;;;;OAMG;YACW,OAAO;CAWtB"}
@@ -4,13 +4,24 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
4
4
  return (mod && mod.__esModule) ? mod : { "default": mod };
5
5
  };
6
6
  Object.defineProperty(exports, "__esModule", { value: true });
7
- exports.DocumentParser = exports.FileOperationError = exports.ValidationError = void 0;
7
+ exports.DocumentParser = exports.FileOperationError = exports.ValidationError = exports.SUPPORTED_EXTENSIONS = void 0;
8
8
  const node_fs_1 = require("node:fs");
9
9
  const promises_1 = require("node:fs/promises");
10
10
  const node_path_1 = require("node:path");
11
11
  const mammoth_1 = __importDefault(require("mammoth"));
12
12
  const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
13
+ const index_js_1 = require("../chunker/index.js");
13
14
  const pdf_filter_js_1 = require("./pdf-filter.js");
15
+ const title_extractor_js_1 = require("./title-extractor.js");
16
+ // ============================================
17
+ // Supported Extensions
18
+ // ============================================
19
+ /**
20
+ * File extensions supported by the parser module (parseFile + parsePdf).
21
+ * Exported so other modules (e.g. list_files) stay in sync automatically
22
+ * when new formats are added here.
23
+ */
24
+ exports.SUPPORTED_EXTENSIONS = new Set(['.pdf', '.docx', '.txt', '.md']);
14
25
  /**
15
26
  * Validation error (equivalent to 400)
16
27
  */
@@ -118,7 +129,7 @@ class DocumentParser {
118
129
  * File parsing (auto format detection)
119
130
  *
120
131
  * @param filePath - File path to parse
121
- * @returns Parsed text
132
+ * @returns ParseResult with content and extracted title
122
133
  * @throws ValidationError - Path traversal, size exceeded, unsupported format
123
134
  * @throws FileOperationError - File read failed, parse failed
124
135
  */
@@ -146,10 +157,11 @@ class DocumentParser {
146
157
  * - Extracts text with position information (x, y, fontSize)
147
158
  * - Semantic header/footer detection using embedding similarity
148
159
  * - Uses hasEOL for proper line break handling
160
+ * - Extracts document title from PDF metadata and first page font heuristic
149
161
  *
150
162
  * @param filePath - PDF file path
151
163
  * @param embedder - Embedder for semantic header/footer detection
152
- * @returns Parsed text with header/footer removed
164
+ * @returns ParseResult with content and extracted title
153
165
  * @throws FileOperationError - File read failed, parse failed
154
166
  */
155
167
  async parsePdf(filePath, embedder) {
@@ -163,6 +175,9 @@ class DocumentParser {
163
175
  useSystemFonts: true,
164
176
  isEvalSupported: false,
165
177
  }).promise;
178
+ // Extract metadata for title extraction
179
+ const metadata = await pdf.getMetadata();
180
+ const metadataTitle = metadata?.info?.['Title'];
166
181
  // Extract text with position information from each page
167
182
  const pages = [];
168
183
  for (let i = 1; i <= pdf.numPages; i++) {
@@ -179,11 +194,30 @@ class DocumentParser {
179
194
  }));
180
195
  pages.push({ pageNum: i, items });
181
196
  }
182
- // Apply sentence-level header/footer filtering
197
+ // Apply sentence-level header/footer filtering (returns per-page filtered text)
183
198
  // This handles variable content like page numbers ("7 of 75") using semantic similarity
184
- const text = await (0, pdf_filter_js_1.filterPageBoundarySentences)(pages, embedder);
199
+ const filteredPages = await (0, pdf_filter_js_1.filterPageBoundarySentences)(pages, embedder);
200
+ const text = filteredPages.filter((t) => t.length > 0).join('\n\n');
201
+ // Extract title from filtered page 1 via semantic chunking
202
+ // Isolated try-catch: title extraction failure should not abort PDF ingestion
203
+ const fileName = (0, node_path_1.basename)(filePath);
204
+ let firstPageChunkText;
205
+ try {
206
+ const filteredPage1 = filteredPages[0];
207
+ if (filteredPage1 && filteredPage1.trim().length > 0) {
208
+ const chunker = new index_js_1.SemanticChunker();
209
+ const page1Chunks = await chunker.chunkText(filteredPage1, embedder);
210
+ if (page1Chunks.length > 0) {
211
+ firstPageChunkText = page1Chunks[0].text;
212
+ }
213
+ }
214
+ }
215
+ catch (titleError) {
216
+ console.error(`Title extraction failed, falling back to filename: ${titleError}`);
217
+ }
218
+ const titleResult = (0, title_extractor_js_1.extractPdfTitle)(metadataTitle, firstPageChunkText, fileName);
185
219
  console.error(`Parsed PDF: ${filePath} (${text.length} characters, ${pdf.numPages} pages)`);
186
- return text;
220
+ return { content: text, title: titleResult.title };
187
221
  }
188
222
  catch (error) {
189
223
  throw new FileOperationError(`Failed to parse PDF: ${filePath}`, error);
@@ -192,15 +226,25 @@ class DocumentParser {
192
226
  /**
193
227
  * DOCX parsing (using mammoth)
194
228
  *
229
+ * Uses extractRawText for content and convertToHtml additionally for title detection.
230
+ *
195
231
  * @param filePath - DOCX file path
196
- * @returns Parsed text
232
+ * @returns ParseResult with content and extracted title
197
233
  * @throws FileOperationError - File read failed, parse failed
198
234
  */
199
235
  async parseDocx(filePath) {
200
236
  try {
201
- const result = await mammoth_1.default.extractRawText({ path: filePath });
202
- console.error(`Parsed DOCX: ${filePath} (${result.value.length} characters)`);
203
- return result.value;
237
+ // Read file once and pass buffer to both mammoth calls
238
+ const buffer = await (0, promises_1.readFile)(filePath);
239
+ // Use extractRawText for content (unchanged behavior)
240
+ const result = await mammoth_1.default.extractRawText({ buffer });
241
+ const rawText = result.value;
242
+ // Use convertToHtml additionally for title extraction (first <h1>)
243
+ const htmlResult = await mammoth_1.default.convertToHtml({ buffer });
244
+ const fileName = (0, node_path_1.basename)(filePath);
245
+ const titleResult = (0, title_extractor_js_1.extractDocxTitle)(htmlResult.value, fileName);
246
+ console.error(`Parsed DOCX: ${filePath} (${rawText.length} characters)`);
247
+ return { content: rawText, title: titleResult.title };
204
248
  }
205
249
  catch (error) {
206
250
  throw new FileOperationError(`Failed to parse DOCX: ${filePath}`, error);
@@ -210,14 +254,16 @@ class DocumentParser {
210
254
  * TXT parsing (using fs.readFile)
211
255
  *
212
256
  * @param filePath - TXT file path
213
- * @returns Parsed text
257
+ * @returns ParseResult with content and extracted title
214
258
  * @throws FileOperationError - File read failed
215
259
  */
216
260
  async parseTxt(filePath) {
217
261
  try {
218
262
  const text = await (0, promises_1.readFile)(filePath, 'utf-8');
263
+ const fileName = (0, node_path_1.basename)(filePath);
264
+ const titleResult = (0, title_extractor_js_1.extractTxtTitle)(text, fileName);
219
265
  console.error(`Parsed TXT: ${filePath} (${text.length} characters)`);
220
- return text;
266
+ return { content: text, title: titleResult.title };
221
267
  }
222
268
  catch (error) {
223
269
  throw new FileOperationError(`Failed to parse TXT: ${filePath}`, error);
@@ -227,14 +273,16 @@ class DocumentParser {
227
273
  * MD parsing (using fs.readFile)
228
274
  *
229
275
  * @param filePath - MD file path
230
- * @returns Parsed text
276
+ * @returns ParseResult with content and extracted title
231
277
  * @throws FileOperationError - File read failed
232
278
  */
233
279
  async parseMd(filePath) {
234
280
  try {
235
281
  const text = await (0, promises_1.readFile)(filePath, 'utf-8');
282
+ const fileName = (0, node_path_1.basename)(filePath);
283
+ const titleResult = (0, title_extractor_js_1.extractMarkdownTitle)(text, fileName);
236
284
  console.error(`Parsed MD: ${filePath} (${text.length} characters)`);
237
- return text;
285
+ return { content: text, title: titleResult.title };
238
286
  }
239
287
  catch (error) {
240
288
  throw new FileOperationError(`Failed to parse MD: ${filePath}`, error);
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":";AAAA,6DAA6D;;;;;;AAE7D,qCAAkC;AAClC,+CAA4D;AAC5D,yCAA6D;AAC7D,sDAA6B;AAC7B,6DAA6D;AAE7D,mDAAoG;AAgBpG;;GAEG;AACH,MAAa,eAAgB,SAAQ,KAAK;IACxC,YACE,OAAe,EACU,KAAa;QAEtC,KAAK,CAAC,OAAO,CAAC,CAAA;QAFW,UAAK,GAAL,KAAK,CAAQ;QAGtC,IAAI,CAAC,IAAI,GAAG,iBAAiB,CAAA;IAC/B,CAAC;CACF;AARD,0CAQC;AAED;;GAEG;AACH,MAAa,kBAAmB,SAAQ,KAAK;IAC3C,YACE,OAAe,EACU,KAAa;QAEtC,KAAK,CAAC,OAAO,CAAC,CAAA;QAFW,UAAK,GAAL,KAAK,CAAQ;QAGtC,IAAI,CAAC,IAAI,GAAG,oBAAoB,CAAA;IAClC,CAAC;CACF;AARD,gDAQC;AAED,+CAA+C;AAC/C,uBAAuB;AACvB,+CAA+C;AAE/C;;;;;;;GAOG;AACH,MAAa,cAAc;IAKzB,YAAY,MAAoB;QAHhC,6FAA6F;QACrF,oBAAe,GAAkB,IAAI,CAAA;QAG3C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAA;IACtB,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,gBAAgB,CAAC,QAAgB;QACrC,wDAAwD;QACxD,IAAI,CAAC,IAAA,sBAAU,EAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,eAAe,CACvB,8CAA8C,QAAQ,qDAAqD,CAC5G,CAAA;QACH,CAAC;QAED,oEAAoE;QACpE,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,MAAM,IAAA,mBAAQ,EAAC,IAAA,mBAAO,EAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAA;YAC7D,uDAAuD;YACvD,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC,QAAQ,CAAC,eAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,GAAG,eAAG,CAAA;QAC3E,CAAC;QAED,uDAAuD;QACvD,IAAI,YAAoB,CAAA;QACxB,IAAI,CAAC;YACH,YAAY,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;QACzC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,sDAAsD;YACtD,iEAAiE;YACjE,oEAAoE;YACpE,4EAA4E;YAC5E,MAAM,SAAS,GAAG,MAAM,IAAA,gBAAK,EAAC,QAAQ,CAAC;iBACpC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,cAAc,EAAE,CAAC;iBACvC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAA;YAErB,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,IAAI,eAAe,CACvB,6BAA6B,QAAQ,kDAAkD,EACvF,KAAc,CACf,CAAA;YACH,CAAC;YAED,0EAA0E;YAC1E,kFAAkF;YAClF,2FAA2F;YAC3F,YAAY,GAAG,IAAA,mBAAO,EAAC,QAAQ,CAAC,CAAA;QAClC,CAAC;QAED,4CAA4C;QAC5C,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,eAAe,CACvB,sCAAsC,IAAI,CAAC,eAAe,sCAAsC,QAAQ,EAAE,CAC3G,CAAA;QACH,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAgB;QAC/B,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,IAAA,kBAAQ,EAAC,QAAQ,CAAC,CAAA;YAChC,IAAI,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;gBACzC,MAAM,IAAI,eAAe,CACvB,4BAA4B,KAAK,CAAC,IAAI,MAAM,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CACtE,CAAA;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,KAAK,YAAY,eAAe,EAAE,CAAC;gBACrC,MAAM,KAAK,CAAA;YACb,CAAC;YACD,MAAM,IAAI,kBAAkB,CAAC,8BAA8B,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACxF,CAAC;IACH,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,QAAgB;QAC9B,aAAa;QACb,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QACrC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QAE/B,gDAAgD;QAChD,MAAM,GAAG,GAAG,IAAA,mBAAO,EAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAA;QAC3C,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,OAAO;gBACV,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAA;YACvC,KAAK,MAAM;gBACT,OAAO,MAAM,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAA;YACtC,KAAK,KAAK;gBACR,OAAO,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAA;YACrC;gBACE,MAAM,IAAI,eAAe,CAAC,4BAA4B,GAAG,EAAE,CAAC,CAAA;QAChE,CAAC;IACH,CAAC;IAED;;;;;;;;;;;;OAYG;IACH,KAAK,CAAC,QAAQ,CAAC,QAAgB,EAAE,QAA2B;QAC1D,aAAa;QACb,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QACrC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QAE/B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;YACvC,MAAM,GAAG,GAAG,MAAM,IAAA,qBAAW,EAAC;gBAC5B,IAAI,EAAE,IAAI,UAAU,CAAC,MAAM,CAAC;gBAC5B,cAAc,EAAE,IAAI;gBACpB,eAAe,EAAE,KAAK;aACvB,CAAC,CAAC,OAAO,CAAA;YAEV,wDAAwD;YACxD,MAAM,KAAK,GAAe,EAAE,CAAA;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;gBACjC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAA;gBAE/C,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK;qBAC5B,MAAM,CAAC,CAAC,IAAI,EAAoB,EAAE,CAAC,KAAK,IAAI,IAAI,CAAC;qBACjD,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;oBACd,IAAI,EAAE,IAAI,CAAC,GAAG;oBACd,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;oBACpB,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;oBACpB,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBACrC,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,KAAK;iBAC7B,CAAC,CAAC,CAAA;gBAEL,KAAK,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;YACnC,CAAC;YAED,+CAA+C;YAC/C,wFAAwF;YACxF,MAAM,IAAI,GAAG,MAAM,IAAA,2CAA2B,EAAC,KAAK,EAAE,QAAQ,CAAC,CAAA;YAE/D,OAAO,CAAC,KAAK,CAAC,eAAe,QAAQ,KAAK,IAAI,CAAC,MAAM,gBAAgB,GAAG,CAAC,QAAQ,SAAS,CAAC,CAAA;YAE3F,OAAO,IAAI,CAAA;QACb,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QAClF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,SAAS,CAAC,QAAgB;QACtC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,iBAAO,CAAC,cAAc,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAA;YAC/D,OAAO,CAAC,KAAK,CAAC,gBAAgB,QAAQ,KAAK,MAAM,CAAC,KAAK,CAAC,MAAM,cAAc,CAAC,CAAA;YAC7E,OAAO,MAAM,CAAC,KAAK,CAAA;QACrB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,yBAAyB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACnF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,QAAQ,CAAC,QAAgB;QACrC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;YAC9C,OAAO,CAAC,KAAK,CAAC,eAAe,QAAQ,KAAK,IAAI,CAAC,MAAM,cAAc,CAAC,CAAA;YACpE,OAAO,IAAI,CAAA;QACb,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QAClF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,OAAO,CAAC,QAAgB;QACpC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;YAC9C,OAAO,CAAC,KAAK,CAAC,cAAc,QAAQ,KAAK,IAAI,CAAC,MAAM,cAAc,CAAC,CAAA;YACnE,OAAO,IAAI,CAAA;QACb,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,uBAAuB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACjF,CAAC;IACH,CAAC;CACF;AA7ND,wCA6NC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":";AAAA,6DAA6D;;;;;;AAE7D,qCAAkC;AAClC,+CAA4D;AAC5D,yCAAuE;AACvE,sDAA6B;AAC7B,6DAA6D;AAE7D,kDAAqD;AACrD,mDAAoG;AACpG,6DAK6B;AAE7B,+CAA+C;AAC/C,uBAAuB;AACvB,+CAA+C;AAE/C;;;;GAIG;AACU,QAAA,oBAAoB,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAA;AAyB7E;;GAEG;AACH,MAAa,eAAgB,SAAQ,KAAK;IACxC,YACE,OAAe,EACU,KAAa;QAEtC,KAAK,CAAC,OAAO,CAAC,CAAA;QAFW,UAAK,GAAL,KAAK,CAAQ;QAGtC,IAAI,CAAC,IAAI,GAAG,iBAAiB,CAAA;IAC/B,CAAC;CACF;AARD,0CAQC;AAED;;GAEG;AACH,MAAa,kBAAmB,SAAQ,KAAK;IAC3C,YACE,OAAe,EACU,KAAa;QAEtC,KAAK,CAAC,OAAO,CAAC,CAAA;QAFW,UAAK,GAAL,KAAK,CAAQ;QAGtC,IAAI,CAAC,IAAI,GAAG,oBAAoB,CAAA;IAClC,CAAC;CACF;AARD,gDAQC;AAED,+CAA+C;AAC/C,uBAAuB;AACvB,+CAA+C;AAE/C;;;;;;;GAOG;AACH,MAAa,cAAc;IAKzB,YAAY,MAAoB;QAHhC,6FAA6F;QACrF,oBAAe,GAAkB,IAAI,CAAA;QAG3C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAA;IACtB,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,gBAAgB,CAAC,QAAgB;QACrC,wDAAwD;QACxD,IAAI,CAAC,IAAA,sBAAU,EAAC,QAAQ,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,eAAe,CACvB,8CAA8C,QAAQ,qDAAqD,CAC5G,CAAA;QACH,CAAC;QAED,oEAAoE;QACpE,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,MAAM,IAAA,mBAAQ,EAAC,IAAA,mBAAO,EAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAA;YAC7D,uDAAuD;YACvD,IAAI,CAAC,eAAe,GAAG,QAAQ,CAAC,QAAQ,CAAC,eAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,GAAG,eAAG,CAAA;QAC3E,CAAC;QAED,uDAAuD;QACvD,IAAI,YAAoB,CAAA;QACxB,IAAI,CAAC;YACH,YAAY,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;QACzC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,sDAAsD;YACtD,iEAAiE;YACjE,oEAAoE;YACpE,4EAA4E;YAC5E,MAAM,SAAS,GAAG,MAAM,IAAA,gBAAK,EAAC,QAAQ,CAAC;iBACpC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,cAAc,EAAE,CAAC;iBACvC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAA;YAErB,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,IAAI,eAAe,CACvB,6BAA6B,QAAQ,kDAAkD,EACvF,KAAc,CACf,CAAA;YACH,CAAC;YAED,0EAA0E;YAC1E,kFAAkF;YAClF,2FAA2F;YAC3F,YAAY,GAAG,IAAA,mBAAO,EAAC,QAAQ,CAAC,CAAA;QAClC,CAAC;QAED,4CAA4C;QAC5C,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,eAAe,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,eAAe,CACvB,sCAAsC,IAAI,CAAC,eAAe,sCAAsC,QAAQ,EAAE,CAC3G,CAAA;QACH,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAgB;QAC/B,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,IAAA,kBAAQ,EAAC,QAAQ,CAAC,CAAA;YAChC,IAAI,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;gBACzC,MAAM,IAAI,eAAe,CACvB,4BAA4B,KAAK,CAAC,IAAI,MAAM,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CACtE,CAAA;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,KAAK,YAAY,eAAe,EAAE,CAAC;gBACrC,MAAM,KAAK,CAAA;YACb,CAAC;YACD,MAAM,IAAI,kBAAkB,CAAC,8BAA8B,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACxF,CAAC;IACH,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,SAAS,CAAC,QAAgB;QAC9B,aAAa;QACb,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QACrC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QAE/B,gDAAgD;QAChD,MAAM,GAAG,GAAG,IAAA,mBAAO,EAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAA;QAC3C,QAAQ,GAAG,EAAE,CAAC;YACZ,KAAK,OAAO;gBACV,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAA;YACvC,KAAK,MAAM;gBACT,OAAO,MAAM,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAA;YACtC,KAAK,KAAK;gBACR,OAAO,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAA;YACrC;gBACE,MAAM,IAAI,eAAe,CAAC,4BAA4B,GAAG,EAAE,CAAC,CAAA;QAChE,CAAC;IACH,CAAC;IAED;;;;;;;;;;;;;OAaG;IACH,KAAK,CAAC,QAAQ,CAAC,QAAgB,EAAE,QAA2B;QAC1D,aAAa;QACb,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QACrC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAA;QAE/B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;YACvC,MAAM,GAAG,GAAG,MAAM,IAAA,qBAAW,EAAC;gBAC5B,IAAI,EAAE,IAAI,UAAU,CAAC,MAAM,CAAC;gBAC5B,cAAc,EAAE,IAAI;gBACpB,eAAe,EAAE,KAAK;aACvB,CAAC,CAAC,OAAO,CAAA;YAEV,wCAAwC;YACxC,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAA;YACxC,MAAM,aAAa,GAAI,QAAQ,EAAE,IAAgC,EAAE,CAAC,OAAO,CAE9D,CAAA;YAEb,wDAAwD;YACxD,MAAM,KAAK,GAAe,EAAE,CAAA;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;gBACjC,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAA;gBAE/C,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK;qBAC5B,MAAM,CAAC,CAAC,IAAI,EAAoB,EAAE,CAAC,KAAK,IAAI,IAAI,CAAC;qBACjD,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;oBACd,IAAI,EAAE,IAAI,CAAC,GAAG;oBACd,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;oBACpB,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC;oBACpB,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBACrC,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,KAAK;iBAC7B,CAAC,CAAC,CAAA;gBAEL,KAAK,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;YACnC,CAAC;YAED,gFAAgF;YAChF,wFAAwF;YACxF,MAAM,aAAa,GAAG,MAAM,IAAA,2CAA2B,EAAC,KAAK,EAAE,QAAQ,CAAC,CAAA;YACxE,MAAM,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;YAEnE,2DAA2D;YAC3D,8EAA8E;YAC9E,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,IAAI,kBAAsC,CAAA;YAC1C,IAAI,CAAC;gBACH,MAAM,aAAa,GAAG,aAAa,CAAC,CAAC,CAAC,CAAA;gBACtC,IAAI,aAAa,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACrD,MAAM,OAAO,GAAG,IAAI,0BAAe,EAAE,CAAA;oBACrC,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,SAAS,CAAC,aAAa,EAAE,QAAQ,CAAC,CAAA;oBACpE,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC3B,kBAAkB,GAAI,WAAW,CAAC,CAAC,CAAsB,CAAC,IAAI,CAAA;oBAChE,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,UAAU,EAAE,CAAC;gBACpB,OAAO,CAAC,KAAK,CAAC,sDAAsD,UAAU,EAAE,CAAC,CAAA;YACnF,CAAC;YACD,MAAM,WAAW,GAAG,IAAA,oCAAe,EAAC,aAAa,EAAE,kBAAkB,EAAE,QAAQ,CAAC,CAAA;YAEhF,OAAO,CAAC,KAAK,CAAC,eAAe,QAAQ,KAAK,IAAI,CAAC,MAAM,gBAAgB,GAAG,CAAC,QAAQ,SAAS,CAAC,CAAA;YAE3F,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACpD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QAClF,CAAC;IACH,CAAC;IAED;;;;;;;;OAQG;IACK,KAAK,CAAC,SAAS,CAAC,QAAgB;QACtC,IAAI,CAAC;YACH,uDAAuD;YACvD,MAAM,MAAM,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAA;YAEvC,sDAAsD;YACtD,MAAM,MAAM,GAAG,MAAM,iBAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC,CAAA;YACvD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAA;YAE5B,mEAAmE;YACnE,MAAM,UAAU,GAAG,MAAM,iBAAO,CAAC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC,CAAA;YAC1D,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,MAAM,WAAW,GAAG,IAAA,qCAAgB,EAAC,UAAU,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAA;YAEhE,OAAO,CAAC,KAAK,CAAC,gBAAgB,QAAQ,KAAK,OAAO,CAAC,MAAM,cAAc,CAAC,CAAA;YACxE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,yBAAyB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACnF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,QAAQ,CAAC,QAAgB;QACrC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;YAC9C,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,MAAM,WAAW,GAAG,IAAA,oCAAe,EAAC,IAAI,EAAE,QAAQ,CAAC,CAAA;YACnD,OAAO,CAAC,KAAK,CAAC,eAAe,QAAQ,KAAK,IAAI,CAAC,MAAM,cAAc,CAAC,CAAA;YACpE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACpD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QAClF,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACK,KAAK,CAAC,OAAO,CAAC,QAAgB;QACpC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;YAC9C,MAAM,QAAQ,GAAG,IAAA,oBAAQ,EAAC,QAAQ,CAAC,CAAA;YACnC,MAAM,WAAW,GAAG,IAAA,yCAAoB,EAAC,IAAI,EAAE,QAAQ,CAAC,CAAA;YACxD,OAAO,CAAC,KAAK,CAAC,cAAc,QAAQ,KAAK,IAAI,CAAC,MAAM,cAAc,CAAC,CAAA;YACnE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAA;QACpD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,kBAAkB,CAAC,uBAAuB,QAAQ,EAAE,EAAE,KAAc,CAAC,CAAA;QACjF,CAAC;IACH,CAAC;CACF;AAxQD,wCAwQC"}
@@ -71,17 +71,18 @@ interface SentencePatternResult {
71
71
  */
72
72
  export declare function detectSentencePatterns(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<SentencePatternResult>;
73
73
  /**
74
- * Filter page boundary sentences and join into text
74
+ * Filter page boundary sentences and return per-page filtered text
75
75
  *
76
76
  * This is the main entry point for sentence-level header/footer filtering.
77
77
  * It detects and removes repeating sentence patterns at page boundaries.
78
+ * Returns an array of filtered text per page, preserving page boundaries.
78
79
  *
79
80
  * Use this instead of joinFilteredPages when embedder is available.
80
81
  *
81
82
  * @param pages - Array of page data
82
83
  * @param embedder - Embedder for generating embeddings
83
84
  * @param config - Configuration options
84
- * @returns Filtered text with header/footer sentences removed
85
+ * @returns Array of filtered text strings, one per page
85
86
  */
86
- export declare function filterPageBoundarySentences(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<string>;
87
+ export declare function filterPageBoundarySentences(pages: PageData[], embedder: EmbedderInterface, config?: Partial<SentencePatternConfig>): Promise<string[]>;
87
88
  //# sourceMappingURL=pdf-filter.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"pdf-filter.d.ts","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAA;AAIvE,YAAY,EAAE,iBAAiB,EAAE,CAAA;AAMjC;;GAEG;AACH,UAAU,oBAAoB;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,CAAC,EAAE,MAAM,CAAA;IACT,CAAC,EAAE,MAAM,CAAA;IACT,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,OAAO,CAAA;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,oBAAoB,EAAE,CAAA;CAC9B;AAoCD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,MAAM,CAK3D;AAoLD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,iEAAiE;IACjE,mBAAmB,EAAE,MAAM,CAAA;IAC3B,gEAAgE;IAChE,QAAQ,EAAE,MAAM,CAAA;IAChB,+EAA+E;IAC/E,WAAW,EAAE,MAAM,CAAA;CACpB;AASD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,qEAAqE;IACrE,mBAAmB,EAAE,OAAO,CAAA;IAC5B,oEAAoE;IACpE,kBAAkB,EAAE,OAAO,CAAA;IAC3B,2CAA2C;IAC3C,gBAAgB,EAAE,MAAM,CAAA;IACxB,0CAA0C;IAC1C,gBAAgB,EAAE,MAAM,CAAA;CACzB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,sBAAsB,CAC1C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,qBAAqB,CAAC,CAsEhC;AAED;;;;;;;;;;;;GAYG;AACH,wBAAsB,2BAA2B,CAC/C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,MAAM,CAAC,CAyCjB"}
1
+ {"version":3,"file":"pdf-filter.d.ts","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,gCAAgC,CAAA;AAIvE,YAAY,EAAE,iBAAiB,EAAE,CAAA;AAMjC;;GAEG;AACH,UAAU,oBAAoB;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,CAAC,EAAE,MAAM,CAAA;IACT,CAAC,EAAE,MAAM,CAAA;IACT,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,OAAO,CAAA;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,oBAAoB,EAAE,CAAA;CAC9B;AAoCD;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,MAAM,CAK3D;AAoLD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,iEAAiE;IACjE,mBAAmB,EAAE,MAAM,CAAA;IAC3B,gEAAgE;IAChE,QAAQ,EAAE,MAAM,CAAA;IAChB,+EAA+E;IAC/E,WAAW,EAAE,MAAM,CAAA;CACpB;AASD;;GAEG;AACH,UAAU,qBAAqB;IAC7B,qEAAqE;IACrE,mBAAmB,EAAE,OAAO,CAAA;IAC5B,oEAAoE;IACpE,kBAAkB,EAAE,OAAO,CAAA;IAC3B,2CAA2C;IAC3C,gBAAgB,EAAE,MAAM,CAAA;IACxB,0CAA0C;IAC1C,gBAAgB,EAAE,MAAM,CAAA;CACzB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,sBAAsB,CAC1C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,qBAAqB,CAAC,CAsEhC;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,2BAA2B,CAC/C,KAAK,EAAE,QAAQ,EAAE,EACjB,QAAQ,EAAE,iBAAiB,EAC3B,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM,GAC1C,OAAO,CAAC,MAAM,EAAE,CAAC,CAsCnB"}
@@ -277,29 +277,30 @@ async function detectSentencePatterns(pages, embedder, config = {}) {
277
277
  return result;
278
278
  }
279
279
  /**
280
- * Filter page boundary sentences and join into text
280
+ * Filter page boundary sentences and return per-page filtered text
281
281
  *
282
282
  * This is the main entry point for sentence-level header/footer filtering.
283
283
  * It detects and removes repeating sentence patterns at page boundaries.
284
+ * Returns an array of filtered text per page, preserving page boundaries.
284
285
  *
285
286
  * Use this instead of joinFilteredPages when embedder is available.
286
287
  *
287
288
  * @param pages - Array of page data
288
289
  * @param embedder - Embedder for generating embeddings
289
290
  * @param config - Configuration options
290
- * @returns Filtered text with header/footer sentences removed
291
+ * @returns Array of filtered text strings, one per page
291
292
  */
292
293
  async function filterPageBoundarySentences(pages, embedder, config = {}) {
293
294
  const cfg = { ...DEFAULT_SENTENCE_PATTERN_CONFIG, ...config };
294
295
  // Need minimum pages to detect patterns
295
296
  if (pages.length < cfg.minPages) {
296
- return joinFilteredPages(pages);
297
+ return pages.map((page) => joinFilteredPages([page]));
297
298
  }
298
299
  // Detect patterns
299
300
  const patterns = await detectSentencePatterns(pages, embedder, cfg);
300
- // If no patterns detected, return normally joined text
301
+ // If no patterns detected, return normally joined text per page
301
302
  if (!patterns.removeFirstSentence && !patterns.removeLastSentence) {
302
- return joinFilteredPages(pages);
303
+ return pages.map((page) => joinFilteredPages([page]));
303
304
  }
304
305
  // Split each page into sentences with Y coordinate (merged by Y)
305
306
  const pageSentences = pages.map((page) => splitItemsIntoSentencesWithY(page.items));
@@ -314,10 +315,7 @@ async function filterPageBoundarySentences(pages, embedder, config = {}) {
314
315
  }
315
316
  return cleaned;
316
317
  });
317
- // Join back into final text
318
- return cleanedPageSentences
319
- .map((sentences) => sentences.map((s) => s.text).join(' '))
320
- .filter((text) => text.length > 0)
321
- .join('\n\n');
318
+ // Return per-page filtered text
319
+ return cleanedPageSentences.map((sentences) => sentences.map((s) => s.text).join(' '));
322
320
  }
323
321
  //# sourceMappingURL=pdf-filter.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"pdf-filter.js","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":";AAAA,2BAA2B;AAC3B,wDAAwD;AACxD,uEAAuE;;AAuEvE,8CAKC;AA0OD,wDA0EC;AAeD,kEA6CC;AAzbD,0EAAoE;AA4BpE,+CAA+C;AAC/C,eAAe;AACf,+CAA+C;AAE/C;;;;;;GAMG;AACH,SAAS,aAAa,CAAC,KAA6B;IAClD,6DAA6D;IAC7D,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkC,CAAA;IACzD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QAClC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAChB,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAA;IACvB,CAAC;IAED,oFAAoF;IACpF,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAClB,KAAK;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SAClB,IAAI,CAAC,GAAG,CAAC,CACb;SACA,IAAI,CAAC,IAAI,CAAC;SACV,IAAI,EAAE,CAAA;AACX,CAAC;AAED;;;;;GAKG;AACH,SAAgB,iBAAiB,CAAC,KAAiB;IACjD,OAAO,KAAK;SACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;SACxC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;SACjC,IAAI,CAAC,MAAM,CAAC,CAAA;AACjB,CAAC;AAcD;;;;;;;;;;GAUG;AACH,SAAS,4BAA4B,CAAC,KAA6B;IACjE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAEjC,+DAA+D;IAC/D,MAAM,WAAW,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC3C,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QACvB,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAA;QACrC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;IAClB,CAAC,CAAC,CAAA;IAEF,2DAA2D;IAC3D,MAAM,UAAU,GAAyD,EAAE,CAAA;IAC3E,IAAI,QAAQ,GAAG,EAAE,CAAA;IACjB,IAAI,KAAK,GAAkB,IAAI,CAAA;IAE/B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC/B,4DAA4D;QAC5D,6EAA6E;QAC7E,IAAI,KAAK,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACnD,QAAQ,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAA;QACtC,CAAC;QAED,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAA;QACjD,QAAQ,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,CAAA;QAC3B,KAAK,GAAG,IAAI,CAAC,CAAC,CAAA;IAChB,CAAC;IAED,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,QAAQ,CAAC,CAAA;IAE9C,kEAAkE;IAClE,MAAM,cAAc,GAAoB,EAAE,CAAA;IAC1C,IAAI,WAAW,GAAG,CAAC,CAAA;IAEnB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,8CAA8C;QAC9C,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACpE,IAAI,aAAa,KAAK,CAAC,CAAC;YAAE,SAAQ;QAElC,4CAA4C;QAC5C,IAAI,UAAU,GAAG,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC3B,IAAI,KAAK,IAAI,KAAK,CAAC,KAAK,IAAI,aAAa,EAAE,CAAC;gBAC1C,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;gBACrC,MAAK;YACP,CAAC;QACH,CAAC;QAED,cAAc,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAA;QACtD,WAAW,GAAG,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAA;IAC/C,CAAC;IAED,yCAAyC;IACzC,OAAO,iBAAiB,CAAC,cAAc,CAAC,CAAA;AAC1C,CAAC;AAED;;;;;GAKG;AACH,SAAS,iBAAiB,CAAC,SAA0B;IACnD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAErC,MAAM,MAAM,GAAoB,EAAE,CAAA;IAClC,IAAI,OAAO,GAAyB,IAAI,CAAA;IAExC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACrB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;aAAM,IAAI,OAAO,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,EAAE,CAAC;YACpC,qBAAqB;YACrB,OAAO,CAAC,IAAI,IAAI,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAA;QACrC,CAAC;aAAM,CAAC;YACN,0CAA0C;YAC1C,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACpB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;QACrB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACtB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,yCAAyC;AACzC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAc,EAAE,IAAc;IACtD,IAAI,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,CAAC,CAAA;IACV,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;QACvB,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;QACvB,UAAU,IAAI,EAAE,GAAG,EAAE,CAAA;QACrB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;QAChB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;IAClB,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACvD,IAAI,WAAW,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAE/B,OAAO,UAAU,GAAG,WAAW,CAAA;AACjC,CAAC;AAED;;;;;;GAMG;AACH,SAAS,wBAAwB,CAAC,UAAsB;IACtD,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,GAAG,CAAA;IAErC,MAAM,YAAY,GAAa,EAAE,CAAA;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,IAAI,IAAI,IAAI,IAAI,EAAE,CAAC;gBACjB,YAAY,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;YACjD,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAEvC,uBAAuB;IACvB,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAE/C,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QAClC,qCAAqC;QACrC,OAAO,CAAC,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACtE,CAAC;IACD,oBAAoB;IACpB,OAAO,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;AAC/B,CAAC;AAcD,iEAAiE;AACjE,MAAM,+BAA+B,GAA0B;IAC7D,mBAAmB,EAAE,IAAI;IACzB,QAAQ,EAAE,CAAC;IACX,WAAW,EAAE,CAAC;CACf,CAAA;AAgBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACI,KAAK,UAAU,sBAAsB,CAC1C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,MAAM,MAAM,GAA0B;QACpC,mBAAmB,EAAE,KAAK;QAC1B,kBAAkB,EAAE,KAAK;QACzB,gBAAgB,EAAE,CAAC;QACnB,gBAAgB,EAAE,CAAC;KACpB,CAAA;IAED,iDAAiD;IACjD,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,MAAM,CAAA;IACf,CAAC;IAED,kDAAkD;IAClD,uEAAuE;IACvE,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAChD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,CAAA;IAClD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC,CAAA;IACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,GAAG,GAAG,CAAC,WAAW,CAAC,CAAA;IACrE,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAA;IAErD,oEAAoE;IACpE,MAAM,aAAa,GAAsB,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAChE,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,yDAAyD;IACzD,MAAM,cAAc,GAAa,EAAE,CAAA;IACnC,MAAM,aAAa,GAAa,EAAE,CAAA;IAElC,KAAK,MAAM,SAAS,IAAI,aAAa,EAAE,CAAC;QACtC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YACvC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,aAAa,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC3D,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,IAAI,cAAc,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,cAAc,CAAC,CAAA;QAC5D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,mBAAmB,GAAG,IAAI,CAAA;YACjC,OAAO,CAAC,KAAK,CACX,qCAAqC,cAAc,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACtJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,6EAA6E;IAC7E,IAAI,aAAa,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;QAC3D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,kBAAkB,GAAG,IAAI,CAAA;YAChC,OAAO,CAAC,KAAK,CACX,qCAAqC,aAAa,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACrJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;;;;;;GAYG;AACI,KAAK,UAAU,2BAA2B,CAC/C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,iBAAiB,CAAC,KAAK,CAAC,CAAA;IACjC,CAAC;IAED,kBAAkB;IAClB,MAAM,QAAQ,GAAG,MAAM,sBAAsB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAA;IAEnE,uDAAuD;IACvD,IAAI,CAAC,QAAQ,CAAC,mBAAmB,IAAI,CAAC,QAAQ,CAAC,kBAAkB,EAAE,CAAC;QAClE,OAAO,iBAAiB,CAAC,KAAK,CAAC,CAAA;IACjC,CAAC;IAED,iEAAiE;IACjE,MAAM,aAAa,GAAsB,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC1D,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,+CAA+C;IAC/C,MAAM,oBAAoB,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;QAC3D,IAAI,OAAO,GAAG,CAAC,GAAG,SAAS,CAAC,CAAA;QAE5B,IAAI,QAAQ,CAAC,mBAAmB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;QAC5B,CAAC;QAED,IAAI,QAAQ,CAAC,kBAAkB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAChC,CAAC;QAED,OAAO,OAAO,CAAA;IAChB,CAAC,CAAC,CAAA;IAEF,4BAA4B;IAC5B,OAAO,oBAAoB;SACxB,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;SAC1D,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;SACjC,IAAI,CAAC,MAAM,CAAC,CAAA;AACjB,CAAC"}
1
+ {"version":3,"file":"pdf-filter.js","sourceRoot":"","sources":["../../src/parser/pdf-filter.ts"],"names":[],"mappings":";AAAA,2BAA2B;AAC3B,wDAAwD;AACxD,uEAAuE;;AAuEvE,8CAKC;AA0OD,wDA0EC;AAgBD,kEA0CC;AAvbD,0EAAoE;AA4BpE,+CAA+C;AAC/C,eAAe;AACf,+CAA+C;AAE/C;;;;;;GAMG;AACH,SAAS,aAAa,CAAC,KAA6B;IAClD,6DAA6D;IAC7D,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkC,CAAA;IACzD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QAClC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAChB,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAA;IACvB,CAAC;IAED,oFAAoF;IACpF,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAClB,KAAK;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SAClB,IAAI,CAAC,GAAG,CAAC,CACb;SACA,IAAI,CAAC,IAAI,CAAC;SACV,IAAI,EAAE,CAAA;AACX,CAAC;AAED;;;;;GAKG;AACH,SAAgB,iBAAiB,CAAC,KAAiB;IACjD,OAAO,KAAK;SACT,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;SACxC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC;SACjC,IAAI,CAAC,MAAM,CAAC,CAAA;AACjB,CAAC;AAcD;;;;;;;;;;GAUG;AACH,SAAS,4BAA4B,CAAC,KAA6B;IACjE,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAEjC,+DAA+D;IAC/D,MAAM,WAAW,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC3C,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QACvB,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAA;QACrC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;IAClB,CAAC,CAAC,CAAA;IAEF,2DAA2D;IAC3D,MAAM,UAAU,GAAyD,EAAE,CAAA;IAC3E,IAAI,QAAQ,GAAG,EAAE,CAAA;IACjB,IAAI,KAAK,GAAkB,IAAI,CAAA;IAE/B,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;QAC/B,4DAA4D;QAC5D,6EAA6E;QAC7E,IAAI,KAAK,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACnD,QAAQ,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAA;QACtC,CAAC;QAED,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAA;QACjD,QAAQ,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,CAAA;QAC3B,KAAK,GAAG,IAAI,CAAC,CAAC,CAAA;IAChB,CAAC;IAED,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,QAAQ,CAAC,CAAA;IAE9C,kEAAkE;IAClE,MAAM,cAAc,GAAoB,EAAE,CAAA;IAC1C,IAAI,WAAW,GAAG,CAAC,CAAA;IAEnB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,8CAA8C;QAC9C,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACpE,IAAI,aAAa,KAAK,CAAC,CAAC;YAAE,SAAQ;QAElC,4CAA4C;QAC5C,IAAI,UAAU,GAAG,WAAW,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChD,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC3B,IAAI,KAAK,IAAI,KAAK,CAAC,KAAK,IAAI,aAAa,EAAE,CAAC;gBAC1C,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;gBACrC,MAAK;YACP,CAAC;QACH,CAAC;QAED,cAAc,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAA;QACtD,WAAW,GAAG,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAA;IAC/C,CAAC;IAED,yCAAyC;IACzC,OAAO,iBAAiB,CAAC,cAAc,CAAC,CAAA;AAC1C,CAAC;AAED;;;;;GAKG;AACH,SAAS,iBAAiB,CAAC,SAA0B;IACnD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAErC,MAAM,MAAM,GAAoB,EAAE,CAAA;IAClC,IAAI,OAAO,GAAyB,IAAI,CAAA;IAExC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;YACrB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;aAAM,IAAI,OAAO,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,EAAE,CAAC;YACpC,qBAAqB;YACrB,OAAO,CAAC,IAAI,IAAI,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAA;QACrC,CAAC;aAAM,CAAC;YACN,0CAA0C;YAC1C,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACpB,OAAO,GAAG,EAAE,GAAG,QAAQ,EAAE,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;QACrB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACtB,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,yCAAyC;AACzC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAc,EAAE,IAAc;IACtD,IAAI,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrD,OAAO,CAAC,CAAA;IACV,CAAC;IAED,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;QACvB,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;QACvB,UAAU,IAAI,EAAE,GAAG,EAAE,CAAA;QACrB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;QAChB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;IAClB,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACvD,IAAI,WAAW,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAE/B,OAAO,UAAU,GAAG,WAAW,CAAA;AACjC,CAAC;AAED;;;;;;GAMG;AACH,SAAS,wBAAwB,CAAC,UAAsB;IACtD,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,GAAG,CAAA;IAErC,MAAM,YAAY,GAAa,EAAE,CAAA;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAC1B,IAAI,IAAI,IAAI,IAAI,EAAE,CAAC;gBACjB,YAAY,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;YACjD,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAEvC,uBAAuB;IACvB,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAE/C,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QAClC,qCAAqC;QACrC,OAAO,CAAC,CAAC,YAAY,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACtE,CAAC;IACD,oBAAoB;IACpB,OAAO,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;AAC/B,CAAC;AAcD,iEAAiE;AACjE,MAAM,+BAA+B,GAA0B;IAC7D,mBAAmB,EAAE,IAAI;IACzB,QAAQ,EAAE,CAAC;IACX,WAAW,EAAE,CAAC;CACf,CAAA;AAgBD;;;;;;;;;;;;;;;;;;;;GAoBG;AACI,KAAK,UAAU,sBAAsB,CAC1C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,MAAM,MAAM,GAA0B;QACpC,mBAAmB,EAAE,KAAK;QAC1B,kBAAkB,EAAE,KAAK;QACzB,gBAAgB,EAAE,CAAC;QACnB,gBAAgB,EAAE,CAAC;KACpB,CAAA;IAED,iDAAiD;IACjD,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,MAAM,CAAA;IACf,CAAC;IAED,kDAAkD;IAClD,uEAAuE;IACvE,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAChD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,GAAG,CAAC,CAAC,CAAA;IAClD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC,CAAA;IACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,GAAG,GAAG,CAAC,WAAW,CAAC,CAAA;IACrE,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAA;IAErD,oEAAoE;IACpE,MAAM,aAAa,GAAsB,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAChE,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,yDAAyD;IACzD,MAAM,cAAc,GAAa,EAAE,CAAA;IACnC,MAAM,aAAa,GAAa,EAAE,CAAA;IAElC,KAAK,MAAM,SAAS,IAAI,aAAa,EAAE,CAAC;QACtC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YACvC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,aAAa,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC3D,CAAC;QACH,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,IAAI,cAAc,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,cAAc,CAAC,CAAA;QAC5D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,mBAAmB,GAAG,IAAI,CAAA;YACjC,OAAO,CAAC,KAAK,CACX,qCAAqC,cAAc,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACtJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,6EAA6E;IAC7E,IAAI,aAAa,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;QAC3D,MAAM,SAAS,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAA;QACtD,MAAM,CAAC,gBAAgB,GAAG,SAAS,CAAA;QAEnC,IAAI,SAAS,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YACzC,MAAM,CAAC,kBAAkB,GAAG,IAAI,CAAA;YAChC,OAAO,CAAC,KAAK,CACX,qCAAqC,aAAa,CAAC,MAAM,kBAAkB,UAAU,GAAG,CAAC,IAAI,QAAQ,yBAAyB,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACrJ,CAAA;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;;;;;;;GAaG;AACI,KAAK,UAAU,2BAA2B,CAC/C,KAAiB,EACjB,QAA2B,EAC3B,SAAyC,EAAE;IAE3C,MAAM,GAAG,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IAE7D,wCAAwC;IACxC,IAAI,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAChC,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACvD,CAAC;IAED,kBAAkB;IAClB,MAAM,QAAQ,GAAG,MAAM,sBAAsB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,CAAC,CAAA;IAEnE,gEAAgE;IAChE,IAAI,CAAC,QAAQ,CAAC,mBAAmB,IAAI,CAAC,QAAQ,CAAC,kBAAkB,EAAE,CAAC;QAClE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACvD,CAAC;IAED,iEAAiE;IACjE,MAAM,aAAa,GAAsB,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAC1D,4BAA4B,CAAC,IAAI,CAAC,KAAK,CAAC,CACzC,CAAA;IAED,+CAA+C;IAC/C,MAAM,oBAAoB,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;QAC3D,IAAI,OAAO,GAAG,CAAC,GAAG,SAAS,CAAC,CAAA;QAE5B,IAAI,QAAQ,CAAC,mBAAmB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;QAC5B,CAAC;QAED,IAAI,QAAQ,CAAC,kBAAkB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtD,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAChC,CAAC;QAED,OAAO,OAAO,CAAA;IAChB,CAAC,CAAC,CAAA;IAEF,gCAAgC;IAChC,OAAO,oBAAoB,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;AACxF,CAAC"}
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Result of title extraction, including how the title was determined
3
+ */
4
+ export interface TitleExtractionResult {
5
+ title: string;
6
+ source: 'metadata' | 'content' | 'filename';
7
+ }
8
+ /**
9
+ * Convert a file name to a human-readable title
10
+ * Strips the extension and replaces hyphens/underscores with spaces
11
+ *
12
+ * @param fileName - File name (e.g., "2024-annual-report.pdf")
13
+ * @returns Human-readable title (e.g., "2024 annual report")
14
+ */
15
+ export declare function fileNameToTitle(fileName: string): string;
16
+ /**
17
+ * Extract title from Markdown content
18
+ * Priority: YAML frontmatter title -> first # H1 -> file name
19
+ *
20
+ * @param text - Markdown content
21
+ * @param fileName - File name for fallback
22
+ * @returns Title extraction result
23
+ */
24
+ export declare function extractMarkdownTitle(text: string, fileName: string): TitleExtractionResult;
25
+ /**
26
+ * Extract title from plain text content
27
+ * Priority: first line followed by empty line -> file name
28
+ *
29
+ * @param text - Plain text content
30
+ * @param fileName - File name for fallback
31
+ * @returns Title extraction result
32
+ */
33
+ export declare function extractTxtTitle(text: string, fileName: string): TitleExtractionResult;
34
+ /**
35
+ * Extract title from HTML content (using Readability title)
36
+ * Priority: readability title -> file name
37
+ *
38
+ * @param readabilityTitle - Title extracted by Readability
39
+ * @param fileName - File name for fallback
40
+ * @returns Title extraction result
41
+ */
42
+ export declare function extractHtmlTitle(readabilityTitle: string, fileName: string): TitleExtractionResult;
43
+ /**
44
+ * Extract title from PDF metadata or first page chunk text
45
+ * Priority: PDF metadata /Title -> first page chunk 0 text -> file name
46
+ *
47
+ * Rejects metadata titles that look like file paths (contain / or \) or are empty/whitespace-only.
48
+ *
49
+ * @param metadataTitle - PDF metadata /Title value (may be undefined)
50
+ * @param firstPageChunkText - Text of chunk 0 from semantic chunking of page 1 (may be undefined)
51
+ * @param fileName - File name for fallback
52
+ * @returns Title extraction result
53
+ */
54
+ export declare function extractPdfTitle(metadataTitle: string | undefined, firstPageChunkText: string | undefined, fileName: string): TitleExtractionResult;
55
+ /**
56
+ * Extract title from DOCX mammoth HTML output
57
+ * Priority: first <h1> from mammoth HTML -> file name
58
+ *
59
+ * @param htmlContent - HTML content generated by mammoth.convertToHtml()
60
+ * @param fileName - File name for fallback
61
+ * @returns Title extraction result
62
+ */
63
+ export declare function extractDocxTitle(htmlContent: string, fileName: string): TitleExtractionResult;
64
+ //# sourceMappingURL=title-extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"title-extractor.d.ts","sourceRoot":"","sources":["../../src/parser/title-extractor.ts"],"names":[],"mappings":"AAOA;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE,MAAM,CAAA;IACb,MAAM,EAAE,UAAU,GAAG,SAAS,GAAG,UAAU,CAAA;CAC5C;AAMD;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAMxD;AAMD;;;;;;;GAOG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,qBAAqB,CAe1F;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,qBAAqB,CAkBrF;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAC9B,gBAAgB,EAAE,MAAM,EACxB,QAAQ,EAAE,MAAM,GACf,qBAAqB,CAOvB;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,eAAe,CAC7B,aAAa,EAAE,MAAM,GAAG,SAAS,EACjC,kBAAkB,EAAE,MAAM,GAAG,SAAS,EACtC,QAAQ,EAAE,MAAM,GACf,qBAAqB,CAiBvB;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,qBAAqB,CAY7F"}