mcp-local-rag 0.5.5 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -7
- package/dist/chunker/semantic-chunker.d.ts +0 -1
- package/dist/chunker/semantic-chunker.d.ts.map +1 -1
- package/dist/chunker/semantic-chunker.js +3 -3
- package/dist/chunker/semantic-chunker.js.map +1 -1
- package/dist/cli-main.d.ts +6 -0
- package/dist/cli-main.d.ts.map +1 -0
- package/dist/cli-main.js +30 -0
- package/dist/cli-main.js.map +1 -0
- package/dist/index.js +20 -106
- package/dist/index.js.map +1 -1
- package/dist/parser/html-parser.d.ts +6 -2
- package/dist/parser/html-parser.d.ts.map +1 -1
- package/dist/parser/html-parser.js +19 -10
- package/dist/parser/html-parser.js.map +1 -1
- package/dist/parser/index.d.ts +23 -9
- package/dist/parser/index.d.ts.map +1 -1
- package/dist/parser/index.js +88 -22
- package/dist/parser/index.js.map +1 -1
- package/dist/parser/pdf-filter.d.ts +7 -8
- package/dist/parser/pdf-filter.d.ts.map +1 -1
- package/dist/parser/pdf-filter.js +11 -14
- package/dist/parser/pdf-filter.js.map +1 -1
- package/dist/parser/title-extractor.d.ts +64 -0
- package/dist/parser/title-extractor.d.ts.map +1 -0
- package/dist/parser/title-extractor.js +139 -0
- package/dist/parser/title-extractor.js.map +1 -0
- package/dist/server/error-utils.d.ts +7 -0
- package/dist/server/error-utils.d.ts.map +1 -0
- package/dist/server/error-utils.js +25 -0
- package/dist/server/error-utils.js.map +1 -0
- package/dist/server/index.d.ts +2 -102
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +53 -121
- package/dist/server/index.js.map +1 -1
- package/dist/server/raw-data-utils.d.ts +32 -0
- package/dist/server/raw-data-utils.d.ts.map +1 -1
- package/dist/server/raw-data-utils.js +46 -0
- package/dist/server/raw-data-utils.js.map +1 -1
- package/dist/server/tool-definitions.d.ts +8 -0
- package/dist/server/tool-definitions.d.ts.map +1 -0
- package/dist/server/tool-definitions.js +100 -0
- package/dist/server/tool-definitions.js.map +1 -0
- package/dist/server/types.d.ts +101 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +4 -0
- package/dist/server/types.js.map +1 -0
- package/dist/server-main.d.ts +5 -0
- package/dist/server-main.d.ts.map +1 -0
- package/dist/server-main.js +107 -0
- package/dist/server-main.js.map +1 -0
- package/dist/vectordb/index.d.ts +9 -100
- package/dist/vectordb/index.d.ts.map +1 -1
- package/dist/vectordb/index.js +56 -187
- package/dist/vectordb/index.js.map +1 -1
- package/dist/vectordb/search-filters.d.ts +45 -0
- package/dist/vectordb/search-filters.d.ts.map +1 -0
- package/dist/vectordb/search-filters.js +142 -0
- package/dist/vectordb/search-filters.js.map +1 -0
- package/dist/vectordb/types.d.ts +112 -0
- package/dist/vectordb/types.d.ts.map +1 -0
- package/dist/vectordb/types.js +74 -0
- package/dist/vectordb/types.js.map +1 -0
- package/package.json +1 -1
- package/skills/mcp-local-rag/SKILL.md +10 -0
- package/skills/mcp-local-rag/references/html-ingestion.md +2 -1
- package/skills/mcp-local-rag/references/result-refinement.md +1 -0
package/README.md
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# MCP Local RAG
|
|
2
2
|
|
|
3
|
+
[](https://github.com/shinpr/mcp-local-rag)
|
|
3
4
|
[](https://www.npmjs.com/package/mcp-local-rag)
|
|
4
5
|
[](https://opensource.org/licenses/MIT)
|
|
5
6
|
[](https://www.typescriptlang.org/)
|
|
@@ -128,7 +129,7 @@ HTML is automatically cleaned—you get the article content, not the boilerplate
|
|
|
128
129
|
|
|
129
130
|
Search uses semantic similarity with keyword boost. This means `useEffect` finds documents containing that exact term, not just semantically similar React concepts.
|
|
130
131
|
|
|
131
|
-
Results include text content, source file, and relevance score. Adjust result count with `limit` (1-20, default 10).
|
|
132
|
+
Results include text content, source file, document title, and relevance score. The document title provides context for each chunk, helping identify which document a result belongs to. Adjust result count with `limit` (1-20, default 10).
|
|
132
133
|
|
|
133
134
|
### Managing Files
|
|
134
135
|
|
|
@@ -147,6 +148,7 @@ Adjust these for your use case:
|
|
|
147
148
|
| `RAG_HYBRID_WEIGHT` | `0.6` | Keyword boost factor. 0 = semantic only, higher = stronger keyword boost. |
|
|
148
149
|
| `RAG_GROUPING` | (not set) | `similar` for top group only, `related` for top 2 groups. |
|
|
149
150
|
| `RAG_MAX_DISTANCE` | (not set) | Filter out low-relevance results (e.g., `0.5`). |
|
|
151
|
+
| `RAG_MAX_FILES` | (not set) | Limit results to top N files (e.g., `1` for single best file). |
|
|
150
152
|
|
|
151
153
|
### Code-focused tuning
|
|
152
154
|
|
|
@@ -416,12 +418,7 @@ src/
|
|
|
416
418
|
|
|
417
419
|
## Contributing
|
|
418
420
|
|
|
419
|
-
Contributions welcome.
|
|
420
|
-
|
|
421
|
-
1. Run tests: `pnpm test`
|
|
422
|
-
2. Check quality: `pnpm run check:all`
|
|
423
|
-
3. Add tests for new features
|
|
424
|
-
4. Update docs if behavior changes
|
|
421
|
+
Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for setup and guidelines.
|
|
425
422
|
|
|
426
423
|
## License
|
|
427
424
|
|
|
@@ -34,7 +34,6 @@ export interface EmbedderInterface {
|
|
|
34
34
|
* @returns true if chunk is garbage and should be removed
|
|
35
35
|
*/
|
|
36
36
|
export declare function isGarbageChunk(text: string): boolean;
|
|
37
|
-
export declare const DEFAULT_SEMANTIC_CHUNKER_CONFIG: SemanticChunkerConfig;
|
|
38
37
|
/**
|
|
39
38
|
* Semantic chunker using Max-Min algorithm
|
|
40
39
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"semantic-chunker.d.ts","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAO3C;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC,2DAA2D;IAC3D,aAAa,EAAE,MAAM,CAAA;IACrB,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAA;IACjB,gEAAgE;IAChE,CAAC,EAAE,MAAM,CAAA;IACT,uDAAuD;IACvD,cAAc,EAAE,MAAM,CAAA;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAA;CACjD;AAoBD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAmBpD;
|
|
1
|
+
{"version":3,"file":"semantic-chunker.d.ts","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAO3C;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC,2DAA2D;IAC3D,aAAa,EAAE,MAAM,CAAA;IACrB,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAA;IACjB,gEAAgE;IAChE,CAAC,EAAE,MAAM,CAAA;IACT,uDAAuD;IACvD,cAAc,EAAE,MAAM,CAAA;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAA;CACjD;AAoBD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAmBpD;AAiBD;;;;;;;;;;;GAWG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAuB;gBAElC,MAAM,GAAE,OAAO,CAAC,qBAAqB,CAAM;IAIvD;;;;;;OAMG;IACG,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IAsChF;;OAEG;IACH,OAAO,CAAC,cAAc;IAmEtB;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IAaxB;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAuBxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAWxB;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAM1B;;OAEG;IACH,OAAO,CAAC,OAAO;IAIf;;;OAGG;IACH,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM;CAsBzD"}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
// Semantic Chunker implementation using Max-Min algorithm
|
|
3
3
|
// Based on: "Max–Min semantic chunking of documents for RAG application" (Springer, 2025)
|
|
4
4
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
-
exports.SemanticChunker =
|
|
5
|
+
exports.SemanticChunker = void 0;
|
|
6
6
|
exports.isGarbageChunk = isGarbageChunk;
|
|
7
7
|
const sentence_splitter_js_1 = require("./sentence-splitter.js");
|
|
8
8
|
// ============================================
|
|
@@ -57,7 +57,7 @@ function isGarbageChunk(text) {
|
|
|
57
57
|
// ============================================
|
|
58
58
|
// Default Configuration
|
|
59
59
|
// ============================================
|
|
60
|
-
|
|
60
|
+
const DEFAULT_SEMANTIC_CHUNKER_CONFIG = {
|
|
61
61
|
hardThreshold: 0.6,
|
|
62
62
|
initConst: 1.5,
|
|
63
63
|
c: 0.9,
|
|
@@ -80,7 +80,7 @@ exports.DEFAULT_SEMANTIC_CHUNKER_CONFIG = {
|
|
|
80
80
|
*/
|
|
81
81
|
class SemanticChunker {
|
|
82
82
|
constructor(config = {}) {
|
|
83
|
-
this.config = { ...
|
|
83
|
+
this.config = { ...DEFAULT_SEMANTIC_CHUNKER_CONFIG, ...config };
|
|
84
84
|
}
|
|
85
85
|
/**
|
|
86
86
|
* Split text into semantically coherent chunks
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"semantic-chunker.js","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":";AAAA,0DAA0D;AAC1D,0FAA0F;;;AA+D1F,wCAmBC;AA/ED,iEAA2D;AA4B3D,+CAA+C;AAC/C,qCAAqC;AACrC,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,WAAW,GAAG,CAAC,CAAA;AAErB;;;;GAIG;AACH,MAAM,aAAa,GAAG,EAAE,CAAA;AAExB;;;;;;;;;;;;;GAaG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAC3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IAErC,uDAAuD;IACvD,IAAI,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,KAAK,CAAA;IAE7C,yDAAyD;IACzD,IAAI,wCAAwC,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAA;IAEvE,kDAAkD;IAClD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,CAAA;IACjD,IAAI,QAAQ,GAAG,OAAO,CAAC,MAAM,GAAG,GAAG;QAAE,OAAO,IAAI,CAAA;IAEhD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;
|
|
1
|
+
{"version":3,"file":"semantic-chunker.js","sourceRoot":"","sources":["../../src/chunker/semantic-chunker.ts"],"names":[],"mappings":";AAAA,0DAA0D;AAC1D,0FAA0F;;;AA+D1F,wCAmBC;AA/ED,iEAA2D;AA4B3D,+CAA+C;AAC/C,qCAAqC;AACrC,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,WAAW,GAAG,CAAC,CAAA;AAErB;;;;GAIG;AACH,MAAM,aAAa,GAAG,EAAE,CAAA;AAExB;;;;;;;;;;;;;GAaG;AACH,SAAgB,cAAc,CAAC,IAAY;IACzC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAC3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IAErC,uDAAuD;IACvD,IAAI,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,KAAK,CAAA;IAE7C,yDAAyD;IACzD,IAAI,wCAAwC,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAA;IAEvE,kDAAkD;IAClD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;QAC3B,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IACvD,CAAC;IACD,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,CAAA;IACjD,IAAI,QAAQ,GAAG,OAAO,CAAC,MAAM,GAAG,GAAG;QAAE,OAAO,IAAI,CAAA;IAEhD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;AAE/C,MAAM,+BAA+B,GAA0B;IAC7D,aAAa,EAAE,GAAG;IAClB,SAAS,EAAE,GAAG;IACd,CAAC,EAAE,GAAG;IACN,cAAc,EAAE,EAAE;CACnB,CAAA;AAED,+CAA+C;AAC/C,wBAAwB;AACxB,+CAA+C;AAE/C;;;;;;;;;;;GAWG;AACH,MAAa,eAAe;IAG1B,YAAY,SAAyC,EAAE;QACrD,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,+BAA+B,EAAE,GAAG,MAAM,EAAE,CAAA;IACjE,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,QAA2B;QACvD,qBAAqB;QACrB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtC,OAAO,EAAE,CAAA;QACX,CAAC;QAED,uBAAuB;QACvB,MAAM,SAAS,GAAG,IAAA,yCAAkB,EAAC,IAAI,CAAC,CAAA;QAC1C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,EAAE,CAAA;QACX,CAAC;QAED,wCAAwC;QACxC,MAAM,UAAU,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC,CAAA;QAEvD,yDAAyD;QACzD,MAAM,cAAc,GAAG,IAAI,CAAC,cAAc,CAAC,SAAS,EAAE,UAAU,CAAC,CAAA;QAEjE,+BAA+B;QAC/B,MAAM,MAAM,GAAgB,EAAE,CAAA;QAC9B,IAAI,UAAU,GAAG,CAAC,CAAA;QAElB,KAAK,MAAM,KAAK,IAAI,cAAc,EAAE,CAAC;YACnC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YAEjC,kDAAkD;YAClD,IAAI,SAAS,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC,cAAc,IAAI,CAAC,cAAc,CAAC,SAAS,CAAC,EAAE,CAAC;gBACjF,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,SAAS;oBACf,KAAK,EAAE,UAAU;iBAClB,CAAC,CAAA;gBACF,UAAU,EAAE,CAAA;YACd,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,SAAmB,EAAE,UAAsB;QAChE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAA;QACrC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAA;QAEzD,MAAM,MAAM,GAAe,EAAE,CAAA;QAC7B,IAAI,YAAY,GAAa,EAAE,CAAA;QAC/B,IAAI,sBAAsB,GAAe,EAAE,CAAA;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAA;YAC7B,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;YAE/B,IAAI,CAAC,QAAQ,IAAI,CAAC,SAAS;gBAAE,SAAQ;YAErC,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,sCAAsC;gBACtC,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;gBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;YACxC,CAAC;iBAAM,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACrC,gDAAgD;gBAChD,MAAM,cAAc,GAAG,sBAAsB,CAAC,CAAC,CAAC,CAAA;gBAChD,IAAI,CAAC,cAAc;oBAAE,SAAQ;gBAE7B,MAAM,UAAU,GAAG,IAAI,CAAC,gBAAgB,CAAC,cAAc,EAAE,SAAS,CAAC,CAAA;gBAEnE,IAAI,IAAI,CAAC,MAAM,CAAC,SAAS,GAAG,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,EAAE,CAAC;oBACnE,uBAAuB;oBACvB,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;oBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBACxC,CAAC;qBAAM,CAAC;oBACN,kBAAkB;oBAClB,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;gBACtC,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,4EAA4E;gBAC5E,IAAI,YAAY,CAAC,MAAM,IAAI,aAAa,EAAE,CAAC;oBACzC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;oBACpC,SAAQ;gBACV,CAAC;gBAED,2DAA2D;gBAC3D,MAAM,SAAS,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,sBAAsB,CAAC,CAAA;gBAE1E,IAAI,SAAS,EAAE,CAAC;oBACd,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;oBAC3B,sBAAsB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;gBACxC,CAAC;qBAAM,CAAC;oBACN,kBAAkB;oBAClB,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,CAAA;oBAC9B,YAAY,GAAG,CAAC,QAAQ,CAAC,CAAA;oBACzB,sBAAsB,GAAG,CAAC,SAAS,CAAC,CAAA;gBACtC,CAAC;YACH,CAAC;QACH,CAAC;QAED,8BAA8B;QAC9B,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAA;QAC3B,CAAC;QAED,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,gBAAgB,CAAC,YAAsB,EAAE,eAA2B;QAC1E,gDAAgD;QAChD,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,eAAe,CAAC,CAAA;QAErD,0DAA0D;QAC1D,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,YAAY,EAAE,eAAe,CAAC,CAAA;QAEnE,8BAA8B;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,MAAM,EAAE,eAAe,CAAC,MAAM,CAAC,CAAA;QAEzE,OAAO,MAAM,GAAG,SAAS,CAAA;IAC3B,CAAC;IAED;;;;;OAKG;IACK,gBAAgB,CAAC,UAAsB;QAC7C,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,GAAG,CAAA;QAErC,uEAAuE;QACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC,MAAM,GAAG,WAAW,CAAC,CAAA;QAC7D,MAAM,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAA;QAEnD,IAAI,MAAM,GAAG,GAAG,CAAA;QAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACrD,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAA;gBAChC,MAAM,IAAI,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAA;gBAChC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI;oBAAE,SAAQ;gBAE5B,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;gBAC7C,IAAI,GAAG,GAAG,MAAM,EAAE,CAAC;oBACjB,MAAM,GAAG,GAAG,CAAA;gBACd,CAAC;YACH,CAAC;QACH,CAAC;QACD,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,SAAmB,EAAE,eAA2B;QACvE,IAAI,MAAM,GAAG,CAAC,GAAG,CAAA;QACjB,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACvC,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAA;YACtD,IAAI,GAAG,GAAG,MAAM,EAAE,CAAC;gBACjB,MAAM,GAAG,GAAG,CAAA;YACd,CAAC;QACH,CAAC;QACD,OAAO,MAAM,CAAA;IACf,CAAC;IAED;;;OAGG;IACK,kBAAkB,CAAC,MAAc,EAAE,SAAiB;QAC1D,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAA;QAC5C,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,MAAM,GAAG,YAAY,CAAA;QAC9D,OAAO,IAAI,CAAC,GAAG,CAAC,gBAAgB,EAAE,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,CAAA;IAC9D,CAAC;IAED;;OAEG;IACK,OAAO,CAAC,CAAS;QACvB,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC/B,CAAC;IAED;;;OAGG;IACH,gBAAgB,CAAC,IAAc,EAAE,IAAc;QAC7C,IAAI,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACrD,OAAO,CAAC,CAAA;QACV,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAA;QAClB,IAAI,KAAK,GAAG,CAAC,CAAA;QACb,IAAI,KAAK,GAAG,CAAC,CAAA;QAEb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACvB,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACvB,UAAU,IAAI,EAAE,GAAG,EAAE,CAAA;YACrB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;YAChB,KAAK,IAAI,EAAE,GAAG,EAAE,CAAA;QAClB,CAAC;QAED,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;QACvD,IAAI,WAAW,KAAK,CAAC;YAAE,OAAO,CAAC,CAAA;QAE/B,OAAO,UAAU,GAAG,WAAW,CAAA;IACjC,CAAC;CACF;AAjOD,0CAiOC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli-main.d.ts","sourceRoot":"","sources":["../src/cli-main.ts"],"names":[],"mappings":"AAGA;;;GAGG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,IAAI,CAsB9C"}
|
package/dist/cli-main.js
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.handleCli = handleCli;
|
|
4
|
+
// CLI entry point for subcommands (skills install, etc.)
|
|
5
|
+
const install_skills_js_1 = require("./bin/install-skills.js");
|
|
6
|
+
/**
|
|
7
|
+
* Handle CLI subcommands
|
|
8
|
+
* @param args - Command line arguments (after the binary name)
|
|
9
|
+
*/
|
|
10
|
+
function handleCli(args) {
|
|
11
|
+
const subcommand = args[0];
|
|
12
|
+
switch (subcommand) {
|
|
13
|
+
case 'skills':
|
|
14
|
+
if (args[1] === 'install') {
|
|
15
|
+
(0, install_skills_js_1.run)(args.slice(2));
|
|
16
|
+
process.exit(0);
|
|
17
|
+
}
|
|
18
|
+
else {
|
|
19
|
+
console.error('Unknown skills subcommand. Usage: npx mcp-local-rag skills install [options]');
|
|
20
|
+
console.error('Run "npx mcp-local-rag skills install --help" for more information.');
|
|
21
|
+
process.exit(1);
|
|
22
|
+
}
|
|
23
|
+
break;
|
|
24
|
+
default:
|
|
25
|
+
console.error(`Unknown command: ${subcommand}`);
|
|
26
|
+
console.error('Available commands: skills');
|
|
27
|
+
process.exit(1);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
//# sourceMappingURL=cli-main.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli-main.js","sourceRoot":"","sources":["../src/cli-main.ts"],"names":[],"mappings":";;AAOA,8BAsBC;AA7BD,yDAAyD;AACzD,+DAAiE;AAEjE;;;GAGG;AACH,SAAgB,SAAS,CAAC,IAAc;IACtC,MAAM,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;IAE1B,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,QAAQ;YACX,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,SAAS,EAAE,CAAC;gBAC1B,IAAA,uBAAgB,EAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA;gBAC/B,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACjB,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,KAAK,CACX,8EAA8E,CAC/E,CAAA;gBACD,OAAO,CAAC,KAAK,CAAC,qEAAqE,CAAC,CAAA;gBACpF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACjB,CAAC;YACD,MAAK;QAEP;YACE,OAAO,CAAC,KAAK,CAAC,oBAAoB,UAAU,EAAE,CAAC,CAAA;YAC/C,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAA;YAC3C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACnB,CAAC;AACH,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -1,116 +1,30 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
"use strict";
|
|
3
|
-
// Entry point for
|
|
3
|
+
// Entry point for mcp-local-rag
|
|
4
|
+
// Routes to CLI subcommands or starts the MCP server
|
|
4
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
5
|
-
const
|
|
6
|
-
const
|
|
6
|
+
const cli_main_js_1 = require("./cli-main.js");
|
|
7
|
+
const server_main_js_1 = require("./server-main.js");
|
|
7
8
|
// ============================================
|
|
8
|
-
//
|
|
9
|
+
// Routing
|
|
9
10
|
// ============================================
|
|
11
|
+
const SUBCOMMANDS = new Set(['skills']);
|
|
10
12
|
const args = process.argv.slice(2);
|
|
11
|
-
|
|
12
|
-
if (
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
(0, install_skills_js_1.run)(args.slice(2));
|
|
16
|
-
process.exit(0);
|
|
17
|
-
}
|
|
18
|
-
else {
|
|
19
|
-
console.error('Unknown skills subcommand. Usage: npx mcp-local-rag skills install [options]');
|
|
20
|
-
console.error('Run "npx mcp-local-rag skills install --help" for more information.');
|
|
21
|
-
process.exit(1);
|
|
22
|
-
}
|
|
23
|
-
}
|
|
24
|
-
// ============================================
|
|
25
|
-
// MCP Server (default behavior)
|
|
26
|
-
// ============================================
|
|
27
|
-
/**
|
|
28
|
-
* Parse grouping mode from environment variable
|
|
29
|
-
*/
|
|
30
|
-
function parseGroupingMode(value) {
|
|
31
|
-
if (!value)
|
|
32
|
-
return undefined;
|
|
33
|
-
const normalized = value.toLowerCase().trim();
|
|
34
|
-
if (normalized === 'similar' || normalized === 'related') {
|
|
35
|
-
return normalized;
|
|
36
|
-
}
|
|
37
|
-
console.error(`Invalid RAG_GROUPING value: "${value}". Expected "similar" or "related". Ignoring.`);
|
|
38
|
-
return undefined;
|
|
13
|
+
const firstArg = args[0];
|
|
14
|
+
if (firstArg && SUBCOMMANDS.has(firstArg)) {
|
|
15
|
+
// CLI subcommand
|
|
16
|
+
(0, cli_main_js_1.handleCli)(args);
|
|
39
17
|
}
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
console.error(`Invalid RAG_MAX_DISTANCE value: "${value}". Expected positive number. Ignoring.`);
|
|
49
|
-
return undefined;
|
|
50
|
-
}
|
|
51
|
-
return parsed;
|
|
52
|
-
}
|
|
53
|
-
/**
|
|
54
|
-
* Parse hybrid weight from environment variable
|
|
55
|
-
*/
|
|
56
|
-
function parseHybridWeight(value) {
|
|
57
|
-
if (!value)
|
|
58
|
-
return undefined;
|
|
59
|
-
const parsed = Number.parseFloat(value);
|
|
60
|
-
if (Number.isNaN(parsed) || parsed < 0 || parsed > 1) {
|
|
61
|
-
console.error(`Invalid RAG_HYBRID_WEIGHT value: "${value}". Expected 0.0-1.0. Using default (0.6).`);
|
|
62
|
-
return undefined;
|
|
63
|
-
}
|
|
64
|
-
return parsed;
|
|
65
|
-
}
|
|
66
|
-
/**
|
|
67
|
-
* Entry point - Start RAG MCP Server
|
|
68
|
-
*/
|
|
69
|
-
async function main() {
|
|
70
|
-
try {
|
|
71
|
-
// RAGServer configuration
|
|
72
|
-
const config = {
|
|
73
|
-
dbPath: process.env['DB_PATH'] || './lancedb/',
|
|
74
|
-
modelName: process.env['MODEL_NAME'] || 'Xenova/all-MiniLM-L6-v2',
|
|
75
|
-
cacheDir: process.env['CACHE_DIR'] || './models/',
|
|
76
|
-
baseDir: process.env['BASE_DIR'] || process.cwd(),
|
|
77
|
-
maxFileSize: Number.parseInt(process.env['MAX_FILE_SIZE'] || '104857600', 10), // 100MB
|
|
78
|
-
};
|
|
79
|
-
// Add quality filter settings only if defined
|
|
80
|
-
const maxDistance = parseMaxDistance(process.env['RAG_MAX_DISTANCE']);
|
|
81
|
-
const grouping = parseGroupingMode(process.env['RAG_GROUPING']);
|
|
82
|
-
const hybridWeight = parseHybridWeight(process.env['RAG_HYBRID_WEIGHT']);
|
|
83
|
-
if (maxDistance !== undefined) {
|
|
84
|
-
config.maxDistance = maxDistance;
|
|
85
|
-
}
|
|
86
|
-
if (grouping !== undefined) {
|
|
87
|
-
config.grouping = grouping;
|
|
88
|
-
}
|
|
89
|
-
if (hybridWeight !== undefined) {
|
|
90
|
-
config.hybridWeight = hybridWeight;
|
|
91
|
-
}
|
|
92
|
-
console.error('Starting RAG MCP Server...');
|
|
93
|
-
console.error('Configuration:', config);
|
|
94
|
-
// Start RAGServer
|
|
95
|
-
const server = new index_js_1.RAGServer(config);
|
|
96
|
-
await server.initialize();
|
|
97
|
-
await server.run();
|
|
98
|
-
console.error('RAG MCP Server started successfully');
|
|
99
|
-
}
|
|
100
|
-
catch (error) {
|
|
101
|
-
console.error('Failed to start RAG MCP Server:', error);
|
|
18
|
+
else {
|
|
19
|
+
// Default: start MCP server
|
|
20
|
+
process.on('unhandledRejection', (reason, promise) => {
|
|
21
|
+
console.error('Unhandled Rejection at:', promise, 'reason:', reason);
|
|
22
|
+
process.exit(1);
|
|
23
|
+
});
|
|
24
|
+
process.on('uncaughtException', (error) => {
|
|
25
|
+
console.error('Uncaught Exception:', error);
|
|
102
26
|
process.exit(1);
|
|
103
|
-
}
|
|
27
|
+
});
|
|
28
|
+
(0, server_main_js_1.startServer)();
|
|
104
29
|
}
|
|
105
|
-
// Global error handling
|
|
106
|
-
process.on('unhandledRejection', (reason, promise) => {
|
|
107
|
-
console.error('Unhandled Rejection at:', promise, 'reason:', reason);
|
|
108
|
-
process.exit(1);
|
|
109
|
-
});
|
|
110
|
-
process.on('uncaughtException', (error) => {
|
|
111
|
-
console.error('Uncaught Exception:', error);
|
|
112
|
-
process.exit(1);
|
|
113
|
-
});
|
|
114
|
-
// Execute main
|
|
115
|
-
main();
|
|
116
30
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;AACA,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;AACA,gCAAgC;AAChC,qDAAqD;;AAErD,+CAAyC;AACzC,qDAA8C;AAE9C,+CAA+C;AAC/C,UAAU;AACV,+CAA+C;AAE/C,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAA;AAEvC,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AAClC,MAAM,QAAQ,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;AAExB,IAAI,QAAQ,IAAI,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;IAC1C,iBAAiB;IACjB,IAAA,uBAAS,EAAC,IAAI,CAAC,CAAA;AACjB,CAAC;KAAM,CAAC;IACN,4BAA4B;IAC5B,OAAO,CAAC,EAAE,CAAC,oBAAoB,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE;QACnD,OAAO,CAAC,KAAK,CAAC,yBAAyB,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,CAAC,CAAA;QACpE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC,CAAC,CAAA;IAEF,OAAO,CAAC,EAAE,CAAC,mBAAmB,EAAE,CAAC,KAAK,EAAE,EAAE;QACxC,OAAO,CAAC,KAAK,CAAC,qBAAqB,EAAE,KAAK,CAAC,CAAA;QAC3C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC,CAAC,CAAA;IAEF,IAAA,4BAAW,GAAE,CAAA;AACf,CAAC"}
|
|
@@ -5,10 +5,14 @@
|
|
|
5
5
|
* 1. HTML string → JSDOM (DOM creation)
|
|
6
6
|
* 2. JSDOM → Readability (main content extraction, noise removal)
|
|
7
7
|
* 3. Readability result → Turndown (Markdown conversion)
|
|
8
|
+
* 4. Title extracted separately via extractHtmlTitle (NOT prepended to content)
|
|
8
9
|
*
|
|
9
10
|
* @param html - Raw HTML string
|
|
10
11
|
* @param url - Source URL (used for resolving relative links)
|
|
11
|
-
* @returns
|
|
12
|
+
* @returns Object with content (markdown) and title (extracted separately)
|
|
12
13
|
*/
|
|
13
|
-
export declare function parseHtml(html: string, url: string): Promise<
|
|
14
|
+
export declare function parseHtml(html: string, url: string): Promise<{
|
|
15
|
+
content: string;
|
|
16
|
+
title: string;
|
|
17
|
+
}>;
|
|
14
18
|
//# sourceMappingURL=html-parser.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-parser.d.ts","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":"AAuDA;;;;;;;;;;;;GAYG;AACH,wBAAsB,SAAS,CAC7B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CA0D7C"}
|
|
@@ -9,6 +9,7 @@ exports.parseHtml = parseHtml;
|
|
|
9
9
|
const readability_1 = require("@mozilla/readability");
|
|
10
10
|
const jsdom_1 = require("jsdom");
|
|
11
11
|
const turndown_1 = __importDefault(require("turndown"));
|
|
12
|
+
const title_extractor_js_1 = require("./title-extractor.js");
|
|
12
13
|
// ============================================
|
|
13
14
|
// Turndown Service Configuration
|
|
14
15
|
// ============================================
|
|
@@ -46,15 +47,16 @@ function createTurndownService() {
|
|
|
46
47
|
* 1. HTML string → JSDOM (DOM creation)
|
|
47
48
|
* 2. JSDOM → Readability (main content extraction, noise removal)
|
|
48
49
|
* 3. Readability result → Turndown (Markdown conversion)
|
|
50
|
+
* 4. Title extracted separately via extractHtmlTitle (NOT prepended to content)
|
|
49
51
|
*
|
|
50
52
|
* @param html - Raw HTML string
|
|
51
53
|
* @param url - Source URL (used for resolving relative links)
|
|
52
|
-
* @returns
|
|
54
|
+
* @returns Object with content (markdown) and title (extracted separately)
|
|
53
55
|
*/
|
|
54
56
|
async function parseHtml(html, url) {
|
|
55
57
|
// Handle empty or whitespace-only HTML
|
|
56
58
|
if (!html || html.trim().length === 0) {
|
|
57
|
-
return '';
|
|
59
|
+
return { content: '', title: '' };
|
|
58
60
|
}
|
|
59
61
|
try {
|
|
60
62
|
// Create DOM from HTML string
|
|
@@ -75,25 +77,32 @@ async function parseHtml(html, url) {
|
|
|
75
77
|
// Try to get body content directly
|
|
76
78
|
const bodyContent = document.body?.innerHTML || '';
|
|
77
79
|
if (!bodyContent.trim()) {
|
|
78
|
-
return '';
|
|
80
|
+
return { content: '', title: '' };
|
|
79
81
|
}
|
|
80
82
|
// Convert raw body HTML to Markdown
|
|
81
83
|
const turndownService = createTurndownService();
|
|
82
|
-
return turndownService.turndown(bodyContent).trim();
|
|
84
|
+
return { content: turndownService.turndown(bodyContent).trim(), title: '' };
|
|
83
85
|
}
|
|
84
86
|
// Convert extracted HTML content to Markdown
|
|
85
87
|
const turndownService = createTurndownService();
|
|
86
88
|
const markdown = turndownService.turndown(article.content);
|
|
87
|
-
//
|
|
88
|
-
|
|
89
|
-
|
|
89
|
+
// Extract title separately (NOT prepended to markdown content)
|
|
90
|
+
// Use URL-derived filename as fallback when Readability has no title
|
|
91
|
+
let urlFileName = '';
|
|
92
|
+
try {
|
|
93
|
+
urlFileName = new URL(url).pathname.split('/').filter(Boolean).pop() || '';
|
|
90
94
|
}
|
|
91
|
-
|
|
95
|
+
catch {
|
|
96
|
+
// Non-URL string, empty fallback
|
|
97
|
+
}
|
|
98
|
+
const titleResult = (0, title_extractor_js_1.extractHtmlTitle)(article.title || '', urlFileName);
|
|
99
|
+
const title = titleResult.title;
|
|
100
|
+
return { content: markdown.trim(), title };
|
|
92
101
|
}
|
|
93
102
|
catch (error) {
|
|
94
|
-
// Log error but don't throw - return empty
|
|
103
|
+
// Log error but don't throw - return empty values for graceful degradation
|
|
95
104
|
console.error('Failed to parse HTML:', error);
|
|
96
|
-
return '';
|
|
105
|
+
return { content: '', title: '' };
|
|
97
106
|
}
|
|
98
107
|
}
|
|
99
108
|
//# sourceMappingURL=html-parser.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":";AAAA,6CAA6C;AAC7C,2DAA2D;;;;;
|
|
1
|
+
{"version":3,"file":"html-parser.js","sourceRoot":"","sources":["../../src/parser/html-parser.ts"],"names":[],"mappings":";AAAA,6CAA6C;AAC7C,2DAA2D;;;;;AAmE3D,8BA6DC;AA9HD,sDAAkD;AAClD,iCAA6B;AAC7B,wDAAsC;AACtC,6DAAuD;AAcvD,+CAA+C;AAC/C,iCAAiC;AACjC,+CAA+C;AAE/C;;GAEG;AACH,SAAS,qBAAqB;IAC5B,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK,EAAE,uBAAuB;QAC5C,cAAc,EAAE,QAAQ,EAAE,0BAA0B;QACpD,gBAAgB,EAAE,GAAG,EAAE,yBAAyB;QAChD,WAAW,EAAE,GAAG,EAAE,qBAAqB;QACvC,eAAe,EAAE,IAAI,EAAE,kBAAkB;KAC1C,CAAC,CAAA;IAEF,0BAA0B;IAC1B,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE;QACpC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;YAC9B,MAAM,OAAO,GAAG,IAAe,CAAA;YAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,CAAA;YACjD,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAA;YACxE,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,IAAI,EAAE,CAAA;YACvE,OAAO,WAAW,QAAQ,KAAK,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,CAAA;QAC/D,CAAC;KACF,CAAC,CAAA;IAEF,OAAO,eAAe,CAAA;AACxB,CAAC;AAED,+CAA+C;AAC/C,cAAc;AACd,+CAA+C;AAE/C;;;;;;;;;;;;GAYG;AACI,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,GAAW;IAEX,uCAAuC;IACvC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;IACnC,CAAC;IAED,IAAI,CAAC;QACH,8BAA8B;QAC9B,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE;YAC1B,GAAG;YACH,yCAAyC;YACzC,UAAU,EAAE,cAAc;SAC3B,CAAC,CAAA;QAEF,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAA;QAEpC,0CAA0C;QAC1C,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,QAAQ,EAAE;YACvC,WAAW,EAAE,KAAK;YAClB,KAAK,EAAE,KAAK;SACb,CAAC,CAAA;QAEF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAA8B,CAAA;QAE1D,kEAAkE;QAClE,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YACjC,mCAAmC;YACnC,MAAM,WAAW,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,IAAI,EAAE,CAAA;YAClD,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;gBACxB,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;YACnC,CAAC;YAED,oCAAoC;YACpC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;YAC/C,OAAO,EAAE,OAAO,EAAE,eAAe,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QAC7E,CAAC;QAED,6CAA6C;QAC7C,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA;QAC/C,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAA;QAE1D,+DAA+D;QAC/D,qEAAqE;QACrE,IAAI,WAAW,GAAG,EAAE,CAAA;QACpB,IAAI,CAAC;YACH,WAAW,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAA;QAC5E,CAAC;QAAC,MAAM,CAAC;YACP,iCAAiC;QACnC,CAAC;QACD,MAAM,WAAW,GAAG,IAAA,qCAAgB,EAAC,OAAO,CAAC,KAAK,IAAI,EAAE,EAAE,WAAW,CAAC,CAAA;QACtE,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,CAAA;QAE/B,OAAO,EAAE,OAAO,EAAE,QAAQ,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAA;IAC5C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,2EAA2E;QAC3E,OAAO,CAAC,KAAK,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;IACnC,CAAC;AACH,CAAC"}
|
package/dist/parser/index.d.ts
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
import { type EmbedderInterface } from './pdf-filter.js';
|
|
2
|
+
/**
|
|
3
|
+
* Result from parsing a document, containing both content and extracted title.
|
|
4
|
+
* Title is display-only metadata (NOT used for search scoring).
|
|
5
|
+
*/
|
|
6
|
+
export interface ParseResult {
|
|
7
|
+
content: string;
|
|
8
|
+
title: string;
|
|
9
|
+
}
|
|
2
10
|
/**
|
|
3
11
|
* DocumentParser configuration
|
|
4
12
|
*/
|
|
5
|
-
|
|
13
|
+
interface ParserConfig {
|
|
6
14
|
/** Security: allowed base directory */
|
|
7
15
|
baseDir: string;
|
|
8
16
|
/** Maximum file size (100MB) */
|
|
@@ -32,6 +40,8 @@ export declare class FileOperationError extends Error {
|
|
|
32
40
|
*/
|
|
33
41
|
export declare class DocumentParser {
|
|
34
42
|
private readonly config;
|
|
43
|
+
/** Lazily cached realpath of baseDir. Assumes baseDir is stable for the process lifetime. */
|
|
44
|
+
private resolvedBaseDir;
|
|
35
45
|
constructor(config: ParserConfig);
|
|
36
46
|
/**
|
|
37
47
|
* File path validation (Absolute path requirement + Path traversal prevention)
|
|
@@ -39,7 +49,7 @@ export declare class DocumentParser {
|
|
|
39
49
|
* @param filePath - File path to validate (must be absolute)
|
|
40
50
|
* @throws ValidationError - When path is not absolute or outside BASE_DIR
|
|
41
51
|
*/
|
|
42
|
-
validateFilePath(filePath: string): void
|
|
52
|
+
validateFilePath(filePath: string): Promise<void>;
|
|
43
53
|
/**
|
|
44
54
|
* File size validation (100MB limit)
|
|
45
55
|
*
|
|
@@ -52,11 +62,11 @@ export declare class DocumentParser {
|
|
|
52
62
|
* File parsing (auto format detection)
|
|
53
63
|
*
|
|
54
64
|
* @param filePath - File path to parse
|
|
55
|
-
* @returns
|
|
65
|
+
* @returns ParseResult with content and extracted title
|
|
56
66
|
* @throws ValidationError - Path traversal, size exceeded, unsupported format
|
|
57
67
|
* @throws FileOperationError - File read failed, parse failed
|
|
58
68
|
*/
|
|
59
|
-
parseFile(filePath: string): Promise<
|
|
69
|
+
parseFile(filePath: string): Promise<ParseResult>;
|
|
60
70
|
/**
|
|
61
71
|
* PDF parsing with header/footer filtering
|
|
62
72
|
*
|
|
@@ -64,18 +74,21 @@ export declare class DocumentParser {
|
|
|
64
74
|
* - Extracts text with position information (x, y, fontSize)
|
|
65
75
|
* - Semantic header/footer detection using embedding similarity
|
|
66
76
|
* - Uses hasEOL for proper line break handling
|
|
77
|
+
* - Extracts document title from PDF metadata and first page font heuristic
|
|
67
78
|
*
|
|
68
79
|
* @param filePath - PDF file path
|
|
69
80
|
* @param embedder - Embedder for semantic header/footer detection
|
|
70
|
-
* @returns
|
|
81
|
+
* @returns ParseResult with content and extracted title
|
|
71
82
|
* @throws FileOperationError - File read failed, parse failed
|
|
72
83
|
*/
|
|
73
|
-
parsePdf(filePath: string, embedder: EmbedderInterface): Promise<
|
|
84
|
+
parsePdf(filePath: string, embedder: EmbedderInterface): Promise<ParseResult>;
|
|
74
85
|
/**
|
|
75
86
|
* DOCX parsing (using mammoth)
|
|
76
87
|
*
|
|
88
|
+
* Uses extractRawText for content and convertToHtml additionally for title detection.
|
|
89
|
+
*
|
|
77
90
|
* @param filePath - DOCX file path
|
|
78
|
-
* @returns
|
|
91
|
+
* @returns ParseResult with content and extracted title
|
|
79
92
|
* @throws FileOperationError - File read failed, parse failed
|
|
80
93
|
*/
|
|
81
94
|
private parseDocx;
|
|
@@ -83,7 +96,7 @@ export declare class DocumentParser {
|
|
|
83
96
|
* TXT parsing (using fs.readFile)
|
|
84
97
|
*
|
|
85
98
|
* @param filePath - TXT file path
|
|
86
|
-
* @returns
|
|
99
|
+
* @returns ParseResult with content and extracted title
|
|
87
100
|
* @throws FileOperationError - File read failed
|
|
88
101
|
*/
|
|
89
102
|
private parseTxt;
|
|
@@ -91,9 +104,10 @@ export declare class DocumentParser {
|
|
|
91
104
|
* MD parsing (using fs.readFile)
|
|
92
105
|
*
|
|
93
106
|
* @param filePath - MD file path
|
|
94
|
-
* @returns
|
|
107
|
+
* @returns ParseResult with content and extracted title
|
|
95
108
|
* @throws FileOperationError - File read failed
|
|
96
109
|
*/
|
|
97
110
|
private parseMd;
|
|
98
111
|
}
|
|
112
|
+
export {};
|
|
99
113
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/parser/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,KAAK,iBAAiB,EAA8C,MAAM,iBAAiB,CAAA;AAYpG;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;GAEG;AACH,UAAU,YAAY;IACpB,uCAAuC;IACvC,OAAO,EAAE,MAAM,CAAA;IACf,gCAAgC;IAChC,WAAW,EAAE,MAAM,CAAA;CACpB;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,KAAK;aAGb,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAED;;GAEG;AACH,qBAAa,kBAAmB,SAAQ,KAAK;aAGhB,KAAK,CAAC,EAAE,KAAK;gBADtC,OAAO,EAAE,MAAM,EACU,KAAK,CAAC,EAAE,KAAK,YAAA;CAKzC;AAMD;;;;;;;GAOG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAc;IACrC,6FAA6F;IAC7F,OAAO,CAAC,eAAe,CAAsB;gBAEjC,MAAM,EAAE,YAAY;IAIhC;;;;;OAKG;IACG,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAiDvD;;;;;;OAMG;IACH,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,IAAI;IAgBxC;;;;;;;OAOG;IACG,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAmBvD;;;;;;;;;;;;;OAaG;IACG,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,iBAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;IAqEnF;;;;;;;;OAQG;YACW,SAAS;IAqBvB;;;;;;OAMG;YACW,QAAQ;IAYtB;;;;;;OAMG;YACW,OAAO;CAWtB"}
|