localrag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +178 -0
- package/dist/chunking/chunking-service.d.ts +18 -0
- package/dist/chunking/chunking-service.d.ts.map +1 -0
- package/dist/chunking/chunking-service.js +71 -0
- package/dist/chunking/chunking-service.js.map +1 -0
- package/dist/cli/commands/init.d.ts +8 -0
- package/dist/cli/commands/init.d.ts.map +1 -0
- package/dist/cli/commands/init.js +107 -0
- package/dist/cli/commands/init.js.map +1 -0
- package/dist/cli/commands/open.d.ts +8 -0
- package/dist/cli/commands/open.d.ts.map +1 -0
- package/dist/cli/commands/open.js +105 -0
- package/dist/cli/commands/open.js.map +1 -0
- package/dist/cli/commands/search.d.ts +10 -0
- package/dist/cli/commands/search.d.ts.map +1 -0
- package/dist/cli/commands/search.js +73 -0
- package/dist/cli/commands/search.js.map +1 -0
- package/dist/cli/commands/start.d.ts +8 -0
- package/dist/cli/commands/start.d.ts.map +1 -0
- package/dist/cli/commands/start.js +122 -0
- package/dist/cli/commands/start.js.map +1 -0
- package/dist/cli/commands/status.d.ts +12 -0
- package/dist/cli/commands/status.d.ts.map +1 -0
- package/dist/cli/commands/status.js +89 -0
- package/dist/cli/commands/status.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +62 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/config/config-service.d.ts +22 -0
- package/dist/config/config-service.d.ts.map +1 -0
- package/dist/config/config-service.js +108 -0
- package/dist/config/config-service.js.map +1 -0
- package/dist/db/lancedb-repository.d.ts +28 -0
- package/dist/db/lancedb-repository.d.ts.map +1 -0
- package/dist/db/lancedb-repository.js +132 -0
- package/dist/db/lancedb-repository.js.map +1 -0
- package/dist/embedding/embedding-service.d.ts +22 -0
- package/dist/embedding/embedding-service.d.ts.map +1 -0
- package/dist/embedding/embedding-service.js +99 -0
- package/dist/embedding/embedding-service.js.map +1 -0
- package/dist/extractors/docx-extractor.d.ts +12 -0
- package/dist/extractors/docx-extractor.d.ts.map +1 -0
- package/dist/extractors/docx-extractor.js +29 -0
- package/dist/extractors/docx-extractor.js.map +1 -0
- package/dist/extractors/extractor.interface.d.ts +14 -0
- package/dist/extractors/extractor.interface.d.ts.map +1 -0
- package/dist/extractors/extractor.interface.js +63 -0
- package/dist/extractors/extractor.interface.js.map +1 -0
- package/dist/extractors/pdf-extractor.d.ts +11 -0
- package/dist/extractors/pdf-extractor.d.ts.map +1 -0
- package/dist/extractors/pdf-extractor.js +89 -0
- package/dist/extractors/pdf-extractor.js.map +1 -0
- package/dist/extractors/pptx-extractor.d.ts +12 -0
- package/dist/extractors/pptx-extractor.d.ts.map +1 -0
- package/dist/extractors/pptx-extractor.js +98 -0
- package/dist/extractors/pptx-extractor.js.map +1 -0
- package/dist/extractors/text-extractor.d.ts +10 -0
- package/dist/extractors/text-extractor.d.ts.map +1 -0
- package/dist/extractors/text-extractor.js +52 -0
- package/dist/extractors/text-extractor.js.map +1 -0
- package/dist/extractors/xlsx-extractor.d.ts +11 -0
- package/dist/extractors/xlsx-extractor.d.ts.map +1 -0
- package/dist/extractors/xlsx-extractor.js +28 -0
- package/dist/extractors/xlsx-extractor.js.map +1 -0
- package/dist/indexer/indexer.d.ts +34 -0
- package/dist/indexer/indexer.d.ts.map +1 -0
- package/dist/indexer/indexer.js +100 -0
- package/dist/indexer/indexer.js.map +1 -0
- package/dist/metadata/metadata-service.d.ts +34 -0
- package/dist/metadata/metadata-service.d.ts.map +1 -0
- package/dist/metadata/metadata-service.js +147 -0
- package/dist/metadata/metadata-service.js.map +1 -0
- package/dist/scanner/file-scanner.d.ts +20 -0
- package/dist/scanner/file-scanner.d.ts.map +1 -0
- package/dist/scanner/file-scanner.js +110 -0
- package/dist/scanner/file-scanner.js.map +1 -0
- package/dist/search/search-service.d.ts +18 -0
- package/dist/search/search-service.d.ts.map +1 -0
- package/dist/search/search-service.js +98 -0
- package/dist/search/search-service.js.map +1 -0
- package/dist/watcher/file-watcher.d.ts +27 -0
- package/dist/watcher/file-watcher.d.ts.map +1 -0
- package/dist/watcher/file-watcher.js +110 -0
- package/dist/watcher/file-watcher.js.map +1 -0
- package/package.json +53 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EmbeddingService
|
|
3
|
+
*
|
|
4
|
+
* Wraps @xenova/transformers (all-MiniLM-L6-v2) for local, normalized
|
|
5
|
+
* sentence embeddings. Uses a dynamic import because transformers is ESM-only
|
|
6
|
+
* while the rest of the package targets CommonJS.
|
|
7
|
+
*
|
|
8
|
+
* The model (~25 MB ONNX) is cached automatically in ~/.cache/huggingface
|
|
9
|
+
* after the first download. A progress spinner is shown on first load.
|
|
10
|
+
*/
|
|
11
|
+
export declare class EmbeddingService {
|
|
12
|
+
/**
|
|
13
|
+
* Generate L2-normalized embeddings for one or more texts.
|
|
14
|
+
* Returns a 2D array: one 384-dim vector per input text.
|
|
15
|
+
*/
|
|
16
|
+
embed(texts: string[]): Promise<number[][]>;
|
|
17
|
+
/**
|
|
18
|
+
* Embed a single text, returning a 384-dim number[].
|
|
19
|
+
*/
|
|
20
|
+
embedOne(text: string): Promise<number[]>;
|
|
21
|
+
}
|
|
22
|
+
//# sourceMappingURL=embedding-service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedding-service.d.ts","sourceRoot":"","sources":["../../src/embedding/embedding-service.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAwCH,qBAAa,gBAAgB;IAC3B;;;OAGG;IACG,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;IAajD;;OAEG;IACG,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;CAIhD"}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* EmbeddingService
|
|
4
|
+
*
|
|
5
|
+
* Wraps @xenova/transformers (all-MiniLM-L6-v2) for local, normalized
|
|
6
|
+
* sentence embeddings. Uses a dynamic import because transformers is ESM-only
|
|
7
|
+
* while the rest of the package targets CommonJS.
|
|
8
|
+
*
|
|
9
|
+
* The model (~25 MB ONNX) is cached automatically in ~/.cache/huggingface
|
|
10
|
+
* after the first download. A progress spinner is shown on first load.
|
|
11
|
+
*/
|
|
12
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
13
|
+
if (k2 === undefined) k2 = k;
|
|
14
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
15
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
16
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
17
|
+
}
|
|
18
|
+
Object.defineProperty(o, k2, desc);
|
|
19
|
+
}) : (function(o, m, k, k2) {
|
|
20
|
+
if (k2 === undefined) k2 = k;
|
|
21
|
+
o[k2] = m[k];
|
|
22
|
+
}));
|
|
23
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
24
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
25
|
+
}) : function(o, v) {
|
|
26
|
+
o["default"] = v;
|
|
27
|
+
});
|
|
28
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
29
|
+
var ownKeys = function(o) {
|
|
30
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
31
|
+
var ar = [];
|
|
32
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
33
|
+
return ar;
|
|
34
|
+
};
|
|
35
|
+
return ownKeys(o);
|
|
36
|
+
};
|
|
37
|
+
return function (mod) {
|
|
38
|
+
if (mod && mod.__esModule) return mod;
|
|
39
|
+
var result = {};
|
|
40
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
41
|
+
__setModuleDefault(result, mod);
|
|
42
|
+
return result;
|
|
43
|
+
};
|
|
44
|
+
})();
|
|
45
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
46
|
+
exports.EmbeddingService = void 0;
|
|
47
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
48
|
+
const ora = require('ora');
|
|
49
|
+
const VECTOR_DIM = 384;
|
|
50
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
51
|
+
let pipelineInstance = null;
|
|
52
|
+
/**
 * Lazily create the shared feature-extraction pipeline.
 *
 * The in-flight load is stored in `pipelineInstance` as a promise, so calls
 * that overlap during the first load all await the same work instead of each
 * starting their own model load (and their own spinner). When loading fails
 * the cached promise is cleared so that a later call can retry.
 *
 * @returns the pipeline built for 'Xenova/all-MiniLM-L6-v2' (quantized).
 * @throws whatever the module load or pipeline construction raised, after
 *         the spinner has been marked as failed.
 */
async function getPipeline() {
    // Holds either nothing or the (possibly still pending) load promise;
    // returning a promise from an async function resolves it for the caller.
    if (pipelineInstance)
        return pipelineInstance;
    const spinner = ora({
        text: 'Loading embedding model (first run downloads ~25 MB — one-time only)…',
        spinner: 'dots',
    }).start();
    const loading = (async () => {
        // NOTE(review): the source used a dynamic import(), but the compiled
        // output loads the package through require(). If @xenova/transformers
        // is ESM-only this relies on the Node version supporting require() of
        // ES modules — confirm against the supported Node range.
        const { pipeline, env } = await Promise.resolve().then(() => __importStar(require('@xenova/transformers')));
        // Permit resolving models from the local file system and turn off the
        // browser cache, which does not apply when running under Node.
        env.allowLocalModels = true;
        env.useBrowserCache = false;
        return pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', { quantized: true });
    })();
    // Publish the promise before awaiting so overlapping callers reuse it.
    pipelineInstance = loading;
    try {
        const instance = await loading;
        spinner.succeed('Embedding model ready');
        return instance;
    }
    catch (err) {
        // Drop the failed load so the next call starts over instead of
        // replaying the same rejection forever.
        if (pipelineInstance === loading)
            pipelineInstance = null;
        spinner.fail('Failed to load embedding model');
        throw err;
    }
}
|
|
74
|
+
/**
 * Thin facade over the lazily loaded feature-extraction pipeline.
 */
class EmbeddingService {
    /**
     * Embed a batch of texts with mean pooling and normalization enabled.
     *
     * An empty batch short-circuits to an empty array without loading the
     * model. Otherwise the pipeline output is converted with `.tolist()`
     * into nested JS arrays, one vector per input text (384 values each
     * according to the original documentation — not checked here).
     */
    async embed(texts) {
        if (texts.length === 0) {
            return [];
        }
        const extractor = await getPipeline();
        const tensor = await extractor(texts, { pooling: 'mean', normalize: true });
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        return tensor.tolist();
    }
    /**
     * Embed a single text.
     *
     * Falls back to a zero-filled vector of VECTOR_DIM entries when the
     * batch call yields no first element.
     */
    async embedOne(text) {
        const vectors = await this.embed([text]);
        const first = vectors[0];
        if (first === undefined || first === null) {
            return new Array(VECTOR_DIM).fill(0);
        }
        return first;
    }
}
|
|
98
|
+
exports.EmbeddingService = EmbeddingService;
|
|
99
|
+
//# sourceMappingURL=embedding-service.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedding-service.js","sourceRoot":"","sources":["../../src/embedding/embedding-service.ts"],"names":[],"mappings":";AAAA;;;;;;;;;GASG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iEAAiE;AACjE,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC;AAE3B,MAAM,UAAU,GAAG,GAAG,CAAC;AAEvB,8DAA8D;AAC9D,IAAI,gBAAgB,GAAQ,IAAI,CAAC;AAEjC,KAAK,UAAU,WAAW;IACxB,IAAI,gBAAgB;QAAE,OAAO,gBAAgB,CAAC;IAE9C,MAAM,OAAO,GAAG,GAAG,CAAC;QAClB,IAAI,EAAE,uEAAuE;QAC7E,OAAO,EAAE,MAAM;KAChB,CAAC,CAAC,KAAK,EAAE,CAAC;IAEX,IAAI,CAAC;QACH,gFAAgF;QAChF,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,wDAAa,sBAAsB,GAAC,CAAC;QAE/D,gEAAgE;QAChE,GAAG,CAAC,gBAAgB,GAAG,IAAI,CAAC;QAC5B,GAAG,CAAC,eAAe,GAAG,KAAK,CAAC;QAE5B,gBAAgB,GAAG,MAAM,QAAQ,CAC/B,oBAAoB,EACpB,yBAAyB,EACzB,EAAE,SAAS,EAAE,IAAI,EAAE,CACpB,CAAC;QAEF,OAAO,CAAC,OAAO,CAAC,uBAAuB,CAAC,CAAC;QACzC,OAAO,gBAAgB,CAAC;IAC1B,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAC;QAC/C,MAAM,GAAG,CAAC;IACZ,CAAC;AACH,CAAC;AAED,MAAa,gBAAgB;IAC3B;;;OAGG;IACH,KAAK,CAAC,KAAK,CAAC,KAAe;QACzB,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAElC,MAAM,IAAI,GAAG,MAAM,WAAW,EAAE,CAAC;QACjC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,KAAK,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAEvE,qDAAqD;QACrD,0CAA0C;QAC1C,8DAA8D;QAC9D,MAAM,MAAM,GAAgB,MAAc,CAAC,MAAM,EAAE,CAAC;QACpD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAY;QACzB,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QACzC,OAAO,OAAO,CAAC,CAAC,CAAC,IAAI,IAAI,KAAK,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrD,CAAC;CACF;AAzBD,4CAyBC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { Extractor, ExtractedChunk } from './extractor.interface';
|
|
2
|
+
/**
|
|
3
|
+
* DOCX extractor using mammoth.
|
|
4
|
+
*
|
|
5
|
+
* mammoth.extractRawText() strips all formatting and returns clean plain text,
|
|
6
|
+
* which is ideal for embedding. Word documents don't have a reliable page-
|
|
7
|
+
* number concept in the file format, so page is left undefined.
|
|
8
|
+
*/
|
|
9
|
+
export declare class DocxExtractor implements Extractor {
|
|
10
|
+
extract(filePath: string): Promise<ExtractedChunk[]>;
|
|
11
|
+
}
|
|
12
|
+
//# sourceMappingURL=docx-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/docx-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAElE;;;;;;GAMG;AACH,qBAAa,aAAc,YAAW,SAAS;IACvC,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;CAoB3D"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.DocxExtractor = void 0;
|
|
4
|
+
/**
 * DOCX extractor backed by mammoth.
 *
 * mammoth.extractRawText() is used to obtain the document's plain text. The
 * whole document is returned as a single chunk without a page number.
 */
class DocxExtractor {
    /**
     * Extract the raw text of the DOCX file at `filePath`.
     *
     * @returns one chunk holding the trimmed text, or an empty array when the
     *          document contains no text.
     * @throws Error listing every mammoth message whose type is 'error'.
     */
    async extract(filePath) {
        // eslint-disable-next-line @typescript-eslint/no-require-imports
        const mammoth = require('mammoth');
        const { messages, value } = await mammoth.extractRawText({ path: filePath });
        // Only messages typed 'error' abort the extraction; others are ignored.
        const failures = messages && messages.length > 0
            ? messages.filter((entry) => entry.type === 'error')
            : [];
        if (failures.length > 0) {
            const details = failures.map((failure) => failure.message).join(', ');
            throw new Error(`DOCX extraction errors in ${filePath}: ${details}`);
        }
        const text = value.trim();
        return text ? [{ text }] : [];
    }
}
|
|
28
|
+
exports.DocxExtractor = DocxExtractor;
|
|
29
|
+
//# sourceMappingURL=docx-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docx-extractor.js","sourceRoot":"","sources":["../../src/extractors/docx-extractor.ts"],"names":[],"mappings":";;;AAEA;;;;;;GAMG;AACH,MAAa,aAAa;IACxB,KAAK,CAAC,OAAO,CAAC,QAAgB;QAC5B,iEAAiE;QACjE,MAAM,OAAO,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;QAEnC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC;QAEhE,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClD,MAAM,MAAM,GAAG,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAmB,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC,CAAC;YACnF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtB,MAAM,IAAI,KAAK,CACb,6BAA6B,QAAQ,KAAK,MAAM,CAAC,GAAG,CAAC,CAAC,CAAsB,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACzG,CAAC;YACJ,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAI,MAAM,CAAC,KAAgB,CAAC,IAAI,EAAE,CAAC;QAC7C,IAAI,CAAC,IAAI;YAAE,OAAO,EAAE,CAAC;QAErB,OAAO,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;IACpB,CAAC;CACF;AArBD,sCAqBC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/** A unit of text extracted from a source document, optionally carrying a page number. */
|
|
2
|
+
export interface ExtractedChunk {
|
|
3
|
+
text: string;
|
|
4
|
+
page?: number;
|
|
5
|
+
}
|
|
6
|
+
/** Common interface all file-type extractors must implement. */
|
|
7
|
+
export interface Extractor {
|
|
8
|
+
extract(filePath: string): Promise<ExtractedChunk[]>;
|
|
9
|
+
}
|
|
10
|
+
export declare function isSupportedFile(filePath: string): boolean;
|
|
11
|
+
export declare function getSupportedExtensions(): string[];
|
|
12
|
+
/** Returns the appropriate extractor for a given file path, or null if unsupported. */
|
|
13
|
+
export declare function getExtractor(filePath: string): Promise<Extractor | null>;
|
|
14
|
+
//# sourceMappingURL=extractor.interface.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.interface.d.ts","sourceRoot":"","sources":["../../src/extractors/extractor.interface.ts"],"names":[],"mappings":"AAIA,0FAA0F;AAC1F,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED,gEAAgE;AAChE,MAAM,WAAW,SAAS;IACxB,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC,CAAC;CACtD;AAaD,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAEzD;AAED,wBAAgB,sBAAsB,IAAI,MAAM,EAAE,CAEjD;AAED,uFAAuF;AACvF,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,CAK9E"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.isSupportedFile = isSupportedFile;
|
|
37
|
+
exports.getSupportedExtensions = getSupportedExtensions;
|
|
38
|
+
exports.getExtractor = getExtractor;
|
|
39
|
+
const path = __importStar(require("path"));
|
|
40
|
+
// ── Supported extensions ─────────────────────────────────────────────────
// Maps each lower-case file extension to an async factory. A factory loads its
// extractor module only when invoked and returns a fresh extractor instance.
// Key order is significant: it is the order reported by Object.keys().
const SUPPORTED = {
    '.txt': async function () {
        const mod = await Promise.resolve().then(() => __importStar(require('./text-extractor')));
        return new mod.TextExtractor();
    },
    '.md': async function () {
        const mod = await Promise.resolve().then(() => __importStar(require('./text-extractor')));
        return new mod.TextExtractor();
    },
    '.pdf': async function () {
        const mod = await Promise.resolve().then(() => __importStar(require('./pdf-extractor')));
        return new mod.PdfExtractor();
    },
    '.docx': async function () {
        const mod = await Promise.resolve().then(() => __importStar(require('./docx-extractor')));
        return new mod.DocxExtractor();
    },
    '.xlsx': async function () {
        const mod = await Promise.resolve().then(() => __importStar(require('./xlsx-extractor')));
        return new mod.XlsxExtractor();
    },
    '.pptx': async function () {
        const mod = await Promise.resolve().then(() => __importStar(require('./pptx-extractor')));
        return new mod.PptxExtractor();
    },
};
|
|
49
|
+
/** Reports whether the file's extension (compared in lower case) is a key of SUPPORTED. */
function isSupportedFile(filePath) {
    const extension = path.extname(filePath).toLowerCase();
    return extension in SUPPORTED;
}
|
|
52
|
+
/** Lists the registered extensions (with leading dot) in registration order. */
function getSupportedExtensions() {
    const extensions = Object.keys(SUPPORTED);
    return extensions;
}
|
|
55
|
+
/**
 * Resolves the extractor registered for the file's lower-cased extension.
 * Yields null when no factory is registered for that extension.
 */
async function getExtractor(filePath) {
    const createExtractor = SUPPORTED[path.extname(filePath).toLowerCase()];
    return createExtractor ? createExtractor() : null;
}
|
|
63
|
+
//# sourceMappingURL=extractor.interface.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.interface.js","sourceRoot":"","sources":["../../src/extractors/extractor.interface.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA0BA,0CAEC;AAED,wDAEC;AAGD,oCAKC;AAxCD,2CAA6B;AAe7B,4EAA4E;AAE5E,MAAM,SAAS,GAA6C;IAC1D,MAAM,EAAE,KAAK,IAAI,EAAE,GAAG,MAAM,EAAE,aAAa,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC;IAC/G,KAAK,EAAG,KAAK,IAAI,EAAE,GAAG,MAAM,EAAE,aAAa,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC;IAC/G,MAAM,EAAE,KAAK,IAAI,EAAE,GAAG,MAAM,EAAE,YAAY,EAAE,GAAI,wDAAa,iBAAiB,GAAC,CAAC,CAAE,OAAO,IAAI,YAAY,EAAE,CAAC,CAAE,CAAC;IAC/G,OAAO,EAAC,KAAK,IAAI,EAAE,GAAG,MAAM,EAAE,aAAa,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC;IAC/G,OAAO,EAAC,KAAK,IAAI,EAAE,GAAG,MAAM,EAAE,aAAa,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC;IAC/G,OAAO,EAAC,KAAK,IAAI,EAAE,GAAG,MAAM,EAAE,aAAa,EAAE,GAAG,wDAAa,kBAAkB,GAAC,CAAC,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC;CAChH,CAAC;AAEF,SAAgB,eAAe,CAAC,QAAgB;IAC9C,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,IAAI,SAAS,CAAC;AAC3D,CAAC;AAED,SAAgB,sBAAsB;IACpC,OAAO,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;AAChC,CAAC;AAED,uFAAuF;AAChF,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACjD,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;IAC/B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IAC1B,OAAO,OAAO,EAAE,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { Extractor, ExtractedChunk } from './extractor.interface';
|
|
2
|
+
/**
|
|
3
|
+
* PDF extractor using pdf-parse.
|
|
4
|
+
*
|
|
5
|
+
* Uses pdf-parse's pagerender hook to collect per-page text.
|
|
6
|
+
* A page whose text extraction fails is recorded as empty and omitted from the result.
|
|
7
|
+
*/
|
|
8
|
+
export declare class PdfExtractor implements Extractor {
|
|
9
|
+
extract(filePath: string): Promise<ExtractedChunk[]>;
|
|
10
|
+
}
|
|
11
|
+
//# sourceMappingURL=pdf-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/pdf-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAElE;;;;;GAKG;AACH,qBAAa,YAAa,YAAW,SAAS;IACtC,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;CA6C3D"}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.PdfExtractor = void 0;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
/**
|
|
39
|
+
* PDF extractor using pdf-parse.
|
|
40
|
+
*
|
|
41
|
+
* Uses pdf-parse's pagerender hook to collect per-page text.
|
|
42
|
+
* Falls back to full-document text if per-page extraction fails.
|
|
43
|
+
*/
|
|
44
|
+
class PdfExtractor {
|
|
45
|
+
async extract(filePath) {
|
|
46
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
47
|
+
const pdfParse = require('pdf-parse');
|
|
48
|
+
const buffer = fs.readFileSync(filePath);
|
|
49
|
+
const pageTexts = [];
|
|
50
|
+
let pageIndex = 0;
|
|
51
|
+
try {
|
|
52
|
+
await pdfParse(buffer, {
|
|
53
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
54
|
+
pagerender: (pageData) => {
|
|
55
|
+
const pageNum = ++pageIndex;
|
|
56
|
+
return pageData
|
|
57
|
+
.getTextContent()
|
|
58
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
59
|
+
.then((content) => {
|
|
60
|
+
const text = content.items
|
|
61
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
62
|
+
.map((item) => item.str)
|
|
63
|
+
.join(' ');
|
|
64
|
+
pageTexts[pageNum - 1] = text;
|
|
65
|
+
return text;
|
|
66
|
+
})
|
|
67
|
+
.catch(() => {
|
|
68
|
+
pageTexts[pageNum - 1] = '';
|
|
69
|
+
return '';
|
|
70
|
+
});
|
|
71
|
+
},
|
|
72
|
+
});
|
|
73
|
+
const result = pageTexts
|
|
74
|
+
.map((text, idx) => ({ text: text.trim(), page: idx + 1 }))
|
|
75
|
+
.filter(c => c.text.length > 0);
|
|
76
|
+
return result.length > 0 ? result : [];
|
|
77
|
+
}
|
|
78
|
+
catch (err) {
|
|
79
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
80
|
+
// Surface common failure modes
|
|
81
|
+
if (msg.includes('encrypted') || msg.includes('password')) {
|
|
82
|
+
throw new Error(`PDF is encrypted/password-protected: ${filePath}`);
|
|
83
|
+
}
|
|
84
|
+
throw new Error(`Failed to parse PDF ${filePath}: ${msg}`);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
exports.PdfExtractor = PdfExtractor;
|
|
89
|
+
//# sourceMappingURL=pdf-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-extractor.js","sourceRoot":"","sources":["../../src/extractors/pdf-extractor.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,uCAAyB;AAGzB;;;;;GAKG;AACH,MAAa,YAAY;IACvB,KAAK,CAAC,OAAO,CAAC,QAAgB;QAC5B,iEAAiE;QACjE,MAAM,QAAQ,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;QAEtC,MAAM,MAAM,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,SAAS,GAAa,EAAE,CAAC;QAC/B,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,IAAI,CAAC;YACH,MAAM,QAAQ,CAAC,MAAM,EAAE;gBACrB,8DAA8D;gBAC9D,UAAU,EAAE,CAAC,QAAa,EAAmB,EAAE;oBAC7C,MAAM,OAAO,GAAG,EAAE,SAAS,CAAC;oBAC5B,OAAO,QAAQ;yBACZ,cAAc,EAAE;wBACjB,8DAA8D;yBAC7D,IAAI,CAAC,CAAC,OAAY,EAAE,EAAE;wBACrB,MAAM,IAAI,GAAW,OAAO,CAAC,KAAK;4BAChC,8DAA8D;6BAC7D,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAa,CAAC;6BACtC,IAAI,CAAC,GAAG,CAAC,CAAC;wBACb,SAAS,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC;wBAC9B,OAAO,IAAI,CAAC;oBACd,CAAC,CAAC;yBACD,KAAK,CAAC,GAAG,EAAE;wBACV,SAAS,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC;wBAC5B,OAAO,EAAE,CAAC;oBACZ,CAAC,CAAC,CAAC;gBACP,CAAC;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAqB,SAAS;iBACvC,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;iBAC1D,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAElC,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;QACzC,CAAC;QAAC,OAAO,GAAY,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC7D,+BAA+B;YAC/B,IAAI,GAAG,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC1D,MAAM,IAAI,KAAK,CAAC,wCAAwC,QAAQ,EAAE,CAAC,CAAC;YACtE,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,KAAK,GAAG,EAAE,CAAC,CAAC;QAC7D,CAAC;IACH,CAAC;CACF;AA9CD,oCA8CC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { Extractor, ExtractedChunk } from './extractor.interface';
|
|
2
|
+
/**
|
|
3
|
+
* PPTX extractor.
|
|
4
|
+
*
|
|
5
|
+
* PPTX files are ZIP archives whose slides live at ppt/slides/slideN.xml.
|
|
6
|
+
* We unzip the file with adm-zip and extract all <a:t> text nodes from
|
|
7
|
+
* each slide's XML using fast-xml-parser, treating each slide as a page.
|
|
8
|
+
*/
|
|
9
|
+
export declare class PptxExtractor implements Extractor {
|
|
10
|
+
extract(filePath: string): Promise<ExtractedChunk[]>;
|
|
11
|
+
}
|
|
12
|
+
//# sourceMappingURL=pptx-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pptx-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/pptx-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAElE;;;;;;GAMG;AACH,qBAAa,aAAc,YAAW,SAAS;IACvC,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;CAgD3D"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.PptxExtractor = void 0;
|
|
4
|
+
/**
 * PPTX extractor.
 *
 * PPTX files are ZIP archives whose slides live at ppt/slides/slideN.xml.
 * We unzip the file with adm-zip and extract all <a:t> text nodes from
 * each slide's XML using fast-xml-parser, treating each slide as a page.
 */
class PptxExtractor {
    /**
     * Extract one chunk per slide that contains text.
     *
     * @returns chunks ordered by the number N in slideN.xml, with `page` set
     *          to that number. Slides whose XML fails to parse are skipped.
     */
    async extract(filePath) {
        // eslint-disable-next-line @typescript-eslint/no-require-imports
        const AdmZip = require('adm-zip');
        // eslint-disable-next-line @typescript-eslint/no-require-imports
        const { XMLParser } = require('fast-xml-parser');
        const zip = new AdmZip(filePath);
        const parser = new XMLParser({
            ignoreAttributes: true,
            // Keep element text as strings: value parsing would turn slide
            // text such as "007" or "true" into 7 / true and lose the
            // original wording.
            parseTagValue: false,
            parseAttributeValue: false,
            trimValues: true,
        });
        // Collect slide entries sorted by slide number.
        // NOTE(review): the file number may not match the on-screen slide
        // order defined elsewhere in the archive — confirm if order matters.
        const slidePattern = /^ppt\/slides\/slide(\d+)\.xml$/;
        const slides = zip
            .getEntries()
            .flatMap((entry) => {
                const match = slidePattern.exec(entry.entryName);
                return match ? [{ entry, index: parseInt(match[1], 10) }] : [];
            })
            .sort((a, b) => a.index - b.index);
        const chunks = [];
        for (const { entry, index } of slides) {
            const xml = entry.getData().toString('utf-8');
            let parsed;
            try {
                parsed = parser.parse(xml);
            }
            catch {
                continue; // skip malformed slides
            }
            // Join all text runs and collapse whitespace into single spaces.
            const text = collectTag(parsed, 'a:t').join(' ').replace(/\s+/g, ' ').trim();
            if (text.length > 0) {
                chunks.push({ text, page: index });
            }
        }
        return chunks;
    }
}
|
|
55
|
+
exports.PptxExtractor = PptxExtractor;
|
|
56
|
+
// ── Helpers ───────────────────────────────────────────────────────────────
|
|
57
|
+
/**
 * Recursively collect all values of a given XML tag name from a parsed object.
 *
 * Walks arrays and plain objects depth-first in property order. Whenever a
 * property named `tag` is found, its scalar values (strings, numbers and
 * booleans — alone or inside an array) are appended as strings; object
 * values are searched further for nested occurrences of the tag.
 *
 * @param node parsed XML fragment: object, array, scalar, null or undefined.
 * @param tag  property name to look for, e.g. 'a:t'.
 * @returns the collected values as strings, in traversal order.
 */
function collectTag(node, tag) {
    // Scalars, null and undefined cannot contain tagged children.
    if (node === null || typeof node !== 'object')
        return [];
    if (Array.isArray(node))
        return node.flatMap((item) => collectTag(item, tag));
    const results = [];
    for (const [key, value] of Object.entries(node)) {
        if (key !== tag) {
            results.push(...collectTag(value, tag));
            continue;
        }
        // Treat a single value and a list of values uniformly so numbers and
        // booleans are kept in both shapes.
        const candidates = Array.isArray(value) ? value : [value];
        for (const candidate of candidates) {
            if (typeof candidate === 'string')
                results.push(candidate);
            else if (typeof candidate === 'number' || typeof candidate === 'boolean')
                results.push(String(candidate));
            else
                results.push(...collectTag(candidate, tag));
        }
    }
    return results;
}
|
|
98
|
+
//# sourceMappingURL=pptx-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pptx-extractor.js","sourceRoot":"","sources":["../../src/extractors/pptx-extractor.ts"],"names":[],"mappings":";;;AAEA;;;;;;GAMG;AACH,MAAa,aAAa;IACxB,KAAK,CAAC,OAAO,CAAC,QAAgB;QAC5B,iEAAiE;QACjE,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;QAClC,iEAAiE;QACjE,MAAM,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAAC;QAEjD,MAAM,GAAG,GAAG,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC;QACjC,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC;YAC3B,gBAAgB,EAAE,IAAI;YACtB,aAAa,EAAE,IAAI;YACnB,mBAAmB,EAAE,KAAK;YAC1B,UAAU,EAAE,IAAI;SACjB,CAAC,CAAC;QAEH,+CAA+C;QAC/C,MAAM,OAAO,GAA2C,GAAG;aACxD,UAAU,EAAE;aACZ,MAAM,CAAC,CAAC,CAAwB,EAAE,EAAE,CAAC,gCAAgC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;aACxF,GAAG,CAAC,CAAC,CAAwB,EAAE,EAAE;YAChC,MAAM,CAAC,GAAG,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;YAChD,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAClE,CAAC,CAAC;aACD,IAAI,CAAC,CAAC,CAAoB,EAAE,CAAoB,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAE3E,MAAM,MAAM,GAAqB,EAAE,CAAC;QAEpC,KAAK,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;YACtC,MAAM,KAAK,GAAG,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YACjC,IAAI,CAAC,KAAK;gBAAE,SAAS;YAErB,MAAM,GAAG,GAAW,KAAK,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;YACtD,IAAI,MAAe,CAAC;YACpB,IAAI,CAAC;gBACH,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAC7B,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS,CAAC,wBAAwB;YACpC,CAAC;YAED,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;YAC5C,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAE7D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACpB,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAjDD,sCAiDC;AAED,6EAA6E;AAE7E;;GAEG;AACH,SAAS,UAAU,CAAC,IAAa,EAAE,GAAW;IAC5C,IAAI,IAAI,KAAK,IAAI,IAAI,IAAI,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IACnD,IAAI,OAAO,IAAI,KAAK,QAAQ;QAAE,OAAO,EAAE,CAAC;IACxC,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,OAAO,IAAI
,KAAK,SAAS;QAAE,OAAO,EAAE,CAAC;IAErE,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;QACxB,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC;IACrD,CAAC;IAED,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAA+B,CAAC,EAAE,CAAC;YAC3E,IAAI,GAAG,KAAK,GAAG,EAAE,CAAC;gBAChB,IAAI,OAAO,KAAK,KAAK,QAAQ;oBAAE,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;qBAC9C,IAAI,OAAO,KAAK,KAAK,QAAQ;oBAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;qBAC3D,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC9B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;wBACtB,IAAI,OAAO,CAAC,KAAK,QAAQ;4BAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;4BACtC,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;oBAC3C,CAAC;gBACH,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;gBAC1C,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { Extractor, ExtractedChunk } from './extractor.interface';
|
|
2
|
+
/**
|
|
3
|
+
* Plain-text extractor for .txt and .md files.
|
|
4
|
+
* Returns the entire file content as a single chunk (the chunking service
|
|
5
|
+
* will split it into appropriately sized pieces).
|
|
6
|
+
*/
|
|
7
|
+
export declare class TextExtractor implements Extractor {
|
|
8
|
+
/**
 * Read the file at `filePath` as UTF-8 text.
 *
 * @param filePath Path of the text file to read.
 * @returns A promise for an empty array when the file contains only
 *          whitespace, otherwise for a single chunk whose `text` is the
 *          complete, untrimmed file content.
 */
extract(filePath: string): Promise<ExtractedChunk[]>;
|
|
9
|
+
}
|
|
10
|
+
//# sourceMappingURL=text-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/text-extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAElE;;;;GAIG;AACH,qBAAa,aAAc,YAAW,SAAS;IACvC,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;CAK3D"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.TextExtractor = void 0;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
/**
 * Plain-text extractor for .txt and .md files.
 * Returns the entire file content as a single chunk (the chunking service
 * will split it into appropriately sized pieces).
 */
class TextExtractor {
    /**
     * Read a file as UTF-8 text.
     *
     * @param filePath Path of the file to read.
     * @returns An empty array when the file holds only whitespace, otherwise
     *          one chunk whose `text` is the full, untrimmed file content.
     *          The promise rejects with the underlying fs error (for example
     *          ENOENT) when the file cannot be read.
     */
    async extract(filePath) {
        // The method is already async, so read without blocking the event
        // loop; a synchronous read would stall every other pending task
        // while a large file loads.
        const text = await fs.promises.readFile(filePath, 'utf-8');
        if (text.trim().length === 0)
            return [];
        return [{ text }];
    }
}
|
|
51
|
+
exports.TextExtractor = TextExtractor;
|
|
52
|
+
//# sourceMappingURL=text-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-extractor.js","sourceRoot":"","sources":["../../src/extractors/text-extractor.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,uCAAyB;AAGzB;;;;GAIG;AACH,MAAa,aAAa;IACxB,KAAK,CAAC,OAAO,CAAC,QAAgB;QAC5B,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAChD,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YAAE,OAAO,EAAE,CAAC;QAC5B,OAAO,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC;IACpB,CAAC;CACF;AAND,sCAMC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { Extractor, ExtractedChunk } from './extractor.interface';
|
|
2
|
+
/**
|
|
3
|
+
* XLSX extractor using the xlsx (SheetJS) package.
|
|
4
|
+
*
|
|
5
|
+
* Each worksheet is treated as a separate "page". The sheet content is
|
|
6
|
+
* serialised to CSV which captures all cell values in a readable format.
|
|
7
|
+
*/
|
|
8
|
+
export declare class XlsxExtractor implements Extractor {
|
|
9
|
+
/**
 * Load the workbook at `filePath` and convert its worksheets to chunks.
 *
 * @param filePath Path of the workbook file to read.
 * @returns A promise for chunks whose `text` is `Sheet: <name>` followed by
 *          the worksheet serialised as CSV, and whose `page` is the
 *          worksheet's 1-based position in the workbook.
 */
extract(filePath: string): Promise<ExtractedChunk[]>;
|
|
10
|
+
}
|
|
11
|
+
//# sourceMappingURL=xlsx-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"xlsx-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/xlsx-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAElE;;;;;GAKG;AACH,qBAAa,aAAc,YAAW,SAAS;IACvC,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;CAkB3D"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.XlsxExtractor = void 0;
|
|
4
|
+
/**
 * XLSX extractor using the xlsx (SheetJS) package.
 *
 * Each worksheet is treated as a separate "page". The sheet content is
 * serialised to CSV which captures all cell values in a readable format.
 */
class XlsxExtractor {
    /**
     * Load a workbook and emit one chunk per worksheet that has content.
     *
     * @param filePath Path of the workbook file to read.
     * @returns Chunks whose `text` is `Sheet: <name>` followed by the sheet's
     *          CSV, and whose `page` is the sheet's 1-based position in the
     *          workbook. Worksheets whose CSV is empty are skipped, but the
     *          positions of later sheets are not renumbered.
     */
    async extract(filePath) {
        // eslint-disable-next-line @typescript-eslint/no-require-imports
        const XLSX = require('xlsx');
        const workbook = XLSX.readFile(filePath, { type: 'file', cellText: true });
        const chunks = [];
        workbook.SheetNames.forEach((sheetName, idx) => {
            const sheet = workbook.Sheets[sheetName];
            const csv = XLSX.utils.sheet_to_csv(sheet, { strip: true });
            // Emptiness has to be judged on the CSV payload: once the
            // "Sheet: <name>" header is prepended the text is never empty,
            // so a check on the combined text would let blank sheets through.
            if (csv.trim().length === 0)
                return;
            const text = `Sheet: ${sheetName}\n${csv}`.trim();
            chunks.push({ text, page: idx + 1 });
        });
        return chunks;
    }
}
|
|
27
|
+
exports.XlsxExtractor = XlsxExtractor;
|
|
28
|
+
//# sourceMappingURL=xlsx-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"xlsx-extractor.js","sourceRoot":"","sources":["../../src/extractors/xlsx-extractor.ts"],"names":[],"mappings":";;;AAEA;;;;;GAKG;AACH,MAAa,aAAa;IACxB,KAAK,CAAC,OAAO,CAAC,QAAgB;QAC5B,iEAAiE;QACjE,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;QAE7B,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QAC3E,MAAM,MAAM,GAAqB,EAAE,CAAC;QAEpC,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,SAAiB,EAAE,GAAW,EAAE,EAAE;YAC7D,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;YACzC,MAAM,GAAG,GAAW,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,KAAK,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACpE,MAAM,IAAI,GAAG,UAAU,SAAS,KAAK,GAAG,EAAE,CAAC,IAAI,EAAE,CAAC;YAClD,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACpB,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC;YACvC,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAnBD,sCAmBC"}
|