any-extractor 2.0.0 → 2.0.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ type AnyParserMethod = {
2
+ mimes: string[];
3
+ apply: (_: Buffer, ___: ExtractingOptions, ____: ExtractorConfig) => Promise<string>;
4
+ };
5
+ type ExtractedFile = {
6
+ path: string;
7
+ content: Buffer;
8
+ };
9
+ type ExtractorConfig = {
10
+ llm?: {
11
+ llmProvider: 'openai' | 'google' | 'anthropic';
12
+ visionModel: string;
13
+ apikey: string;
14
+ };
15
+ confluence?: {
16
+ baseUrl: string;
17
+ email: string;
18
+ apiKey: string;
19
+ };
20
+ };
21
+ type ExtractingOptions = {
22
+ extractImages: boolean;
23
+ imageExtractionMethod: 'llm' | 'ocr';
24
+ language: SupportedOCRLanguage;
25
+ };
26
+ type ConfluenceOptions = {
27
+ extractAttachments: boolean;
28
+ extractImages: boolean;
29
+ imageExtractionMethod: 'llm' | 'ocr';
30
+ language: SupportedOCRLanguage;
31
+ };
32
+ type SupportedOCRLanguage = 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos' | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu' | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra' | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv' | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat' | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit' | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori' | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv' | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel' | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl' | 'vie' | 'yid';
33
+ type ExtractedXmlItem = {
34
+ type: string;
35
+ content: string;
36
+ };
37
+
38
+ declare class AnyExtractor {
39
+ private extractorConfig;
40
+ constructor(extractorConfig?: ExtractorConfig);
41
+ private mimeParserMap;
42
+ addParser: (method: AnyParserMethod) => this;
43
+ parseFile: (input: string | Buffer, basicAuth?: string | null, extractingOptions?: ExtractingOptions) => Promise<string>;
44
+ parseConfluenceDoc: (pageId: string, extractingOptions?: ConfluenceOptions) => Promise<string>;
45
+ }
46
+
47
+ /**
48
+ * Get an extractor with parsers for various file formats.
49
+ *
50
+ * @param {ExtractorConfig} [config] - Optional configuration for the extractor.
51
+ * @returns {AnyExtractor} - The configured AnyExtractor instance.
52
+ */
53
+ declare const getAnyExtractor: (config?: ExtractorConfig) => AnyExtractor;
54
+
55
+ export { type AnyParserMethod, type ConfluenceOptions, type ExtractedFile, type ExtractedXmlItem, type ExtractingOptions, type ExtractorConfig, type SupportedOCRLanguage, getAnyExtractor };
@@ -0,0 +1,55 @@
1
+ type AnyParserMethod = {
2
+ mimes: string[];
3
+ apply: (_: Buffer, ___: ExtractingOptions, ____: ExtractorConfig) => Promise<string>;
4
+ };
5
+ type ExtractedFile = {
6
+ path: string;
7
+ content: Buffer;
8
+ };
9
+ type ExtractorConfig = {
10
+ llm?: {
11
+ llmProvider: 'openai' | 'google' | 'anthropic';
12
+ visionModel: string;
13
+ apikey: string;
14
+ };
15
+ confluence?: {
16
+ baseUrl: string;
17
+ email: string;
18
+ apiKey: string;
19
+ };
20
+ };
21
+ type ExtractingOptions = {
22
+ extractImages: boolean;
23
+ imageExtractionMethod: 'llm' | 'ocr';
24
+ language: SupportedOCRLanguage;
25
+ };
26
+ type ConfluenceOptions = {
27
+ extractAttachments: boolean;
28
+ extractImages: boolean;
29
+ imageExtractionMethod: 'llm' | 'ocr';
30
+ language: SupportedOCRLanguage;
31
+ };
32
+ type SupportedOCRLanguage = 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos' | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu' | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra' | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv' | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat' | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit' | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori' | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv' | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel' | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl' | 'vie' | 'yid';
33
+ type ExtractedXmlItem = {
34
+ type: string;
35
+ content: string;
36
+ };
37
+
38
+ declare class AnyExtractor {
39
+ private extractorConfig;
40
+ constructor(extractorConfig?: ExtractorConfig);
41
+ private mimeParserMap;
42
+ addParser: (method: AnyParserMethod) => this;
43
+ parseFile: (input: string | Buffer, basicAuth?: string | null, extractingOptions?: ExtractingOptions) => Promise<string>;
44
+ parseConfluenceDoc: (pageId: string, extractingOptions?: ConfluenceOptions) => Promise<string>;
45
+ }
46
+
47
+ /**
48
+ * Get an extractor with parsers for various file formats.
49
+ *
50
+ * @param {ExtractorConfig} [config] - Optional configuration for the extractor.
51
+ * @returns {AnyExtractor} - The configured AnyExtractor instance.
52
+ */
53
+ declare const getAnyExtractor: (config?: ExtractorConfig) => AnyExtractor;
54
+
55
+ export { type AnyParserMethod, type ConfluenceOptions, type ExtractedFile, type ExtractedXmlItem, type ExtractingOptions, type ExtractorConfig, type SupportedOCRLanguage, getAnyExtractor };