@lov3kaizen/agentsea-ingest 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +278 -0
- package/dist/index.d.ts +1558 -0
- package/dist/index.js +4007 -0
- package/dist/index.js.map +1 -0
- package/package.json +89 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 lovekaizen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,278 @@
# @lov3kaizen/agentsea-ingest

TypeScript-native document processing pipeline for AI/RAG applications.

## Features

- **Multi-format Parsing**: PDF, DOCX, HTML, Markdown, CSV, Excel, JSON
- **Intelligent Chunking**: Fixed, recursive, sentence, paragraph, semantic, hierarchical
- **Table & Image Extraction**: Automatic extraction with metadata
- **Text Cleaning**: Normalization, deduplication, PII removal
- **Flexible Pipelines**: Configurable processing stages
- **Streaming Support**: Process large documents efficiently

## Installation

```bash
pnpm add @lov3kaizen/agentsea-ingest
```

## Quick Start

```typescript
import { createIngester, pipelines } from '@lov3kaizen/agentsea-ingest';

// Simple ingestion
const ingester = createIngester();
const doc = await ingester.ingestFile('./document.pdf');
console.log(`Extracted ${doc.chunks.length} chunks`);

// RAG-optimized pipeline
const pipeline = pipelines.rag().build();
const result = await pipeline.process({ path: './document.md' });
```

## Parsing Documents

### Supported Formats

| Format   | Parser         | MIME Types                                                               |
| -------- | -------------- | ------------------------------------------------------------------------ |
| PDF      | PDFParser      | application/pdf                                                          |
| DOCX     | DOCXParser     | application/vnd.openxmlformats-officedocument.wordprocessingml.document  |
| HTML     | HTMLParser     | text/html                                                                |
| Markdown | MarkdownParser | text/markdown                                                            |
| Text     | TextParser     | text/plain                                                               |
| CSV      | CSVParser      | text/csv                                                                 |
| Excel    | ExcelParser    | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet        |
| JSON     | JSONParser     | application/json                                                         |
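
Parser selection is automatic: ingestion resolves a parser from the file's extension or MIME type, so the table above also tells you which parser a given input will hit. A minimal sketch using `ingestBuffer` (documented in the Ingester section below), assuming the filename argument is what drives detection:

```typescript
import { createIngester } from '@lov3kaizen/agentsea-ingest';
import { readFileSync } from 'fs';

const ingester = createIngester();

// Assumption: the filename passed to ingestBuffer is used to pick the
// parser, so 'report.docx' routes this buffer to DOCXParser.
const buffer = readFileSync('./report.docx');
const doc = await ingester.ingestBuffer(buffer, 'report.docx');
```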

### Direct Parser Usage

```typescript
import { createPDFParser } from '@lov3kaizen/agentsea-ingest';
import { readFileSync } from 'fs';

const pdfParser = createPDFParser();
const buffer = readFileSync('./document.pdf');
const result = await pdfParser.parse(buffer);

console.log(result.text); // extracted plain text
console.log(result.elements); // document elements (paragraphs, headings, lists, ...)
console.log(result.tables); // extracted table data
```
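
Every parser exposes the same `parse()` contract, so swapping formats is a one-line change. A sketch with `createMarkdownParser`, assuming it accepts a `Buffer` the way the PDF parser does (this README doesn't document the accepted input types):

```typescript
import { createMarkdownParser } from '@lov3kaizen/agentsea-ingest';
import { readFileSync } from 'fs';

const mdParser = createMarkdownParser();
// Assumption: parse() takes a Buffer, mirroring pdfParser.parse(buffer) above.
const mdResult = await mdParser.parse(readFileSync('./notes.md'));
console.log(mdResult.elements);
```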

## Chunking Strategies

### Fixed Size

```typescript
import { createFixedChunker } from '@lov3kaizen/agentsea-ingest';

const chunker = createFixedChunker();
const chunks = chunker.chunk(text, {
  maxTokens: 512,
  overlap: 50,
  splitOnSentences: true,
});
```

### Recursive

```typescript
import { createRecursiveChunker } from '@lov3kaizen/agentsea-ingest';

const chunker = createRecursiveChunker();
const chunks = chunker.chunk(text, {
  maxTokens: 512,
  separators: ['\n\n', '\n', '. ', ' '],
  keepSeparator: true,
});
```

### Semantic

```typescript
import { createSemanticChunker } from '@lov3kaizen/agentsea-ingest';

const chunker = createSemanticChunker();
const chunks = await chunker.chunk(text, {
  maxTokens: 512,
  similarityThreshold: 0.5,
  embedFunction: async (text) => myEmbeddingModel(text),
});
```

### Hierarchical

```typescript
import { createHierarchicalChunker } from '@lov3kaizen/agentsea-ingest';

const chunker = createHierarchicalChunker();
const chunks = chunker.chunk(markdownText, {
  maxTokens: 512,
  headingLevels: [1, 2, 3],
  includeParentContext: true,
});
```
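
### Sentence & Paragraph

`SentenceChunker` and `ParagraphChunker` (listed in the API reference below) round out the strategies. A sketch assuming the same factory naming pattern as the chunkers above; `createSentenceChunker` itself is not shown in this README, so treat the name as an assumption:

```typescript
import { createSentenceChunker } from '@lov3kaizen/agentsea-ingest';

// Assumed factory name, following createFixedChunker / createRecursiveChunker.
const chunker = createSentenceChunker();
const chunks = chunker.chunk(text, {
  maxTokens: 512,
});
```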

## Pipeline Builder

```typescript
import { createPipelineBuilder } from '@lov3kaizen/agentsea-ingest';

const pipeline = createPipelineBuilder()
  .withName('my-pipeline')
  .withStages(['load', 'parse', 'clean', 'chunk', 'embed'])
  .withChunking({
    strategy: 'semantic',
    maxTokens: 512,
    overlap: 50,
  })
  .withCleaning({
    operations: ['normalize_whitespace', 'remove_urls', 'trim'],
  })
  .withCallbacks({
    onDocumentComplete: (doc) => console.log(`Processed: ${doc.id}`),
  })
  .build();

const result = await pipeline.process({ path: './document.pdf' });
```

## Pre-built Pipelines

```typescript
import { pipelines } from '@lov3kaizen/agentsea-ingest';

// Simple text extraction
const simple = pipelines.simple().build();

// Full processing with all stages
const full = pipelines.full().build();

// RAG-optimized pipeline
const rag = pipelines.rag().build();

// Document analysis (no chunking)
const analysis = pipelines.analysis().build();

// OCR pipeline for scanned documents
const ocr = pipelines.ocr().build();
```
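
If these helpers return the same `PipelineBuilder` shown above (plausible given the chained `.build()`, but not confirmed by this README), preset options can be overridden before building:

```typescript
// Assumption: pipelines.rag() returns the PipelineBuilder documented above.
const pipeline = pipelines
  .rag()
  .withChunking({ strategy: 'recursive', maxTokens: 256 })
  .build();
```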

## Ingester

The `Ingester` class provides a high-level API for document ingestion:

```typescript
import { createIngester } from '@lov3kaizen/agentsea-ingest';

const ingester = createIngester({
  chunking: {
    strategy: 'recursive',
    maxTokens: 512,
  },
  concurrency: 4,
  fileSizeLimit: 10 * 1024 * 1024, // 10MB
});

// Ingest single file
const doc = await ingester.ingestFile('./document.pdf');

// Ingest from URL
const webDoc = await ingester.ingestUrl('https://example.com/page.html');

// Ingest from buffer
const bufferDoc = await ingester.ingestBuffer(buffer, 'document.pdf');

// Ingest directory
const results = await ingester.ingestDirectory('./documents', {
  recursive: true,
  include: ['*.pdf', '*.docx'],
  exclude: ['draft-*'],
});
```
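
`ingestDirectory` yields one processed document per matched file. A sketch of consuming the results, assuming the resolved value is an array of the same documents `ingestFile` returns (the exact return type is in `dist/index.d.ts`):

```typescript
// Assumption: results is an array of processed documents, matching
// the shape returned by ingestFile.
for (const doc of results) {
  console.log(`${doc.id}: ${doc.chunks.length} chunks`);
}
```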

## Watch Mode

```typescript
const ingester = createIngester({
  watchMode: {
    enabled: true,
    paths: ['./documents'],
    include: ['*.pdf', '*.md'],
    debounceDelay: 1000,
    processExisting: true,
  },
});

ingester.startWatching();
// Files added/modified in ./documents will be automatically processed
```

## Events

```typescript
const pipeline = createPipeline(config);
const emitter = pipeline.getEventEmitter();

emitter.on('document:loaded', (event) => {
  console.log(`Loaded: ${event.documentId}`);
});

emitter.on('document:chunked', (event) => {
  console.log(`Created ${event.chunkCount} chunks`);
});

emitter.on('document:completed', (event) => {
  console.log(`Completed: ${event.document.id}`);
});
```

## API Reference

### Types

- `ProcessedDocument` - Processed document with chunks and metadata
- `Chunk` - Text chunk with metadata and optional embedding
- `Element` - Document element (paragraph, heading, list, etc.)
- `TableData` - Extracted table data
- `ImageData` - Extracted image data
- `PipelineConfig` - Pipeline configuration options
- `ChunkingOptions` - Chunking configuration options
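
The authoritative definitions live in `dist/index.d.ts`. A typed consumption sketch (field names beyond `id` and `chunks` are assumptions drawn from the examples above):

```typescript
import type { ProcessedDocument, Chunk } from '@lov3kaizen/agentsea-ingest';

// Assumption: each Chunk carries its text; check dist/index.d.ts for the
// real field names before relying on this.
function preview(doc: ProcessedDocument): string[] {
  return doc.chunks.map((chunk: Chunk) => chunk.text.slice(0, 80));
}
```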

### Core Classes

- `Pipeline` - Document processing pipeline
- `PipelineBuilder` - Fluent pipeline builder
- `Ingester` - High-level document ingester
- `ParserRegistry` - Parser management
- `ChunkerRegistry` - Chunker management

### Parsers

- `PDFParser` - PDF document parsing
- `DOCXParser` - Word document parsing
- `HTMLParser` - HTML document parsing
- `MarkdownParser` - Markdown parsing
- `TextParser` - Plain text parsing
- `CSVParser` - CSV file parsing
- `ExcelParser` - Excel file parsing
- `JSONParser` - JSON file parsing

### Chunkers

- `FixedChunker` - Fixed-size chunks
- `RecursiveChunker` - Recursive splitting
- `SentenceChunker` - Sentence-based chunks
- `ParagraphChunker` - Paragraph-based chunks
- `SemanticChunker` - Semantic similarity-based
- `HierarchicalChunker` - Heading-based hierarchy

## License

MIT