@lov3kaizen/agentsea-ingest 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 lovekaizen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,278 @@
+ # @lov3kaizen/agentsea-ingest
+
+ TypeScript-native document processing pipeline for AI/RAG applications.
+
+ ## Features
+
+ - **Multi-format Parsing**: PDF, DOCX, HTML, Markdown, CSV, Excel, JSON
+ - **Intelligent Chunking**: Fixed, recursive, sentence, paragraph, semantic, hierarchical
+ - **Table & Image Extraction**: Automatic extraction with metadata
+ - **Text Cleaning**: Normalization, deduplication, PII removal
+ - **Flexible Pipelines**: Configurable processing stages
+ - **Streaming Support**: Process large documents efficiently
+
+ ## Installation
+
+ ```bash
+ pnpm add @lov3kaizen/agentsea-ingest
+ ```
+
+ ## Quick Start
+
+ ```typescript
+ import { createIngester, pipelines } from '@lov3kaizen/agentsea-ingest';
+
+ // Simple ingestion
+ const ingester = createIngester();
+ const doc = await ingester.ingestFile('./document.pdf');
+ console.log(`Extracted ${doc.chunks.length} chunks`);
+
+ // RAG-optimized pipeline
+ const pipeline = pipelines.rag().build();
+ const result = await pipeline.process({ path: './document.md' });
+ ```
+
+ ## Parsing Documents
+
+ ### Supported Formats
+
+ | Format | Parser | MIME Types |
+ | -------- | -------------- | ----------------------------------------------------------------------- |
+ | PDF | PDFParser | application/pdf |
+ | DOCX | DOCXParser | application/vnd.openxmlformats-officedocument.wordprocessingml.document |
+ | HTML | HTMLParser | text/html |
+ | Markdown | MarkdownParser | text/markdown |
+ | Text | TextParser | text/plain |
+ | CSV | CSVParser | text/csv |
+ | Excel | ExcelParser | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet |
+ | JSON | JSONParser | application/json |
+
+ ### Direct Parser Usage
+
+ ```typescript
+ import {
+   createPDFParser,
+   createMarkdownParser,
+ } from '@lov3kaizen/agentsea-ingest';
+ import { readFileSync } from 'fs';
+
+ const pdfParser = createPDFParser();
+ const buffer = readFileSync('./document.pdf');
+ const result = await pdfParser.parse(buffer);
+
+ console.log(result.text);
+ console.log(result.elements);
+ console.log(result.tables);
+ ```
+
+ ## Chunking Strategies
+
+ ### Fixed Size
+
+ ```typescript
+ import { createFixedChunker } from '@lov3kaizen/agentsea-ingest';
+
+ const chunker = createFixedChunker();
+ const chunks = chunker.chunk(text, {
+   maxTokens: 512,
+   overlap: 50,
+   splitOnSentences: true,
+ });
+ ```
+
+ ### Recursive
+
+ ```typescript
+ import { createRecursiveChunker } from '@lov3kaizen/agentsea-ingest';
+
+ const chunker = createRecursiveChunker();
+ const chunks = chunker.chunk(text, {
+   maxTokens: 512,
+   separators: ['\n\n', '\n', '. ', ' '],
+   keepSeparator: true,
+ });
+ ```
+
+ ### Semantic
+
+ ```typescript
+ import { createSemanticChunker } from '@lov3kaizen/agentsea-ingest';
+
+ const chunker = createSemanticChunker();
+ const chunks = await chunker.chunk(text, {
+   maxTokens: 512,
+   similarityThreshold: 0.5,
+   embedFunction: async (text) => myEmbeddingModel(text),
+ });
+ ```
+
+ ### Hierarchical
+
+ ```typescript
+ import { createHierarchicalChunker } from '@lov3kaizen/agentsea-ingest';
+
+ const chunker = createHierarchicalChunker();
+ const chunks = chunker.chunk(markdownText, {
+   maxTokens: 512,
+   headingLevels: [1, 2, 3],
+   includeParentContext: true,
+ });
+ ```
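+
+ ### Sentence
+
+ `SentenceChunker` and `ParagraphChunker` (listed under Chunkers below) follow the same pattern as the strategies above. A minimal sketch, assuming the factory is exported as `createSentenceChunker` like its siblings:
+
+ ```typescript
+ import { createSentenceChunker } from '@lov3kaizen/agentsea-ingest';
+
+ const chunker = createSentenceChunker();
+ // Packs whole sentences into chunks up to the token budget,
+ // so no sentence is split mid-way.
+ const chunks = chunker.chunk(text, {
+   maxTokens: 512,
+ });
+ ```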
+
+ ## Pipeline Builder
+
+ ```typescript
+ import { createPipelineBuilder } from '@lov3kaizen/agentsea-ingest';
+
+ const pipeline = createPipelineBuilder()
+   .withName('my-pipeline')
+   .withStages(['load', 'parse', 'clean', 'chunk', 'embed'])
+   .withChunking({
+     strategy: 'semantic',
+     maxTokens: 512,
+     overlap: 50,
+   })
+   .withCleaning({
+     operations: ['normalize_whitespace', 'remove_urls', 'trim'],
+   })
+   .withCallbacks({
+     onDocumentComplete: (doc) => console.log(`Processed: ${doc.id}`),
+   })
+   .build();
+
+ const result = await pipeline.process({ path: './document.pdf' });
+ ```
+
+ ## Pre-built Pipelines
+
+ ```typescript
+ import { pipelines } from '@lov3kaizen/agentsea-ingest';
+
+ // Simple text extraction
+ const simple = pipelines.simple().build();
+
+ // Full processing with all stages
+ const full = pipelines.full().build();
+
+ // RAG-optimized pipeline
+ const rag = pipelines.rag().build();
+
+ // Document analysis (no chunking)
+ const analysis = pipelines.analysis().build();
+
+ // OCR pipeline for scanned documents
+ const ocr = pipelines.ocr().build();
+ ```
+
+ ## Ingester
+
+ The `Ingester` class provides a high-level API for document ingestion:
+
+ ```typescript
+ import { createIngester } from '@lov3kaizen/agentsea-ingest';
+
+ const ingester = createIngester({
+   chunking: {
+     strategy: 'recursive',
+     maxTokens: 512,
+   },
+   concurrency: 4,
+   fileSizeLimit: 10 * 1024 * 1024, // 10MB
+ });
+
+ // Ingest single file
+ const doc = await ingester.ingestFile('./document.pdf');
+
+ // Ingest from URL
+ const webDoc = await ingester.ingestUrl('https://example.com/page.html');
+
+ // Ingest from buffer
+ const bufferDoc = await ingester.ingestBuffer(buffer, 'document.pdf');
+
+ // Ingest directory
+ const results = await ingester.ingestDirectory('./documents', {
+   recursive: true,
+   include: ['*.pdf', '*.docx'],
+   exclude: ['draft-*'],
+ });
+ ```
+
+ ## Watch Mode
+
+ ```typescript
+ const ingester = createIngester({
+   watchMode: {
+     enabled: true,
+     paths: ['./documents'],
+     include: ['*.pdf', '*.md'],
+     debounceDelay: 1000,
+     processExisting: true,
+   },
+ });
+
+ ingester.startWatching();
+ // Files added/modified in ./documents will be automatically processed
+ ```
+
+ ## Events
+
+ ```typescript
+ import { createPipeline } from '@lov3kaizen/agentsea-ingest';
+
+ const pipeline = createPipeline(config); // config: PipelineConfig
+ const emitter = pipeline.getEventEmitter();
+
+ emitter.on('document:loaded', (event) => {
+   console.log(`Loaded: ${event.documentId}`);
+ });
+
+ emitter.on('document:chunked', (event) => {
+   console.log(`Created ${event.chunkCount} chunks`);
+ });
+
+ emitter.on('document:completed', (event) => {
+   console.log(`Completed: ${event.document.id}`);
+ });
+ ```
+
+ ## API Reference
+
+ ### Types
+
+ - `ProcessedDocument` - Processed document with chunks and metadata (see the sketch below)
+ - `Chunk` - Text chunk with metadata and optional embedding
+ - `Element` - Document element (paragraph, heading, list, etc.)
+ - `TableData` - Extracted table data
+ - `ImageData` - Extracted image data
+ - `PipelineConfig` - Pipeline configuration options
+ - `ChunkingOptions` - Chunking configuration options
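+
+ A minimal sketch of consuming these types after ingestion. `doc.chunks` appears in the Quick Start above; the `text`, `metadata`, and `embedding` field names are assumptions based on the type descriptions, not a confirmed shape:
+
+ ```typescript
+ import {
+   createIngester,
+   type ProcessedDocument,
+ } from '@lov3kaizen/agentsea-ingest';
+
+ const ingester = createIngester();
+ const doc: ProcessedDocument = await ingester.ingestFile('./document.pdf');
+
+ for (const chunk of doc.chunks) {
+   // `text` / `metadata` / `embedding` are assumed field names.
+   console.log(chunk.text.slice(0, 80), chunk.metadata);
+   if (chunk.embedding) {
+     console.log(`embedding dims: ${chunk.embedding.length}`);
+   }
+ }
+ ```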
+
+ ### Core Classes
+
+ - `Pipeline` - Document processing pipeline
+ - `PipelineBuilder` - Fluent pipeline builder
+ - `Ingester` - High-level document ingester
+ - `ParserRegistry` - Parser management (hypothetical usage sketch below)
+ - `ChunkerRegistry` - Chunker management
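+
+ The registries are not demonstrated elsewhere in this README, so the following is a hypothetical sketch only: the `register`/`getParser` method names and the MIME-type lookup are assumptions about the registry surface, not confirmed API:
+
+ ```typescript
+ import { ParserRegistry, createMarkdownParser } from '@lov3kaizen/agentsea-ingest';
+
+ const registry = new ParserRegistry();
+ registry.register(createMarkdownParser()); // assumed method name
+
+ // Assumed lookup by MIME type (see the Supported Formats table).
+ const parser = registry.getParser('text/markdown');
+ ```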
+
+ ### Parsers
+
+ - `PDFParser` - PDF document parsing
+ - `DOCXParser` - Word document parsing
+ - `HTMLParser` - HTML document parsing
+ - `MarkdownParser` - Markdown parsing
+ - `TextParser` - Plain text parsing
+ - `CSVParser` - CSV file parsing
+ - `ExcelParser` - Excel file parsing
+ - `JSONParser` - JSON file parsing
+
+ ### Chunkers
+
+ - `FixedChunker` - Fixed-size chunks
+ - `RecursiveChunker` - Recursive splitting
+ - `SentenceChunker` - Sentence-based chunks
+ - `ParagraphChunker` - Paragraph-based chunks
+ - `SemanticChunker` - Semantic similarity-based chunks
+ - `HierarchicalChunker` - Heading-based hierarchy
+
+ ## License
+
+ MIT