@ooneex/rag 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +337 -0
- package/dist/index.d.ts +165 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +17 -0
- package/package.json +43 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ooneex
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
# @ooneex/rag
|
|
2
|
+
|
|
3
|
+
Retrieval-Augmented Generation toolkit with vector database integration, document embedding, and semantic search for AI-powered knowledge retrieval.
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
✅ **Vector Database** - Abstract base class for building custom vector databases with LanceDB
|
|
12
|
+
|
|
13
|
+
✅ **PDF Conversion** - Convert PDF documents into structured chunks with heading and page metadata
|
|
14
|
+
|
|
15
|
+
✅ **Hybrid Search** - Full-text and vector-based hybrid search with RRF reranking
|
|
16
|
+
|
|
17
|
+
✅ **OpenAI Embeddings** - Built-in support for OpenAI embedding models (ada-002, 3-small, 3-large)
|
|
18
|
+
|
|
19
|
+
✅ **Schema Definition** - Typed schema definitions using Apache Arrow data types
|
|
20
|
+
|
|
21
|
+
✅ **Index Management** - Create scalar (btree, bitmap, labelList) and IVF-PQ vector indexes
|
|
22
|
+
|
|
23
|
+
✅ **Query Filtering** - Composable filter conditions with AND, OR, NOT logical operators
|
|
24
|
+
|
|
25
|
+
✅ **Query Analysis** - Explain and analyze query plans for performance tuning
|
|
26
|
+
|
|
27
|
+
✅ **Container Integration** - Decorator-based registration with the DI container
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
bun add @ooneex/rag
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
### Defining a Vector Database
|
|
38
|
+
|
|
39
|
+
```typescript
|
|
40
|
+
import { AbstractVectorDatabase } from '@ooneex/rag';
|
|
41
|
+
import { Utf8 } from 'apache-arrow';
|
|
42
|
+
import type { EmbeddingProviderType, EmbeddingModelType, FieldValueType } from '@ooneex/rag';
|
|
43
|
+
|
|
44
|
+
type ArticleData = {
|
|
45
|
+
title: string;
|
|
46
|
+
category: string;
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
class ArticleVectorDatabase extends AbstractVectorDatabase<ArticleData> {
|
|
50
|
+
public getDatabaseUri(): string {
|
|
51
|
+
return './data/articles.lance';
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
public getEmbeddingModel(): { provider: EmbeddingProviderType; model: EmbeddingModelType['model'] } {
|
|
55
|
+
return { provider: 'openai', model: 'text-embedding-3-small' };
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
public getSchema(): { [K in keyof ArticleData]: FieldValueType } {
|
|
59
|
+
return {
|
|
60
|
+
title: new Utf8(),
|
|
61
|
+
category: new Utf8(),
|
|
62
|
+
};
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Connecting and Adding Data
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
const db = new ArticleVectorDatabase();
|
|
71
|
+
await db.connect();
|
|
72
|
+
|
|
73
|
+
const table = await db.open('articles');
|
|
74
|
+
|
|
75
|
+
await table.add([
|
|
76
|
+
{ id: '1', text: 'Introduction to RAG systems', title: 'RAG Intro', category: 'AI' },
|
|
77
|
+
{ id: '2', text: 'Vector databases explained', title: 'Vector DBs', category: 'Database' },
|
|
78
|
+
]);
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Searching
|
|
82
|
+
|
|
83
|
+
```typescript
|
|
84
|
+
const results = await table.search('retrieval augmented generation', {
|
|
85
|
+
limit: 5,
|
|
86
|
+
select: ['title', 'category'],
|
|
87
|
+
filter: { field: 'category', op: '=', value: 'AI' },
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
console.log(results);
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Converting PDFs to Chunks
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
import { Convertor } from '@ooneex/rag';
|
|
97
|
+
|
|
98
|
+
const convertor = new Convertor('/path/to/document.pdf');
|
|
99
|
+
|
|
100
|
+
for await (const chunk of convertor.convert({ outputDir: './output' })) {
|
|
101
|
+
console.log(chunk.text);
|
|
102
|
+
console.log(chunk.metadata.heading);
|
|
103
|
+
console.log(chunk.metadata.pages);
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Composable Filters
|
|
108
|
+
|
|
109
|
+
```typescript
|
|
110
|
+
const results = await table.search('machine learning', {
|
|
111
|
+
limit: 10,
|
|
112
|
+
filter: {
|
|
113
|
+
AND: [
|
|
114
|
+
{ field: 'category', op: '=', value: 'AI' },
|
|
115
|
+
{ NOT: { field: 'title', op: 'LIKE', value: '%draft%' } },
|
|
116
|
+
],
|
|
117
|
+
},
|
|
118
|
+
});
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Query Plan Analysis
|
|
122
|
+
|
|
123
|
+
```typescript
|
|
124
|
+
// Explain the query plan
|
|
125
|
+
const plan = await table.explainPlan('search query', {
|
|
126
|
+
limit: 10,
|
|
127
|
+
verbose: true,
|
|
128
|
+
});
|
|
129
|
+
console.log(plan);
|
|
130
|
+
|
|
131
|
+
// Analyze with runtime metrics
|
|
132
|
+
const analysis = await table.analyzePlan('search query', {
|
|
133
|
+
limit: 10,
|
|
134
|
+
});
|
|
135
|
+
console.log(analysis);
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## API Reference
|
|
139
|
+
|
|
140
|
+
### Classes
|
|
141
|
+
|
|
142
|
+
#### `AbstractVectorDatabase<DataType>` (Abstract)
|
|
143
|
+
|
|
144
|
+
Abstract base class for creating vector database implementations.
|
|
145
|
+
|
|
146
|
+
**Type Parameter:**
|
|
147
|
+
- `DataType` - Record type for additional schema fields
|
|
148
|
+
|
|
149
|
+
**Abstract Methods:**
|
|
150
|
+
|
|
151
|
+
##### `getDatabaseUri(): string`
|
|
152
|
+
|
|
153
|
+
Returns the URI for the LanceDB database storage.
|
|
154
|
+
|
|
155
|
+
##### `getEmbeddingModel(): { provider: EmbeddingProviderType; model: EmbeddingModelType['model'] }`
|
|
156
|
+
|
|
157
|
+
Returns the embedding provider and model configuration.
|
|
158
|
+
|
|
159
|
+
##### `getSchema(): { [K in keyof DataType]: FieldValueType }`
|
|
160
|
+
|
|
161
|
+
Returns the schema definition using Apache Arrow types.
|
|
162
|
+
|
|
163
|
+
**Concrete Methods:**
|
|
164
|
+
|
|
165
|
+
##### `connect(): Promise<void>`
|
|
166
|
+
|
|
167
|
+
Connect to the LanceDB database.
|
|
168
|
+
|
|
169
|
+
##### `getDatabase(): Connection`
|
|
170
|
+
|
|
171
|
+
Get the underlying LanceDB connection. Throws `VectorDatabaseException` if not connected.
|
|
172
|
+
|
|
173
|
+
##### `open(name: string, options?): Promise<VectorTable<DataType>>`
|
|
174
|
+
|
|
175
|
+
Open or create a vector table. Automatically creates btree, full-text search, and IVF-PQ indexes on new tables.
|
|
176
|
+
|
|
177
|
+
**Parameters:**
|
|
178
|
+
- `name` - Table name
|
|
179
|
+
- `options.mode` - `"create"` or `"overwrite"` (default: `"overwrite"`); only used when the table does not exist yet — an existing table is opened as-is
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
#### `VectorTable<DataType>`
|
|
184
|
+
|
|
185
|
+
Provides search, indexing, and data operations on a vector table.
|
|
186
|
+
|
|
187
|
+
**Methods:**
|
|
188
|
+
|
|
189
|
+
##### `add(data): Promise<this>`
|
|
190
|
+
|
|
191
|
+
Add records to the table.
|
|
192
|
+
|
|
193
|
+
##### `search(query, options?): Promise<DataType[]>`
|
|
194
|
+
|
|
195
|
+
Perform hybrid (vector + full-text) search with RRF reranking.
|
|
196
|
+
|
|
197
|
+
**Parameters:**
|
|
198
|
+
- `query` - Search query string
|
|
199
|
+
- `options.limit` - Maximum results (default: 10)
|
|
200
|
+
- `options.select` - Fields to return
|
|
201
|
+
- `options.filter` - Filter conditions
|
|
202
|
+
- `options.nprobes` - IVF partitions to search
|
|
203
|
+
- `options.refineFactor` - Refine step multiplier
|
|
204
|
+
- `options.fastSearch` - Skip un-indexed data (default: true)
|
|
205
|
+
|
|
206
|
+
##### `createIndex(column, options?): Promise<this>`
|
|
207
|
+
|
|
208
|
+
Create a scalar index (btree, bitmap, or labelList).
|
|
209
|
+
|
|
210
|
+
##### `createVectorIndex(column?, options?): Promise<this>`
|
|
211
|
+
|
|
212
|
+
Create an IVF-PQ vector index.
|
|
213
|
+
|
|
214
|
+
##### `explainPlan(query, options?): Promise<string>`
|
|
215
|
+
|
|
216
|
+
Return the resolved query plan as a string; `options.verbose` defaults to `true`.
|
|
217
|
+
|
|
218
|
+
##### `analyzePlan(query, options?): Promise<string>`
|
|
219
|
+
|
|
220
|
+
Execute and return a physical plan with runtime metrics.
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
#### `Convertor`
|
|
225
|
+
|
|
226
|
+
Converts PDF documents into structured text chunks.
|
|
227
|
+
|
|
228
|
+
**Constructor:**
|
|
229
|
+
```typescript
|
|
230
|
+
new Convertor(source: string)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
**Methods:**
|
|
234
|
+
|
|
235
|
+
##### `convert(options?): AsyncGenerator<ChunkType, { json: ConvertorFileType; markdown: ConvertorFileType }>`
|
|
236
|
+
|
|
237
|
+
Convert a PDF to chunks, yielding each chunk as it is processed.
|
|
238
|
+
|
|
239
|
+
**Parameters:**
|
|
240
|
+
- `options.outputDir` - Output directory
|
|
241
|
+
- `options.password` - PDF password
|
|
242
|
+
- `options.imageFormat` - `"png"` or `"jpeg"`
|
|
243
|
+
- `options.pages` - Page range
|
|
244
|
+
- `options.quiet` - Suppress output
|
|
245
|
+
|
|
246
|
+
### Types
|
|
247
|
+
|
|
248
|
+
#### `ChunkType`
|
|
249
|
+
|
|
250
|
+
```typescript
|
|
251
|
+
type ChunkType = {
|
|
252
|
+
text: string;
|
|
253
|
+
metadata: {
|
|
254
|
+
heading: string | null;
|
|
255
|
+
page: number | null;
|
|
256
|
+
pages: number[];
|
|
257
|
+
source: string | null;
|
|
258
|
+
};
|
|
259
|
+
};
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
#### `Filter<T>`
|
|
263
|
+
|
|
264
|
+
Composable filter type supporting field conditions and logical operators.
|
|
265
|
+
|
|
266
|
+
```typescript
|
|
267
|
+
type Filter<T> =
|
|
268
|
+
| FilterCondition<T>
|
|
269
|
+
| { AND: Filter<T>[] }
|
|
270
|
+
| { OR: Filter<T>[] }
|
|
271
|
+
| { NOT: Filter<T> };
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
#### `FilterCondition<T>`
|
|
275
|
+
|
|
276
|
+
Individual filter conditions with typed operators.
|
|
277
|
+
|
|
278
|
+
```typescript
|
|
279
|
+
type FilterCondition<T> =
|
|
280
|
+
| { field: FilterField<T>; op: '>' | '>=' | '<' | '<=' | '='; value: string | number }
|
|
281
|
+
| { field: FilterField<T>; op: 'IN'; value: (string | number)[] }
|
|
282
|
+
| { field: FilterField<T>; op: 'LIKE' | 'NOT LIKE'; value: string }
|
|
283
|
+
| { field: FilterField<T>; op: 'IS NULL' | 'IS NOT NULL' }
|
|
284
|
+
| { field: FilterField<T>; op: 'IS TRUE' | 'IS NOT TRUE' | 'IS FALSE' | 'IS NOT FALSE' };
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
#### `EmbeddingProviderType`
|
|
288
|
+
|
|
289
|
+
```typescript
|
|
290
|
+
type EmbeddingProviderType = 'openai';
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
#### `FieldValueType`
|
|
294
|
+
|
|
295
|
+
Apache Arrow types supported for schema fields: `Null`, `Bool`, `Int8`-`Int64`, `Uint8`-`Uint64`, `Float16`-`Float64`, `Utf8`, `LargeUtf8`, `Binary`, `LargeBinary`, `Decimal`, `DateDay`, `DateMillisecond`, and `EmbeddingFunction`.
|
|
296
|
+
|
|
297
|
+
### Exceptions
|
|
298
|
+
|
|
299
|
+
#### `VectorDatabaseException`
|
|
300
|
+
|
|
301
|
+
Thrown when vector database operations fail (e.g., not connected).
|
|
302
|
+
|
|
303
|
+
#### `ConvertorException`
|
|
304
|
+
|
|
305
|
+
Thrown when PDF conversion fails.
|
|
306
|
+
|
|
307
|
+
### Decorators
|
|
308
|
+
|
|
309
|
+
#### `@decorator.vectorDatabase(scope?)`
|
|
310
|
+
|
|
311
|
+
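Class decorator, exported as `decorator.vectorDatabase`, that registers a vector database class with the DI container; the optional `scope` argument defaults to `EContainerScope.Singleton`.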
|
|
312
|
+
|
|
313
|
+
## License
|
|
314
|
+
|
|
315
|
+
This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details.
|
|
316
|
+
|
|
317
|
+
## Contributing
|
|
318
|
+
|
|
319
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
|
320
|
+
|
|
321
|
+
### Development Setup
|
|
322
|
+
|
|
323
|
+
1. Clone the repository
|
|
324
|
+
2. Install dependencies: `bun install`
|
|
325
|
+
3. Run tests: `bun run test`
|
|
326
|
+
4. Build the project: `bun run build`
|
|
327
|
+
|
|
328
|
+
### Guidelines
|
|
329
|
+
|
|
330
|
+
- Write tests for new features
|
|
331
|
+
- Follow the existing code style
|
|
332
|
+
- Update documentation for API changes
|
|
333
|
+
- Ensure all tests pass before submitting PR
|
|
334
|
+
|
|
335
|
+
---
|
|
336
|
+
|
|
337
|
+
Made with ❤️ by the Ooneex team
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import * as lancedb2 from "@lancedb/lancedb";
|
|
2
|
+
import { ScalarType } from "@ooneex/types";
|
|
3
|
+
import { Connection as Connection2 } from "@lancedb/lancedb";
|
|
4
|
+
import { EmbeddingFunction } from "@lancedb/lancedb/embedding";
|
|
5
|
+
import { Binary, Bool, DateDay, DateMillisecond, Decimal, Float16, Float32, Float64, Int8, Int16, Int32, Int64, LargeBinary, LargeUtf8, Null, Uint8, Uint16, Uint32, Uint64, Utf8 } from "apache-arrow";
|
|
6
|
+
import * as lancedb from "@lancedb/lancedb";
|
|
7
|
+
/**
 * Wrapper around a LanceDB table (`lancedb.Table`) that exposes insert,
 * indexing, search and query-plan helpers.
 *
 * `add`, `createIndex` and `createVectorIndex` resolve to the instance itself
 * (`Promise<this>`).
 */
declare class VectorTable<DataType extends Record<string, unknown>> {
|
|
8
|
+
private table;
|
|
9
|
+
private reranker;
|
|
10
|
+
/** @param table The LanceDB table every operation is delegated to. */
constructor(table: lancedb.Table);
|
|
11
|
+
/** Adds rows; every row carries `id` and `text` in addition to the `DataType` fields. */
add(data: ({
|
|
12
|
+
id: string;
|
|
13
|
+
text: string;
|
|
14
|
+
} & DataType)[]): Promise<this>;
|
|
15
|
+
/** Creates an index on `column`; `options.config` accepts a btree, bitmap or labelList index configuration. */
createIndex(column: string, options?: {
|
|
16
|
+
config?: ReturnType<typeof lancedb.Index.btree | typeof lancedb.Index.bitmap | typeof lancedb.Index.labelList>;
|
|
17
|
+
}): Promise<this>;
|
|
18
|
+
/**
 * Creates a vector index. In the bundled implementation `column` defaults to
 * `"vector"` and the index config defaults to `Index.ivfPq()`; `options` are
 * spread over that default.
 */
createVectorIndex(column?: string, options?: Partial<Parameters<lancedb.Table["createIndex"]>[1] & object>): Promise<this>;
|
|
19
|
+
/**
 * Searches the table for `query`. The bundled implementation runs a hybrid
 * search on the `text` column reranked with an RRF reranker; `limit` defaults
 * to 10 and `fastSearch` to `true`. `nprobes` and `refineFactor` are applied
 * only when truthy.
 */
search(query: string, options?: {
|
|
20
|
+
limit?: number;
|
|
21
|
+
select?: (keyof DataType | "id" | "text")[];
|
|
22
|
+
filter?: Filter<DataType>;
|
|
23
|
+
nprobes?: number;
|
|
24
|
+
refineFactor?: number;
|
|
25
|
+
fastSearch?: boolean;
|
|
26
|
+
}): Promise<DataType[]>;
|
|
27
|
+
/** Returns the query plan for the same hybrid query as a string; `limit` defaults to 10 and `verbose` to `true`. */
explainPlan(query: string, options?: {
|
|
28
|
+
limit?: number;
|
|
29
|
+
filter?: Filter<DataType>;
|
|
30
|
+
verbose?: boolean;
|
|
31
|
+
}): Promise<string>;
|
|
32
|
+
/** Returns the result of the underlying query's `analyzePlan()` as a string; `limit` defaults to 10. */
analyzePlan(query: string, options?: {
|
|
33
|
+
limit?: number;
|
|
34
|
+
filter?: Filter<DataType>;
|
|
35
|
+
}): Promise<string>;
|
|
36
|
+
}
|
|
37
|
+
/**
 * Options accepted by `Convertor.convert`.
 *
 * In the bundled implementation `password`, `imageFormat`, `pages` and `quiet`
 * are forwarded to the PDF converter only when they are not `undefined`.
 */
type ConvertorOptionsType = {
|
|
38
|
+
/** Base output directory; each conversion writes into a randomly named sub-directory of it. */
outputDir?: string;
|
|
39
|
+
password?: string;
|
|
40
|
+
imageFormat?: "png" | "jpeg";
|
|
41
|
+
/** Page selection, passed through as a string. */
pages?: string;
|
|
42
|
+
quiet?: boolean;
|
|
43
|
+
};
|
|
44
|
+
/** One text chunk yielded by `Convertor.convert`, together with its provenance metadata. */
type ChunkType = {
|
|
45
|
+
/** Chunk text; the bundled implementation joins the collected pieces with a newline. */
text: string;
|
|
46
|
+
metadata: {
|
|
47
|
+
/** Text of the heading that opened the chunk, or `null` when the content precedes any heading. */
heading: string | null;
|
|
48
|
+
/** Page number recorded on that heading, or `null` when none is available. */
page: number | null;
|
|
49
|
+
/** Page numbers collected from the heading and from the paragraphs/lists in the chunk. */
pages: number[];
|
|
50
|
+
/** Source name: the converter's reported file name, falling back to the base name of the source path. */
source: string | null;
|
|
51
|
+
};
|
|
52
|
+
};
|
|
53
|
+
/** Name and path of an output file produced by a conversion (used for the JSON and Markdown results). */
type ConvertorFileType = {
|
|
54
|
+
name: string;
|
|
55
|
+
path: string;
|
|
56
|
+
};
|
|
57
|
+
/**
 * Contract for document convertors: `convert` is an async generator that
 * yields `ChunkType` values and finally returns the JSON and Markdown output
 * file descriptors.
 */
type IConvertor = {
|
|
58
|
+
convert: (options?: ConvertorOptionsType) => AsyncGenerator<ChunkType, {
|
|
59
|
+
json: ConvertorFileType;
|
|
60
|
+
markdown: ConvertorFileType;
|
|
61
|
+
}>;
|
|
62
|
+
};
|
|
63
|
+
/**
 * Contract implemented by vector database classes.
 * `DataType` describes the extra schema fields stored next to `id` and `text`.
 */
type IVectorDatabase<DataType extends Record<string, unknown>> = {
|
|
64
|
+
/** URI passed to `lancedb.connect()` by the bundled `connect()` implementation. */
getDatabaseUri: () => string;
|
|
65
|
+
connect: () => Promise<void>;
|
|
66
|
+
/** Returns the LanceDB connection; the bundled implementation throws `VectorDatabaseException` before `connect()`. */
getDatabase: () => Connection2;
|
|
67
|
+
getEmbeddingModel: () => {
|
|
68
|
+
provider: EmbeddingProviderType;
|
|
69
|
+
model: EmbeddingModelType["model"];
|
|
70
|
+
};
|
|
71
|
+
/** Maps every `DataType` key to the Arrow field type used for that column. */
getSchema: () => { [K in keyof DataType] : FieldValueType };
|
|
72
|
+
/** Opens the table called `name`, creating it when it does not exist. */
open: (name: string, options?: {
|
|
73
|
+
mode?: "create" | "overwrite";
|
|
74
|
+
}) => Promise<VectorTable<DataType>>;
|
|
75
|
+
};
|
|
76
|
+
/** OpenAI embedding model identifiers accepted by this package. */
type OpenAIModelType = "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large";
|
|
77
|
+
/** Supported embedding providers; `"openai"` is the only member. */
type EmbeddingProviderType = "openai";
|
|
78
|
+
/** Maps each embedding provider to the model identifiers valid for it. */
type EmbeddingModelMapType = {
|
|
79
|
+
openai: OpenAIModelType;
|
|
80
|
+
};
|
|
81
|
+
/** A provider/model pair; `model` is constrained to the models listed for provider `P` in `EmbeddingModelMapType`. */
type EmbeddingModelType<P extends EmbeddingProviderType = EmbeddingProviderType> = {
|
|
82
|
+
provider: P;
|
|
83
|
+
model: EmbeddingModelMapType[P];
|
|
84
|
+
};
|
|
85
|
+
/** Values allowed in a schema definition: the listed Apache Arrow data types, or a LanceDB `EmbeddingFunction`. */
type FieldValueType = Null | Bool | Int8 | Int16 | Int32 | Int64 | Uint8 | Uint16 | Uint32 | Uint64 | Float16 | Float32 | Float64 | Utf8 | LargeUtf8 | Binary | LargeBinary | Decimal | DateDay | DateMillisecond | EmbeddingFunction;
|
|
86
|
+
/** Constructor type of any class producing an `IVectorDatabase`; this is the target accepted by `decorator.vectorDatabase`. */
type VectorDatabaseClassType = new (...args: any[]) => IVectorDatabase<any>;
|
|
87
|
+
/** Field names usable in a filter: the keys of `T` plus the `id` and `text` columns. */
type FilterField<T> = keyof T | "id" | "text";
|
|
88
|
+
/**
 * A single predicate on one field. The shape of `value` depends on `op`:
 * a scalar for comparisons, an array for `IN`, a pattern string for
 * `LIKE` / `NOT LIKE`, and no value for the `IS …` operators.
 */
type FilterCondition<T> = {
|
|
89
|
+
field: FilterField<T>;
|
|
90
|
+
op: ">" | ">=" | "<" | "<=" | "=";
|
|
91
|
+
value: string | number;
|
|
92
|
+
} | {
|
|
93
|
+
field: FilterField<T>;
|
|
94
|
+
op: "IN";
|
|
95
|
+
value: (string | number)[];
|
|
96
|
+
} | {
|
|
97
|
+
field: FilterField<T>;
|
|
98
|
+
op: "LIKE" | "NOT LIKE";
|
|
99
|
+
value: string;
|
|
100
|
+
} | {
|
|
101
|
+
field: FilterField<T>;
|
|
102
|
+
op: "IS NULL" | "IS NOT NULL";
|
|
103
|
+
// `never` forbids passing a value together with a unary operator.
value?: never;
|
|
104
|
+
} | {
|
|
105
|
+
field: FilterField<T>;
|
|
106
|
+
op: "IS TRUE" | "IS NOT TRUE" | "IS FALSE" | "IS NOT FALSE";
|
|
107
|
+
value?: never;
|
|
108
|
+
};
|
|
109
|
+
/**
 * Recursive filter expression: either a single `FilterCondition` or a logical
 * combination (`AND` / `OR` over a list, `NOT` over one nested filter).
 */
type Filter<T> = FilterCondition<T> | {
|
|
110
|
+
AND: Filter<T>[];
|
|
111
|
+
} | {
|
|
112
|
+
OR: Filter<T>[];
|
|
113
|
+
} | {
|
|
114
|
+
NOT: Filter<T>;
|
|
115
|
+
};
|
|
116
|
+
/**
 * Base class for LanceDB-backed vector databases.
 *
 * Subclasses supply the database URI, the embedding provider/model and the
 * extra schema fields; the base class provides connection handling and table
 * opening/creation.
 */
declare abstract class AbstractVectorDatabase<DataType extends Record<string, ScalarType>> implements IVectorDatabase<DataType> {
  private db;
  private embedding;
  constructor();
  /**
   * Returns the URI passed to `lancedb.connect()` by `connect()`.
   * Required by `IVectorDatabase`; it was missing from the generated
   * declaration even though the implementation calls it.
   */
  abstract getDatabaseUri(): string;
  /** Returns the embedding provider and model; read by the constructor. */
  abstract getEmbeddingModel(): {
    provider: EmbeddingProviderType;
    model: EmbeddingModelType["model"];
  };
  /** Returns the Arrow field type for every `DataType` key. */
  abstract getSchema(): { [K in keyof DataType]: FieldValueType };
  /** Connects to the database at `getDatabaseUri()`. */
  connect(): Promise<void>;
  /**
   * Returns the active LanceDB connection.
   * @throws VectorDatabaseException when `connect()` has not been called yet.
   */
  getDatabase(): lancedb2.Connection;
  /**
   * Opens the table called `name`. When it does not exist it is created
   * (default mode `"overwrite"`) with indexes on `id`, `text` and `vector`.
   */
  open(name: string, options?: {
    mode?: "create" | "overwrite";
  }): Promise<VectorTable<DataType>>;
}
|
|
131
|
+
/** Converts a PDF document into text chunks with heading and page metadata. */
declare class Convertor implements IConvertor {
|
|
132
|
+
private readonly source;
|
|
133
|
+
/** @param source Path of the PDF file to convert. */
constructor(source: string);
|
|
134
|
+
/**
 * Runs the conversion as an async generator: yields one `ChunkType` per chunk
 * and finally returns descriptors of the JSON and Markdown output files.
 * The bundled implementation reports failures as `ConvertorException`.
 */
convert(options?: ConvertorOptionsType): AsyncGenerator<ChunkType, {
|
|
135
|
+
json: ConvertorFileType;
|
|
136
|
+
markdown: ConvertorFileType;
|
|
137
|
+
}>;
|
|
138
|
+
private generateChunks;
|
|
139
|
+
private extractContent;
|
|
140
|
+
private extractTexts;
|
|
141
|
+
}
|
|
142
|
+
import { Exception } from "@ooneex/exception";
|
|
143
|
+
/**
 * Exception raised when a PDF conversion fails. The bundled implementation
 * sets `name` to `"ConvertorException"` and uses an internal-server-error status.
 */
declare class ConvertorException extends Exception {
|
|
144
|
+
/** @param data Optional context attached to the exception (defaults to an empty object in the implementation). */
constructor(message: string, data?: Record<string, unknown>);
|
|
145
|
+
}
|
|
146
|
+
import { EContainerScope } from "@ooneex/container";
|
|
147
|
+
/** Class decorators exported by the package. */
declare const decorator: {
|
|
148
|
+
/**
 * Returns a class decorator that adds the target class to the DI container
 * with the given scope; the implementation defaults `scope` to
 * `EContainerScope.Singleton`.
 */
vectorDatabase: (scope?: EContainerScope) => (target: VectorDatabaseClassType) => void;
|
|
149
|
+
};
|
|
150
|
+
/** Row shape used by the bundled `VectorDatabase` class: a single `name` column. */
type DataType2 = {
|
|
151
|
+
name: string;
|
|
152
|
+
};
|
|
153
|
+
/**
 * Concrete `AbstractVectorDatabase` shipped with the package. The bundled
 * implementation returns an empty database URI, the OpenAI
 * `text-embedding-ada-002` model and a schema with one `Utf8` `name` column.
 */
declare class VectorDatabase extends AbstractVectorDatabase<DataType2> {
|
|
154
|
+
getDatabaseUri(): string;
|
|
155
|
+
getEmbeddingModel(): {
|
|
156
|
+
provider: EmbeddingProviderType;
|
|
157
|
+
model: EmbeddingModelType["model"];
|
|
158
|
+
};
|
|
159
|
+
getSchema(): { [K in keyof DataType2] : FieldValueType };
|
|
160
|
+
}
|
|
161
|
+
import { Exception as Exception2 } from "@ooneex/exception";
|
|
162
|
+
/**
 * Exception raised by vector database operations, e.g. by `getDatabase()`
 * before `connect()` was called. The bundled implementation sets `name` to
 * `"VectorDatabaseException"` and uses an internal-server-error status.
 */
declare class VectorDatabaseException extends Exception2 {
|
|
163
|
+
/** @param data Optional context attached to the exception (defaults to an empty object in the implementation). */
constructor(message: string, data?: Record<string, unknown>);
|
|
164
|
+
}
|
|
165
|
+
export { decorator, VectorTable, VectorDatabaseException, VectorDatabaseClassType, VectorDatabase, OpenAIModelType, IVectorDatabase, IConvertor, FilterField, FilterCondition, Filter, FieldValueType, EmbeddingProviderType, EmbeddingModelType, EmbeddingModelMapType, ConvertorOptionsType, ConvertorFileType, ConvertorException, Convertor, ChunkType, AbstractVectorDatabase };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
import * as lancedb from "@lancedb/lancedb";
// Side-effect import. NOTE(review): presumably registers the "openai" provider
// with the embedding registry queried below — confirm against the LanceDB docs.
import "@lancedb/lancedb/embedding/openai";
import { getRegistry, LanceSchema } from "@lancedb/lancedb/embedding";
import { Utf8 } from "apache-arrow";
import { Exception } from "@ooneex/exception";
import { HttpStatus } from "@ooneex/http-status";
import path from "path";
import { random } from "@ooneex/utils";
import { convert as convertPdf } from "@opendataloader/pdf";
import { container, EContainerScope } from "@ooneex/container";

/** Raised when a vector database operation fails (e.g. used before `connect()`). */
class VectorDatabaseException extends Exception {
  /**
   * @param message Human-readable description of the failure.
   * @param data Optional context attached to the exception.
   */
  constructor(message, data = {}) {
    super(message, { status: HttpStatus.Code.InternalServerError, data });
    this.name = "VectorDatabaseException";
  }
}

/** Operators that take no right-hand value. */
const UNARY_OPERATORS = new Set([
  "IS NULL",
  "IS NOT NULL",
  "IS TRUE",
  "IS NOT TRUE",
  "IS FALSE",
  "IS NOT FALSE",
]);

/**
 * Wraps a string in single quotes for use in a SQL predicate.
 * Embedded single quotes are doubled so a value such as `O'Reilly` stays a
 * single string literal instead of terminating it early.
 */
const quoteSqlString = (value) => `'${value.replace(/'/g, "''")}'`;

/** Renders a filter value: strings are quoted and escaped, anything else is emitted as-is. */
const toSqlLiteral = (value) => (typeof value === "string" ? quoteSqlString(value) : value);

/**
 * Serialises a `Filter` tree into the SQL predicate passed to `.where()`.
 *
 * Logical nodes (`AND`, `OR`, `NOT`) recurse into their children and are
 * parenthesised; leaf conditions are rendered as `<field> <op> [<value>]`.
 * Field names are emitted verbatim, so they must come from trusted code.
 */
const toWhereClause = (filter) => {
  if ("AND" in filter) return `(${filter.AND.map((child) => toWhereClause(child)).join(" AND ")})`;
  if ("OR" in filter) return `(${filter.OR.map((child) => toWhereClause(child)).join(" OR ")})`;
  if ("NOT" in filter) return `NOT (${toWhereClause(filter.NOT)})`;

  const field = String(filter.field);

  if (UNARY_OPERATORS.has(filter.op)) return `${field} ${filter.op}`;

  if (filter.op === "IN") {
    const items = filter.value.map((item) => toSqlLiteral(item));
    return `${field} IN (${items.join(", ")})`;
  }

  // LIKE patterns are always rendered as string literals.
  if (filter.op === "LIKE" || filter.op === "NOT LIKE") {
    return `${field} ${filter.op} ${quoteSqlString(String(filter.value))}`;
  }

  // Comparison operators, and the fallback for any operator not listed above.
  return `${field} ${filter.op} ${toSqlLiteral(filter.value)}`;
};

/** Returns the table's RRF reranker, creating and caching it on first use. */
const resolveReranker = async (vectorTable) => {
  if (!vectorTable.reranker) {
    vectorTable.reranker = await lancedb.rerankers.RRFReranker.create();
  }
  return vectorTable.reranker;
};

/** Wrapper around a LanceDB table offering insert, indexing, hybrid search and plan inspection. */
class VectorTable {
  table;
  reranker = null;

  /** @param table The LanceDB table every operation is delegated to. */
  constructor(table) {
    this.table = table;
  }

  /** Adds rows to the table and returns this instance for chaining. */
  async add(data) {
    await this.table.add(data);
    return this;
  }

  /** Creates an index on `column` with the given options and returns this instance. */
  async createIndex(column, options) {
    await this.table.createIndex(column, options);
    return this;
  }

  /**
   * Creates a vector index on `column` (default `"vector"`).
   * The config defaults to IVF-PQ; `options` are spread over that default.
   */
  async createVectorIndex(column = "vector", options) {
    await this.table.createIndex(column, { config: lancedb.Index.ivfPq(), ...options });
    return this;
  }

  /**
   * Runs a hybrid search on the `text` column, reranked with RRF.
   *
   * @param query Search text.
   * @param options `limit` (default 10), `select`, `filter`, `nprobes`,
   *   `refineFactor` and `fastSearch` (default true). `nprobes` and
   *   `refineFactor` are applied only when truthy.
   * @returns The matching rows as an array.
   */
  async search(query, options) {
    const { limit = 10, select, filter, nprobes, refineFactor, fastSearch = true } = options ?? {};
    const reranker = await resolveReranker(this);

    let builder = this.table.search(query, "hybrid", "text").rerank(reranker).limit(limit);
    if (nprobes) builder = builder.nprobes(nprobes);
    if (refineFactor) builder = builder.refineFactor(refineFactor);
    if (fastSearch) builder = builder.fastSearch();
    if (select) builder = builder.select(select);
    if (filter) builder = builder.where(toWhereClause(filter));

    return builder.toArray();
  }

  /** Returns the query plan of the hybrid query; `limit` defaults to 10 and `verbose` to true. */
  async explainPlan(query, options) {
    const { limit = 10, filter, verbose = true } = options ?? {};
    const reranker = await resolveReranker(this);

    let builder = this.table.search(query, "hybrid", "text").rerank(reranker).limit(limit);
    if (filter) builder = builder.where(toWhereClause(filter));

    return builder.explainPlan(verbose);
  }

  /** Returns the result of `analyzePlan()` for the hybrid query; `limit` defaults to 10. */
  async analyzePlan(query, options) {
    const { limit = 10, filter } = options ?? {};
    const reranker = await resolveReranker(this);

    let builder = this.table.search(query, "hybrid", "text").rerank(reranker).limit(limit);
    if (filter) builder = builder.where(toWhereClause(filter));

    return builder.analyzePlan();
  }
}

/**
 * Base class for LanceDB-backed vector databases. Subclasses implement
 * `getDatabaseUri()`, `getEmbeddingModel()` and `getSchema()`.
 */
class AbstractVectorDatabase {
  db = null;
  embedding;

  constructor() {
    // Calls the subclass hook during construction, so getEmbeddingModel()
    // must not rely on subclass instance fields.
    const { provider, model } = this.getEmbeddingModel();
    this.embedding = getRegistry().get(provider)?.create({ model });
  }

  /** Connects to the database at `getDatabaseUri()`. */
  async connect() {
    this.db = await lancedb.connect(this.getDatabaseUri());
  }

  /**
   * Returns the active connection.
   * @throws VectorDatabaseException when `connect()` has not been called.
   */
  getDatabase() {
    if (!this.db) {
      throw new VectorDatabaseException("Database not connected. Call connect() first.");
    }
    return this.db;
  }

  /**
   * Opens the table `name`, creating it when it does not exist.
   *
   * An existing table is returned as-is and `options` is ignored. A new table
   * gets the columns `id`, `text` (embedding source), `vector` plus the
   * subclass schema, and btree / full-text / IVF-PQ indexes on `id`, `text`
   * and `vector`.
   */
  async open(name, options) {
    const tableNames = await this.getDatabase().tableNames();
    if (tableNames.includes(name)) {
      const existing = await this.getDatabase().openTable(name);
      return new VectorTable(existing);
    }

    const schema = LanceSchema({
      id: new Utf8(),
      text: this.embedding.sourceField(new Utf8()),
      vector: this.embedding.vectorField(),
      ...this.getSchema(),
    });
    const table = await this.getDatabase().createEmptyTable(name, schema, { mode: "overwrite", ...options });
    await table.createIndex("id", { config: lancedb.Index.btree() });
    await table.createIndex("text", { config: lancedb.Index.fts() });
    await table.createIndex("vector", { config: lancedb.Index.ivfPq() });

    return new VectorTable(table);
  }
}

/** Raised when a PDF conversion fails. */
class ConvertorException extends Exception {
  /**
   * @param message Human-readable description of the failure.
   * @param data Optional context attached to the exception.
   */
  constructor(message, data = {}) {
    super(message, { status: HttpStatus.Code.InternalServerError, data });
    this.name = "ConvertorException";
  }
}

/** Converts a PDF into text chunks with heading and page metadata. */
class Convertor {
  source;

  /**
   * @param source Path of the PDF. `/` and `\` are both treated as separators
   *   and the path is re-joined with the platform separator.
   */
  constructor(source) {
    const segments = source.split(/[/\\]/);
    // path.join() ignores empty segments, so a leading separator would be
    // dropped and an absolute path would silently become relative. Re-anchor
    // rooted paths explicitly.
    const isRooted = /^[/\\]/.test(source);
    this.source = path.join(isRooted ? path.sep : "", ...segments);
  }

  /**
   * Converts the PDF, yielding chunks as they are produced.
   *
   * Output goes into a randomly named sub-directory of `options.outputDir`.
   * After all chunks are yielded, the JSON and Markdown outputs are copied to
   * randomly named files and the originals are deleted.
   *
   * @returns `{ json, markdown }` descriptors (`name`, `path`) of the renamed files.
   * @throws ConvertorException when an output file is missing or any step fails.
   */
  async *convert(options = {}) {
    try {
      const runId = random.nanoid(15);
      const outputDir = path.join(options.outputDir ?? "", runId);
      const { password, imageFormat, quiet, pages } = options;

      await convertPdf([this.source], {
        outputDir,
        format: "json,markdown",
        imageDir: path.join(outputDir, "images"),
        imageOutput: "external",
        // Forward optional settings only when the caller supplied them.
        ...(password !== undefined && { password }),
        ...(imageFormat !== undefined && { imageFormat }),
        ...(quiet !== undefined && { quiet }),
        ...(pages !== undefined && { pages }),
      });

      // Locate the first JSON and Markdown files the converter produced.
      const glob = new Bun.Glob("*");
      let jsonFile;
      let markdownFile;
      for await (const entry of glob.scan(outputDir)) {
        if (!jsonFile && entry.endsWith(".json")) jsonFile = entry;
        if (!markdownFile && entry.endsWith(".md")) markdownFile = entry;
        if (jsonFile && markdownFile) break;
      }

      if (!jsonFile) {
        throw new ConvertorException("No JSON output file found after conversion", { source: this.source });
      }
      if (!markdownFile) {
        throw new ConvertorException("No Markdown output file found after conversion", { source: this.source });
      }

      const jsonSourcePath = path.join(outputDir, jsonFile);
      const parsed = await Bun.file(jsonSourcePath).json();
      const sourceName = parsed["file name"] ?? path.basename(this.source);

      yield* this.generateChunks(parsed.kids ?? [], sourceName);

      const jsonName = `${random.nanoid(15)}.json`;
      const markdownName = `${random.nanoid(15)}.md`;
      const markdownSourcePath = path.join(outputDir, markdownFile);
      const jsonPath = path.join(outputDir, jsonName);
      const markdownPath = path.join(outputDir, markdownName);

      // Copy first, delete afterwards, so a failed copy never loses the originals.
      await Promise.all([
        Bun.write(jsonPath, Bun.file(jsonSourcePath)),
        Bun.write(markdownPath, Bun.file(markdownSourcePath)),
      ]);
      await Promise.all([Bun.file(jsonSourcePath).delete(), Bun.file(markdownSourcePath).delete()]);

      return {
        json: { name: jsonName, path: jsonPath },
        markdown: { name: markdownName, path: markdownPath },
      };
    } catch (error) {
      if (error instanceof ConvertorException) throw error;
      throw new ConvertorException(
        error instanceof Error ? error.message : "PDF conversion with chunking failed",
        { source: this.source },
      );
    }
  }

  /**
   * Groups top-level nodes into chunks. A `heading` node closes the current
   * chunk and opens a new one; `paragraph` and `list` nodes append to the
   * current chunk. Nodes of any other type, or without a type, are skipped.
   */
  *generateChunks(nodes, source) {
    let heading = null;
    let texts = [];
    let page = null;
    let pages = new Set();

    for (const node of nodes) {
      const type = node.type;
      if (!type) continue;

      if (type === "heading") {
        if (texts.length > 0) {
          yield { text: texts.join("\n"), metadata: { heading, page, pages: Array.from(pages), source } };
        }
        const content = this.extractContent(node);
        heading = content;
        texts = content ? [content] : [];
        page = node["page number"] ?? null;
        pages = new Set(page !== null ? [page] : []);
        continue;
      }

      if (type === "paragraph" || type === "list") {
        const content = this.extractContent(node);
        if (content) {
          texts.push(content);
          const pageNumber = node["page number"];
          if (pageNumber !== undefined) pages.add(pageNumber);
        }
      }
    }

    // Flush the trailing chunk.
    if (texts.length > 0) {
      yield { text: texts.join("\n"), metadata: { heading, page, pages: Array.from(pages), source } };
    }
  }

  /** Returns all text found under `node` joined by single spaces, or `null` when there is none. */
  extractContent(node) {
    const parts = Array.from(this.extractTexts(node));
    return parts.length > 0 ? parts.join(" ") : null;
  }

  /** Yields the node's own `content` when present; otherwise recurses into its `kids`. */
  *extractTexts(node) {
    const content = node.content;
    if (content) {
      yield content;
      return;
    }
    const kids = node.kids;
    if (kids) {
      for (const kid of kids) yield* this.extractTexts(kid);
    }
  }
}

/** Class decorators exported by the package. */
const decorator = {
  /** Returns a decorator that adds the target class to the DI container (singleton scope by default). */
  vectorDatabase: (scope = EContainerScope.Singleton) => {
    return (target) => {
      container.add(target, scope);
    };
  },
};

/** Default vector database: empty URI, OpenAI `text-embedding-ada-002`, one `name` column. */
class VectorDatabase extends AbstractVectorDatabase {
  getDatabaseUri() {
    return "";
  }

  getEmbeddingModel() {
    return { provider: "openai", model: "text-embedding-ada-002" };
  }

  getSchema() {
    return { name: new Utf8() };
  }
}

export {
  decorator,
  VectorTable,
  VectorDatabaseException,
  VectorDatabase,
  ConvertorException,
  Convertor,
  AbstractVectorDatabase,
};
|
|
5
|
+
|
|
6
|
+
//# debugId=FB64C08DA10152BA64756E2164756E21
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["src/AbstractVectorDatabase.ts", "src/VectorDatabaseException.ts", "src/VectorTable.ts", "src/utils.ts", "src/Convertor.ts", "src/ConvertorException.ts", "src/decorators.ts", "src/VectorDatabase.ts"],
|
|
4
|
+
"sourcesContent": [
|
|
5
|
+
"import * as lancedb from \"@lancedb/lancedb\";\nimport \"@lancedb/lancedb/embedding/openai\";\nimport type { EmbeddingFunction } from \"@lancedb/lancedb/embedding\";\nimport { getRegistry, LanceSchema } from \"@lancedb/lancedb/embedding\";\nimport type { ScalarType } from \"@ooneex/types\";\nimport { Utf8 } from \"apache-arrow\";\nimport type { EmbeddingModelType, EmbeddingProviderType, FieldValueType, IVectorDatabase } from \"./types.ts\";\nimport { VectorDatabaseException } from \"./VectorDatabaseException.ts\";\nimport { VectorTable } from \"./VectorTable.ts\";\n\nexport abstract class AbstractVectorDatabase<DataType extends Record<string, ScalarType>>\n implements IVectorDatabase<DataType>\n{\n private db: lancedb.Connection | null = null;\n private embedding: EmbeddingFunction;\n\n constructor() {\n const { provider, model } = this.getEmbeddingModel();\n this.embedding = getRegistry().get(provider)?.create({ model }) as EmbeddingFunction;\n }\n\n public abstract getDatabaseUri(): string;\n public abstract getEmbeddingModel(): { provider: EmbeddingProviderType; model: EmbeddingModelType[\"model\"] };\n public abstract getSchema(): { [K in keyof DataType]: FieldValueType };\n\n public async connect(): Promise<void> {\n this.db = await lancedb.connect(this.getDatabaseUri());\n }\n\n public getDatabase(): lancedb.Connection {\n if (!this.db) {\n throw new VectorDatabaseException(\"Database not connected. 
Call connect() first.\");\n }\n\n return this.db;\n }\n\n public async open(name: string, options?: { mode?: \"create\" | \"overwrite\" }): Promise<VectorTable<DataType>> {\n const tableNames = await this.getDatabase().tableNames();\n if (tableNames.includes(name)) {\n const table = await this.getDatabase().openTable(name);\n return new VectorTable<DataType>(table);\n }\n\n const schema = LanceSchema({\n id: new Utf8(),\n text: this.embedding.sourceField(new Utf8()),\n vector: this.embedding.vectorField(),\n ...this.getSchema(),\n });\n const table = await this.getDatabase().createEmptyTable(name, schema, { mode: \"overwrite\", ...options });\n await table.createIndex(\"id\", { config: lancedb.Index.btree() });\n await table.createIndex(\"text\", { config: lancedb.Index.fts() });\n await table.createIndex(\"vector\", { config: lancedb.Index.ivfPq() });\n\n return new VectorTable<DataType>(table);\n }\n}\n",
|
|
6
|
+
"import { Exception } from \"@ooneex/exception\";\nimport { HttpStatus } from \"@ooneex/http-status\";\n\nexport class VectorDatabaseException extends Exception {\n constructor(message: string, data: Record<string, unknown> = {}) {\n super(message, {\n status: HttpStatus.Code.InternalServerError,\n data,\n });\n this.name = \"VectorDatabaseException\";\n }\n}\n",
|
|
7
|
+
"import * as lancedb from \"@lancedb/lancedb\";\nimport type { Filter } from \"./types.ts\";\nimport { buildFilter } from \"./utils.ts\";\n\nexport class VectorTable<DataType extends Record<string, unknown>> {\n private table: lancedb.Table;\n private reranker: Awaited<ReturnType<typeof lancedb.rerankers.RRFReranker.create>> | null = null;\n\n constructor(table: lancedb.Table) {\n this.table = table;\n }\n\n public async add(data: ({ id: string; text: string } & DataType)[]): Promise<this> {\n await this.table.add(data);\n\n return this;\n }\n\n // Create a scalar index (btree, bitmap, or labelList) on a column used in filters.\n public async createIndex(\n column: string,\n options?: {\n config?: ReturnType<typeof lancedb.Index.btree | typeof lancedb.Index.bitmap | typeof lancedb.Index.labelList>;\n },\n ): Promise<this> {\n await this.table.createIndex(column, options);\n\n return this;\n }\n\n // Create an IVF PQ vector index for approximate nearest neighbor search.\n public async createVectorIndex(\n column = \"vector\",\n options?: Partial<Parameters<lancedb.Table[\"createIndex\"]>[1] & object>,\n ): Promise<this> {\n await this.table.createIndex(column, {\n config: lancedb.Index.ivfPq(),\n ...options,\n });\n\n return this;\n }\n\n public async search(\n query: string,\n options?: {\n limit?: number;\n select?: (keyof DataType | \"id\" | \"text\")[];\n filter?: Filter<DataType>;\n // Number of IVF partitions to search. Higher values improve recall but reduce speed.\n nprobes?: number;\n // Multiplier for additional candidate rows during IVF PQ refine step to improve recall accuracy.\n refineFactor?: number;\n // Skip un-indexed data for faster queries when indices are up to date.\n fastSearch?: boolean;\n },\n ): Promise<DataType[]> {\n const { limit = 10, select, filter, nprobes, refineFactor, fastSearch = true } = options ?? 
{};\n\n if (!this.reranker) {\n this.reranker = await lancedb.rerankers.RRFReranker.create();\n }\n\n let vectorQuery = (this.table.search(query, \"hybrid\", \"text\") as lancedb.VectorQuery)\n .rerank(this.reranker)\n .limit(limit);\n\n if (nprobes) {\n vectorQuery = vectorQuery.nprobes(nprobes);\n }\n\n if (refineFactor) {\n vectorQuery = vectorQuery.refineFactor(refineFactor);\n }\n\n if (fastSearch) {\n vectorQuery = vectorQuery.fastSearch();\n }\n\n if (select) {\n vectorQuery = vectorQuery.select(select as string[]);\n }\n\n if (filter) {\n vectorQuery = vectorQuery.where(buildFilter(filter));\n }\n\n return vectorQuery.toArray();\n }\n\n // Print the resolved query plan to identify slow queries and missing indices.\n public async explainPlan(\n query: string,\n options?: {\n limit?: number;\n filter?: Filter<DataType>;\n verbose?: boolean;\n },\n ): Promise<string> {\n const { limit = 10, filter, verbose = true } = options ?? {};\n\n if (!this.reranker) {\n this.reranker = await lancedb.rerankers.RRFReranker.create();\n }\n\n let vectorQuery = (this.table.search(query, \"hybrid\", \"text\") as lancedb.VectorQuery)\n .rerank(this.reranker)\n .limit(limit);\n\n if (filter) {\n vectorQuery = vectorQuery.where(buildFilter(filter));\n }\n\n return vectorQuery.explainPlan(verbose);\n }\n\n // Execute the query and return a physical plan annotated with runtime metrics.\n public async analyzePlan(\n query: string,\n options?: {\n limit?: number;\n filter?: Filter<DataType>;\n },\n ): Promise<string> {\n const { limit = 10, filter } = options ?? {};\n\n if (!this.reranker) {\n this.reranker = await lancedb.rerankers.RRFReranker.create();\n }\n\n let vectorQuery = (this.table.search(query, \"hybrid\", \"text\") as lancedb.VectorQuery)\n .rerank(this.reranker)\n .limit(limit);\n\n if (filter) {\n vectorQuery = vectorQuery.where(buildFilter(filter));\n }\n\n return vectorQuery.analyzePlan();\n }\n}\n",
|
|
8
|
+
"import type { Filter } from \"./types.ts\";\n\nexport const buildFilter = <T>(filter: Filter<T>): string => {\n if (\"AND\" in filter) {\n return `(${filter.AND.map(buildFilter).join(\" AND \")})`;\n }\n if (\"OR\" in filter) {\n return `(${filter.OR.map(buildFilter).join(\" OR \")})`;\n }\n if (\"NOT\" in filter) {\n return `NOT (${buildFilter(filter.NOT)})`;\n }\n\n const col = String(filter.field);\n\n if (\n filter.op === \"IS NULL\" ||\n filter.op === \"IS NOT NULL\" ||\n filter.op === \"IS TRUE\" ||\n filter.op === \"IS NOT TRUE\" ||\n filter.op === \"IS FALSE\" ||\n filter.op === \"IS NOT FALSE\"\n ) {\n return `${col} ${filter.op}`;\n }\n\n if (filter.op === \"IN\") {\n const values = filter.value.map((v) => (typeof v === \"string\" ? `'${v}'` : v));\n return `${col} IN (${values.join(\", \")})`;\n }\n\n if (filter.op === \"LIKE\" || filter.op === \"NOT LIKE\") {\n return `${col} ${filter.op} '${filter.value}'`;\n }\n\n if (filter.op === \">\" || filter.op === \">=\" || filter.op === \"<\" || filter.op === \"<=\" || filter.op === \"=\") {\n return `${col} ${filter.op} ${typeof filter.value === \"string\" ? `'${filter.value}'` : filter.value}`;\n }\n\n return `${col} ${filter.op} ${typeof filter.value === \"string\" ? `'${filter.value}'` : filter.value}`;\n};\n",
|
|
9
|
+
"import path from \"node:path\";\nimport { random } from \"@ooneex/utils\";\nimport { convert } from \"@opendataloader/pdf\";\nimport { ConvertorException } from \"./ConvertorException\";\nimport type { ChunkType, ConvertorFileType, ConvertorOptionsType, IConvertor } from \"./types\";\n\nexport class Convertor implements IConvertor {\n private readonly source: string;\n\n constructor(source: string) {\n this.source = path.join(...source.split(/[/\\\\]/));\n }\n\n public async *convert(\n options: ConvertorOptionsType = {},\n ): AsyncGenerator<ChunkType, { json: ConvertorFileType; markdown: ConvertorFileType }> {\n try {\n const subDir = random.nanoid(15);\n const outputDir = path.join(options.outputDir ?? \"\", subDir);\n const { password, imageFormat, quiet, pages } = options;\n\n await convert([this.source], {\n outputDir,\n format: \"json,markdown\",\n imageDir: path.join(outputDir, \"images\"),\n imageOutput: \"external\",\n ...(password !== undefined && { password }),\n ...(imageFormat !== undefined && { imageFormat }),\n ...(quiet !== undefined && { quiet }),\n ...(pages !== undefined && { pages }),\n });\n\n const glob = new Bun.Glob(\"*\");\n let jsonFile: string | undefined;\n let mdFile: string | undefined;\n for await (const file of glob.scan(outputDir)) {\n if (!jsonFile && file.endsWith(\".json\")) jsonFile = file;\n if (!mdFile && file.endsWith(\".md\")) mdFile = file;\n if (jsonFile && mdFile) break;\n }\n\n if (!jsonFile) {\n throw new ConvertorException(\"No JSON output file found after conversion\", { source: this.source });\n }\n\n if (!mdFile) {\n throw new ConvertorException(\"No Markdown output file found after conversion\", { source: this.source });\n }\n\n const jsonPath = path.join(outputDir, jsonFile);\n const doc = await Bun.file(jsonPath).json();\n const fileName = doc[\"file name\"] ?? path.basename(this.source);\n\n yield* this.generateChunks(doc.kids ?? 
[], fileName);\n\n const renamedJson = `${random.nanoid(15)}.json`;\n const renamedMd = `${random.nanoid(15)}.md`;\n\n const mdPath = path.join(outputDir, mdFile);\n const renamedJsonPath = path.join(outputDir, renamedJson);\n const renamedMdPath = path.join(outputDir, renamedMd);\n\n await Promise.all([Bun.write(renamedJsonPath, Bun.file(jsonPath)), Bun.write(renamedMdPath, Bun.file(mdPath))]);\n\n await Promise.all([Bun.file(jsonPath).delete(), Bun.file(mdPath).delete()]);\n\n return {\n json: { name: renamedJson, path: renamedJsonPath },\n markdown: { name: renamedMd, path: renamedMdPath },\n };\n } catch (error) {\n if (error instanceof ConvertorException) throw error;\n throw new ConvertorException(error instanceof Error ? error.message : \"PDF conversion with chunking failed\", {\n source: this.source,\n });\n }\n }\n\n private *generateChunks(kids: Record<string, unknown>[], source: string): Generator<ChunkType> {\n let currentHeading: string | null = null;\n let currentContent: string[] = [];\n let startPage: number | null = null;\n let pageSet = new Set<number>();\n\n for (const element of kids) {\n const type = element.type as string | undefined;\n if (!type) continue;\n\n if (type === \"heading\") {\n if (currentContent.length > 0) {\n yield {\n text: currentContent.join(\"\\n\"),\n metadata: { heading: currentHeading, page: startPage, pages: Array.from(pageSet), source },\n };\n }\n const content = this.extractContent(element);\n currentHeading = content;\n currentContent = content ? [content] : [];\n startPage = (element[\"page number\"] as number) ?? null;\n pageSet = new Set(startPage !== null ? 
[startPage] : []);\n continue;\n }\n\n if (type === \"paragraph\" || type === \"list\") {\n const content = this.extractContent(element);\n if (content) {\n currentContent.push(content);\n const page = element[\"page number\"] as number | undefined;\n if (page !== undefined) {\n pageSet.add(page);\n }\n }\n }\n }\n\n if (currentContent.length > 0) {\n yield {\n text: currentContent.join(\"\\n\"),\n metadata: { heading: currentHeading, page: startPage, pages: Array.from(pageSet), source },\n };\n }\n }\n\n private extractContent(element: Record<string, unknown>): string | null {\n const parts = Array.from(this.extractTexts(element));\n return parts.length > 0 ? parts.join(\" \") : null;\n }\n\n private *extractTexts(element: Record<string, unknown>): Generator<string> {\n const content = element.content as string | undefined;\n if (content) {\n yield content;\n return;\n }\n\n const kids = element.kids as Record<string, unknown>[] | undefined;\n if (kids) {\n for (const kid of kids) {\n yield* this.extractTexts(kid);\n }\n }\n }\n}\n",
|
|
10
|
+
"import { Exception } from \"@ooneex/exception\";\nimport { HttpStatus } from \"@ooneex/http-status\";\n\nexport class ConvertorException extends Exception {\n constructor(message: string, data: Record<string, unknown> = {}) {\n super(message, {\n status: HttpStatus.Code.InternalServerError,\n data,\n });\n this.name = \"ConvertorException\";\n }\n}\n",
|
|
11
|
+
"import { container, EContainerScope } from \"@ooneex/container\";\nimport type { VectorDatabaseClassType } from \"./types\";\n\nexport const decorator = {\n vectorDatabase: (scope: EContainerScope = EContainerScope.Singleton) => {\n return (target: VectorDatabaseClassType): void => {\n container.add(target, scope);\n };\n },\n};\n",
|
|
12
|
+
"import { AbstractVectorDatabase } from \"./AbstractVectorDatabase.ts\";\nimport type { EmbeddingModelType, EmbeddingProviderType, FieldValueType } from \"./types.ts\";\nimport { Utf8 } from \"apache-arrow\";\n\ntype DataType = {\n name: string;\n}\n\nexport class VectorDatabase extends AbstractVectorDatabase<DataType> {\n public getDatabaseUri(): string {\n return \"\";\n }\n\n public getEmbeddingModel(): { provider: EmbeddingProviderType; model: EmbeddingModelType[\"model\"] } {\n return { provider: \"openai\", model: \"text-embedding-ada-002\" };\n }\n\n public getSchema(): { [K in keyof DataType]: FieldValueType } {\n return {\n name: new Utf8(),\n };\n }\n}\n"
|
|
13
|
+
],
|
|
14
|
+
"mappings": ";AAAA,mCACA,0CAEA,sBAAS,iBAAa,mCAEtB,eAAS,qBCLT,oBAAS,0BACT,qBAAS,4BAEF,MAAM,UAAgC,CAAU,CACrD,WAAW,CAAC,EAAiB,EAAgC,CAAC,EAAG,CAC/D,MAAM,EAAS,CACb,OAAQ,EAAW,KAAK,oBACxB,MACF,CAAC,EACD,KAAK,KAAO,0BAEhB,CCXA,mCCEO,IAAM,EAAc,CAAI,IAA8B,CAC3D,GAAI,QAAS,EACX,MAAO,IAAI,EAAO,IAAI,IAAI,CAAW,EAAE,KAAK,OAAO,KAErD,GAAI,OAAQ,EACV,MAAO,IAAI,EAAO,GAAG,IAAI,CAAW,EAAE,KAAK,MAAM,KAEnD,GAAI,QAAS,EACX,MAAO,QAAQ,EAAY,EAAO,GAAG,KAGvC,IAAM,EAAM,OAAO,EAAO,KAAK,EAE/B,GACE,EAAO,KAAO,WACd,EAAO,KAAO,eACd,EAAO,KAAO,WACd,EAAO,KAAO,eACd,EAAO,KAAO,YACd,EAAO,KAAO,eAEd,MAAO,GAAG,KAAO,EAAO,KAG1B,GAAI,EAAO,KAAO,KAAM,CACtB,IAAM,EAAS,EAAO,MAAM,IAAI,CAAC,IAAO,OAAO,IAAM,SAAW,IAAI,KAAO,CAAE,EAC7E,MAAO,GAAG,SAAW,EAAO,KAAK,IAAI,KAGvC,GAAI,EAAO,KAAO,QAAU,EAAO,KAAO,WACxC,MAAO,GAAG,KAAO,EAAO,OAAO,EAAO,SAGxC,GAAI,EAAO,KAAO,KAAO,EAAO,KAAO,MAAQ,EAAO,KAAO,KAAO,EAAO,KAAO,MAAQ,EAAO,KAAO,IACtG,MAAO,GAAG,KAAO,EAAO,MAAM,OAAO,EAAO,QAAU,SAAW,IAAI,EAAO,SAAW,EAAO,QAGhG,MAAO,GAAG,KAAO,EAAO,MAAM,OAAO,EAAO,QAAU,SAAW,IAAI,EAAO,SAAW,EAAO,SDnCzF,MAAM,CAAsD,CACzD,MACA,SAAoF,KAE5F,WAAW,CAAC,EAAsB,CAChC,KAAK,MAAQ,OAGF,IAAG,CAAC,EAAkE,CAGjF,OAFA,MAAM,KAAK,MAAM,IAAI,CAAI,EAElB,UAII,YAAW,CACtB,EACA,EAGe,CAGf,OAFA,MAAM,KAAK,MAAM,YAAY,EAAQ,CAAO,EAErC,UAII,kBAAiB,CAC5B,EAAS,SACT,EACe,CAMf,OALA,MAAM,KAAK,MAAM,YAAY,EAAQ,CACnC,OAAgB,QAAM,MAAM,KACzB,CACL,CAAC,EAEM,UAGI,OAAM,CACjB,EACA,EAWqB,CACrB,IAAQ,QAAQ,GAAI,SAAQ,SAAQ,UAAS,eAAc,aAAa,IAAS,GAAW,CAAC,EAE7F,GAAI,CAAC,KAAK,SACR,KAAK,SAAW,MAAc,YAAU,YAAY,OAAO,EAG7D,IAAI,EAAe,KAAK,MAAM,OAAO,EAAO,SAAU,MAAM,EACzD,OAAO,KAAK,QAAQ,EACpB,MAAM,CAAK,EAEd,GAAI,EACF,EAAc,EAAY,QAAQ,CAAO,EAG3C,GAAI,EACF,EAAc,EAAY,aAAa,CAAY,EAGrD,GAAI,EACF,EAAc,EAAY,WAAW,EAGvC,GAAI,EACF,EAAc,EAAY,OAAO,CAAkB,EAGrD,GAAI,EACF,EAAc,EAAY,MAAM,EAAY,CAAM,CAAC,EAGrD,OAAO,EAAY,QAAQ,OAIhB,YAAW,CACtB,EACA,EAKiB,CACjB,IAAQ,QAAQ,GAAI,SAAQ,UAAU,IAAS,GAAW,CAAC,EAE3D,GAAI,CAAC,KAAK,SACR,KAAK,SAAW,MAAc,YAAU,YAAY,OAAO,EAG7D,IAAI,EAAe,KAAK,MAAM,OAAO,EAAO,SAAU,MAAM,EACzD,OAAO,KAAK,QAAQ,EACpB,MAAM,CAAK,EAEd,GAAI,EACF,EAAc,EAAY,MAAM,E
AAY,CAAM,CAAC,EAGrD,OAAO,EAAY,YAAY,CAAO,OAI3B,YAAW,CACtB,EACA,EAIiB,CACjB,IAAQ,QAAQ,GAAI,UAAW,GAAW,CAAC,EAE3C,GAAI,CAAC,KAAK,SACR,KAAK,SAAW,MAAc,YAAU,YAAY,OAAO,EAG7D,IAAI,EAAe,KAAK,MAAM,OAAO,EAAO,SAAU,MAAM,EACzD,OAAO,KAAK,QAAQ,EACpB,MAAM,CAAK,EAEd,GAAI,EACF,EAAc,EAAY,MAAM,EAAY,CAAM,CAAC,EAGrD,OAAO,EAAY,YAAY,EAEnC,CFlIO,MAAe,CAEtB,CACU,GAAgC,KAChC,UAER,WAAW,EAAG,CACZ,IAAQ,WAAU,SAAU,KAAK,kBAAkB,EACnD,KAAK,UAAY,EAAY,EAAE,IAAI,CAAQ,GAAG,OAAO,CAAE,OAAM,CAAC,OAOnD,QAAO,EAAkB,CACpC,KAAK,GAAK,MAAc,UAAQ,KAAK,eAAe,CAAC,EAGhD,WAAW,EAAuB,CACvC,GAAI,CAAC,KAAK,GACR,MAAM,IAAI,EAAwB,+CAA+C,EAGnF,OAAO,KAAK,QAGD,KAAI,CAAC,EAAc,EAA6E,CAE3G,IADmB,MAAM,KAAK,YAAY,EAAE,WAAW,GACxC,SAAS,CAAI,EAAG,CAC7B,IAAM,EAAQ,MAAM,KAAK,YAAY,EAAE,UAAU,CAAI,EACrD,OAAO,IAAI,EAAsB,CAAK,EAGxC,IAAM,EAAS,EAAY,CACzB,GAAI,IAAI,EACR,KAAM,KAAK,UAAU,YAAY,IAAI,CAAM,EAC3C,OAAQ,KAAK,UAAU,YAAY,KAChC,KAAK,UAAU,CACpB,CAAC,EACK,EAAQ,MAAM,KAAK,YAAY,EAAE,iBAAiB,EAAM,EAAQ,CAAE,KAAM,eAAgB,CAAQ,CAAC,EAKvG,OAJA,MAAM,EAAM,YAAY,KAAM,CAAE,OAAgB,QAAM,MAAM,CAAE,CAAC,EAC/D,MAAM,EAAM,YAAY,OAAQ,CAAE,OAAgB,QAAM,IAAI,CAAE,CAAC,EAC/D,MAAM,EAAM,YAAY,SAAU,CAAE,OAAgB,QAAM,MAAM,CAAE,CAAC,EAE5D,IAAI,EAAsB,CAAK,EAE1C,CIzDA,oBACA,iBAAS,sBACT,kBAAS,4BCFT,oBAAS,0BACT,qBAAS,4BAEF,MAAM,UAA2B,CAAU,CAChD,WAAW,CAAC,EAAiB,EAAgC,CAAC,EAAG,CAC/D,MAAM,EAAS,CACb,OAAQ,EAAW,KAAK,oBACxB,MACF,CAAC,EACD,KAAK,KAAO,qBAEhB,CDLO,MAAM,CAAgC,CAC1B,OAEjB,WAAW,CAAC,EAAgB,CAC1B,KAAK,OAAS,EAAK,KAAK,GAAG,EAAO,MAAM,OAAO,CAAC,QAGpC,OAAO,CACnB,EAAgC,CAAC,EACoD,CACrF,GAAI,CACF,IAAM,EAAS,EAAO,OAAO,EAAE,EACzB,EAAY,EAAK,KAAK,EAAQ,WAAa,GAAI,CAAM,GACnD,WAAU,cAAa,QAAO,SAAU,EAEhD,MAAM,EAAQ,CAAC,KAAK,MAAM,EAAG,CAC3B,YACA,OAAQ,gBACR,SAAU,EAAK,KAAK,EAAW,QAAQ,EACvC,YAAa,cACT,IAAa,QAAa,CAAE,UAAS,KACrC,IAAgB,QAAa,CAAE,aAAY,KAC3C,IAAU,QAAa,CAAE,OAAM,KAC/B,IAAU,QAAa,CAAE,OAAM,CACrC,CAAC,EAED,IAAM,EAAO,IAAI,IAAI,KAAK,GAAG,EACzB,EACA,EACJ,cAAiB,KAAQ,EAAK,KAAK,CAAS,EAAG,CAC7C,GAAI,CAAC,GAAY,EAAK,SAAS,OAAO,EAAG,EAAW,EACpD,GAAI,CAAC,GAAU,EAAK,SAAS,KAAK,EAAG,EAAS,EAC9C,GAAI,GAAY,EAAQ,MAG1B,GAAI,CAA
C,EACH,MAAM,IAAI,EAAmB,6CAA8C,CAAE,OAAQ,KAAK,MAAO,CAAC,EAGpG,GAAI,CAAC,EACH,MAAM,IAAI,EAAmB,iDAAkD,CAAE,OAAQ,KAAK,MAAO,CAAC,EAGxG,IAAM,EAAW,EAAK,KAAK,EAAW,CAAQ,EACxC,EAAM,MAAM,IAAI,KAAK,CAAQ,EAAE,KAAK,EACpC,EAAW,EAAI,cAAgB,EAAK,SAAS,KAAK,MAAM,EAE9D,MAAO,KAAK,eAAe,EAAI,MAAQ,CAAC,EAAG,CAAQ,EAEnD,IAAM,EAAc,GAAG,EAAO,OAAO,EAAE,SACjC,EAAY,GAAG,EAAO,OAAO,EAAE,OAE/B,EAAS,EAAK,KAAK,EAAW,CAAM,EACpC,EAAkB,EAAK,KAAK,EAAW,CAAW,EAClD,EAAgB,EAAK,KAAK,EAAW,CAAS,EAMpD,OAJA,MAAM,QAAQ,IAAI,CAAC,IAAI,MAAM,EAAiB,IAAI,KAAK,CAAQ,CAAC,EAAG,IAAI,MAAM,EAAe,IAAI,KAAK,CAAM,CAAC,CAAC,CAAC,EAE9G,MAAM,QAAQ,IAAI,CAAC,IAAI,KAAK,CAAQ,EAAE,OAAO,EAAG,IAAI,KAAK,CAAM,EAAE,OAAO,CAAC,CAAC,EAEnE,CACL,KAAM,CAAE,KAAM,EAAa,KAAM,CAAgB,EACjD,SAAU,CAAE,KAAM,EAAW,KAAM,CAAc,CACnD,EACA,MAAO,EAAO,CACd,GAAI,aAAiB,EAAoB,MAAM,EAC/C,MAAM,IAAI,EAAmB,aAAiB,MAAQ,EAAM,QAAU,sCAAuC,CAC3G,OAAQ,KAAK,MACf,CAAC,IAII,cAAc,CAAC,EAAiC,EAAsC,CAC7F,IAAI,EAAgC,KAChC,EAA2B,CAAC,EAC5B,EAA2B,KAC3B,EAAU,IAAI,IAElB,QAAW,KAAW,EAAM,CAC1B,IAAM,EAAO,EAAQ,KACrB,GAAI,CAAC,EAAM,SAEX,GAAI,IAAS,UAAW,CACtB,GAAI,EAAe,OAAS,EAC1B,KAAM,CACJ,KAAM,EAAe,KAAK;AAAA,CAAI,EAC9B,SAAU,CAAE,QAAS,EAAgB,KAAM,EAAW,MAAO,MAAM,KAAK,CAAO,EAAG,QAAO,CAC3F,EAEF,IAAM,EAAU,KAAK,eAAe,CAAO,EAC3C,EAAiB,EACjB,EAAiB,EAAU,CAAC,CAAO,EAAI,CAAC,EACxC,EAAa,EAAQ,gBAA6B,KAClD,EAAU,IAAI,IAAI,IAAc,KAAO,CAAC,CAAS,EAAI,CAAC,CAAC,EACvD,SAGF,GAAI,IAAS,aAAe,IAAS,OAAQ,CAC3C,IAAM,EAAU,KAAK,eAAe,CAAO,EAC3C,GAAI,EAAS,CACX,EAAe,KAAK,CAAO,EAC3B,IAAM,EAAO,EAAQ,eACrB,GAAI,IAAS,OACX,EAAQ,IAAI,CAAI,IAMxB,GAAI,EAAe,OAAS,EAC1B,KAAM,CACJ,KAAM,EAAe,KAAK;AAAA,CAAI,EAC9B,SAAU,CAAE,QAAS,EAAgB,KAAM,EAAW,MAAO,MAAM,KAAK,CAAO,EAAG,QAAO,CAC3F,EAII,cAAc,CAAC,EAAiD,CACtE,IAAM,EAAQ,MAAM,KAAK,KAAK,aAAa,CAAO,CAAC,EACnD,OAAO,EAAM,OAAS,EAAI,EAAM,KAAK,GAAG,EAAI,MAGrC,YAAY,CAAC,EAAqD,CACzE,IAAM,EAAU,EAAQ,QACxB,GAAI,EAAS,CACX,MAAM,EACN,OAGF,IAAM,EAAO,EAAQ,KACrB,GAAI,EACF,QAAW,KAAO,EAChB,MAAO,KAAK,aAAa,CAAG,EAIpC,CE9IA,oBAAS,qBAAW,0BAGb,IAAM,EAAY,CACvB,eAAgB,CAAC,EAAyB,EAAgB,YAAc,CACtE,MAAO,CAAC,IAA0C,CAChD,EAAU,IAAI,EAAQ,CAA
K,GAGjC,ECPA,eAAS,qBAMF,MAAM,UAAuB,CAAiC,CAC5D,cAAc,EAAW,CAC9B,MAAO,GAGF,iBAAiB,EAA4E,CAClG,MAAO,CAAE,SAAU,SAAU,MAAO,wBAAyB,EAGxD,SAAS,EAA8C,CAC5D,MAAO,CACL,KAAM,IAAI,CACZ,EAEJ",
|
|
15
|
+
"debugId": "FB64C08DA10152BA64756E2164756E21",
|
|
16
|
+
"names": []
|
|
17
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@ooneex/rag",
|
|
3
|
+
"description": "Retrieval-Augmented Generation toolkit with vector database integration, document embedding, and semantic search for AI-powered knowledge retrieval",
|
|
4
|
+
"version": "1.0.0",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"files": [
|
|
7
|
+
"dist",
|
|
8
|
+
"LICENSE",
|
|
9
|
+
"README.md",
|
|
10
|
+
"package.json"
|
|
11
|
+
],
|
|
12
|
+
"module": "./dist/index.js",
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"exports": {
|
|
15
|
+
".": {
|
|
16
|
+
"import": {
|
|
17
|
+
"types": "./dist/index.d.ts",
|
|
18
|
+
"default": "./dist/index.js"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"./package.json": "./package.json"
|
|
22
|
+
},
|
|
23
|
+
"license": "MIT",
|
|
24
|
+
"scripts": {
|
|
25
|
+
"test": "bun test tests",
|
|
26
|
+
"build": "bunup",
|
|
27
|
+
"lint": "tsgo --noEmit && bunx biome lint",
|
|
28
|
+
"npm:publish": "bun publish --tolerate-republish --access public"
|
|
29
|
+
},
|
|
30
|
+
"keywords": [],
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"@lancedb/lancedb": "^0.27.1",
|
|
33
|
+
"@ooneex/exception": "0.0.18",
|
|
34
|
+
"@ooneex/container": "0.0.19",
|
|
35
|
+
"@ooneex/http-status": "0.0.18",
|
|
36
|
+
"@ooneex/utils": "0.1.1",
|
|
37
|
+
"@opendataloader/pdf": "^2.0.2",
|
|
38
|
+
"apache-arrow": "^21.1.0"
|
|
39
|
+
},
|
|
40
|
+
"devDependencies": {
|
|
41
|
+
"@ooneex/types": "0.0.19"
|
|
42
|
+
}
|
|
43
|
+
}
|