@opensaas/stack-rag 0.1.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +141 -0
- package/README.md +82 -6
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +9 -0
- package/dist/config/index.js.map +1 -1
- package/dist/config/plugin.d.ts.map +1 -1
- package/dist/config/plugin.js +61 -1
- package/dist/config/plugin.js.map +1 -1
- package/dist/config/plugin.test.js +70 -14
- package/dist/config/plugin.test.js.map +1 -1
- package/dist/config/types.d.ts +186 -0
- package/dist/config/types.d.ts.map +1 -1
- package/dist/fields/index.d.ts +1 -0
- package/dist/fields/index.d.ts.map +1 -1
- package/dist/fields/index.js +1 -0
- package/dist/fields/index.js.map +1 -1
- package/dist/fields/searchable.d.ts +42 -0
- package/dist/fields/searchable.d.ts.map +1 -0
- package/dist/fields/searchable.js +51 -0
- package/dist/fields/searchable.js.map +1 -0
- package/dist/fields/searchable.test.d.ts +2 -0
- package/dist/fields/searchable.test.d.ts.map +1 -0
- package/dist/fields/searchable.test.js +112 -0
- package/dist/fields/searchable.test.js.map +1 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/providers/openai.d.ts +2 -0
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +35 -20
- package/dist/providers/openai.js.map +1 -1
- package/dist/runtime/batch.test.js +1 -1
- package/dist/runtime/build-time.d.ts +100 -0
- package/dist/runtime/build-time.d.ts.map +1 -0
- package/dist/runtime/build-time.js +185 -0
- package/dist/runtime/build-time.js.map +1 -0
- package/dist/runtime/index.d.ts +3 -0
- package/dist/runtime/index.d.ts.map +1 -1
- package/dist/runtime/index.js +6 -0
- package/dist/runtime/index.js.map +1 -1
- package/dist/runtime/markdown.d.ts +33 -0
- package/dist/runtime/markdown.d.ts.map +1 -0
- package/dist/runtime/markdown.js +94 -0
- package/dist/runtime/markdown.js.map +1 -0
- package/dist/runtime/provider-helpers.d.ts +56 -0
- package/dist/runtime/provider-helpers.d.ts.map +1 -0
- package/dist/runtime/provider-helpers.js +95 -0
- package/dist/runtime/provider-helpers.js.map +1 -0
- package/dist/runtime/types.d.ts +29 -0
- package/dist/runtime/types.d.ts.map +1 -0
- package/dist/runtime/types.js +6 -0
- package/dist/runtime/types.js.map +1 -0
- package/dist/storage/access-filter.d.ts +30 -0
- package/dist/storage/access-filter.d.ts.map +1 -0
- package/dist/storage/access-filter.js +241 -0
- package/dist/storage/access-filter.js.map +1 -0
- package/dist/storage/index.d.ts +2 -0
- package/dist/storage/index.d.ts.map +1 -1
- package/dist/storage/index.js +3 -0
- package/dist/storage/index.js.map +1 -1
- package/dist/storage/json-file.d.ts +53 -0
- package/dist/storage/json-file.d.ts.map +1 -0
- package/dist/storage/json-file.js +124 -0
- package/dist/storage/json-file.js.map +1 -0
- package/dist/storage/pgvector.d.ts.map +1 -1
- package/dist/storage/pgvector.js +26 -11
- package/dist/storage/pgvector.js.map +1 -1
- package/dist/storage/storage.test.js +2 -0
- package/dist/storage/storage.test.js.map +1 -1
- package/dist/storage/types.d.ts +5 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/dist/storage/types.js.map +1 -1
- package/package.json +6 -5
- package/src/config/index.ts +9 -0
- package/src/config/plugin.test.ts +70 -14
- package/src/config/plugin.ts +72 -2
- package/src/config/types.ts +217 -0
- package/src/fields/index.ts +2 -0
- package/src/fields/searchable.test.ts +136 -0
- package/src/fields/searchable.ts +57 -0
- package/src/index.ts +6 -0
- package/src/providers/openai.ts +37 -22
- package/src/runtime/batch.test.ts +1 -1
- package/src/runtime/build-time.ts +216 -0
- package/src/runtime/index.ts +18 -0
- package/src/runtime/markdown.ts +119 -0
- package/src/runtime/provider-helpers.ts +115 -0
- package/src/runtime/types.ts +30 -0
- package/src/storage/access-filter.ts +303 -0
- package/src/storage/index.ts +4 -0
- package/src/storage/json-file.ts +157 -0
- package/src/storage/pgvector.ts +31 -11
- package/src/storage/storage.test.ts +2 -0
- package/src/storage/types.ts +6 -0
- package/tsconfig.tsbuildinfo +1 -1
package/dist/providers/openai.js
CHANGED
|
@@ -6,6 +6,19 @@ const MODEL_DIMENSIONS = {
|
|
|
6
6
|
'text-embedding-3-large': 3072,
|
|
7
7
|
'text-embedding-ada-002': 1536,
|
|
8
8
|
};
|
|
9
|
+
/**
 * Lazily load the OpenAI SDK to avoid requiring it at import time.
 *
 * The `openai` module is resolved with a dynamic `import()` only when this
 * function is called.
 *
 * @returns The default export of the `openai` module (callers use it as a constructor)
 * @throws {Error} When the dynamic import fails. The message keeps the install
 *   hint and additionally carries the underlying failure text; the original
 *   error is attached as `cause` so it is not lost.
 */
async function getOpenAI() {
    try {
        const module = await import('openai');
        return module.default;
    }
    catch (error) {
        // Keep the reason: a broken install and a missing install look identical otherwise.
        const reason = error instanceof Error ? error.message : String(error);
        throw Object.assign(new Error('OpenAI package not found. Install it with: npm install openai\n' +
            'Make sure to run: pnpm install openai\n' +
            `Underlying error: ${reason}`), { cause: error });
    }
}
|
|
9
22
|
/**
|
|
10
23
|
* OpenAI embedding provider
|
|
11
24
|
* Requires the `openai` package to be installed
|
|
@@ -14,30 +27,30 @@ export class OpenAIEmbeddingProvider {
|
|
|
14
27
|
type = 'openai';
|
|
15
28
|
model;
|
|
16
29
|
dimensions;
|
|
17
|
-
client;
|
|
30
|
+
client = null;
|
|
18
31
|
config;
|
|
32
|
+
clientPromise = null;
|
|
19
33
|
constructor(config) {
|
|
20
34
|
this.config = config;
|
|
21
35
|
this.model = config.model || 'text-embedding-3-small';
|
|
22
36
|
this.dimensions = MODEL_DIMENSIONS[this.model] || 1536;
|
|
23
|
-
// Initialize OpenAI client
|
|
24
|
-
this.client = this.initializeClient();
|
|
25
37
|
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
return
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
38
|
+
async ensureClient() {
|
|
39
|
+
if (this.client)
|
|
40
|
+
return this.client;
|
|
41
|
+
if (this.clientPromise)
|
|
42
|
+
return this.clientPromise;
|
|
43
|
+
this.clientPromise = this.initializeClient();
|
|
44
|
+
this.client = await this.clientPromise;
|
|
45
|
+
return this.client;
|
|
46
|
+
}
|
|
47
|
+
async initializeClient() {
|
|
48
|
+
const OpenAI = await getOpenAI();
|
|
49
|
+
return new OpenAI({
|
|
50
|
+
apiKey: this.config.apiKey,
|
|
51
|
+
organization: this.config.organization,
|
|
52
|
+
baseURL: this.config.baseURL,
|
|
53
|
+
});
|
|
41
54
|
}
|
|
42
55
|
/**
|
|
43
56
|
* Generate embedding for a single text
|
|
@@ -47,7 +60,8 @@ export class OpenAIEmbeddingProvider {
|
|
|
47
60
|
throw new Error('Cannot generate embedding for empty text');
|
|
48
61
|
}
|
|
49
62
|
try {
|
|
50
|
-
const
|
|
63
|
+
const client = await this.ensureClient();
|
|
64
|
+
const response = await client.embeddings.create({
|
|
51
65
|
model: this.model,
|
|
52
66
|
input: text,
|
|
53
67
|
encoding_format: 'float',
|
|
@@ -79,7 +93,8 @@ export class OpenAIEmbeddingProvider {
|
|
|
79
93
|
}
|
|
80
94
|
try {
|
|
81
95
|
// OpenAI supports batch embedding
|
|
82
|
-
const
|
|
96
|
+
const client = await this.ensureClient();
|
|
97
|
+
const response = await client.embeddings.create({
|
|
83
98
|
model: this.model,
|
|
84
99
|
input: validTexts,
|
|
85
100
|
encoding_format: 'float',
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"openai.js","sourceRoot":"","sources":["../../src/providers/openai.ts"],"names":[],"mappings":"AAGA;;GAEG;AACH,MAAM,gBAAgB,GAAyC;IAC7D,wBAAwB,EAAE,IAAI;IAC9B,wBAAwB,EAAE,IAAI;IAC9B,wBAAwB,EAAE,IAAI;CAC/B,CAAA;AAiBD;;;GAGG;AACH,MAAM,OAAO,uBAAuB;IACzB,IAAI,GAAG,QAAQ,CAAA;IACf,KAAK,CAAQ;IACb,UAAU,CAAQ;IAEnB,MAAM,
|
|
1
|
+
{"version":3,"file":"openai.js","sourceRoot":"","sources":["../../src/providers/openai.ts"],"names":[],"mappings":"AAGA;;GAEG;AACH,MAAM,gBAAgB,GAAyC;IAC7D,wBAAwB,EAAE,IAAI;IAC9B,wBAAwB,EAAE,IAAI;IAC9B,wBAAwB,EAAE,IAAI;CAC/B,CAAA;AAED;;GAEG;AACH,KAAK,UAAU,SAAS;IACtB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAA;QACrC,OAAO,MAAM,CAAC,OAAO,CAAA;IACvB,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,iEAAiE;YAC/D,uCAAuC,CAC1C,CAAA;IACH,CAAC;AACH,CAAC;AAiBD;;;GAGG;AACH,MAAM,OAAO,uBAAuB;IACzB,IAAI,GAAG,QAAQ,CAAA;IACf,KAAK,CAAQ;IACb,UAAU,CAAQ;IAEnB,MAAM,GAAwB,IAAI,CAAA;IAClC,MAAM,CAAuB;IAC7B,aAAa,GAAiC,IAAI,CAAA;IAE1D,YAAY,MAA6B;QACvC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAA;QACpB,IAAI,CAAC,KAAK,GAAG,MAAM,CAAC,KAAK,IAAI,wBAAwB,CAAA;QACrD,IAAI,CAAC,UAAU,GAAG,gBAAgB,CAAC,IAAI,CAAC,KAA6B,CAAC,IAAI,IAAI,CAAA;IAChF,CAAC;IAEO,KAAK,CAAC,YAAY;QACxB,IAAI,IAAI,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC,MAAM,CAAA;QACnC,IAAI,IAAI,CAAC,aAAa;YAAE,OAAO,IAAI,CAAC,aAAa,CAAA;QAEjD,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAA;QAC5C,IAAI,CAAC,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAA;QACtC,OAAO,IAAI,CAAC,MAAM,CAAA;IACpB,CAAC;IAEO,KAAK,CAAC,gBAAgB;QAC5B,MAAM,MAAM,GAAG,MAAM,SAAS,EAAE,CAAA;QAEhC,OAAO,IAAI,MAAM,CAAC;YAChB,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;YAC1B,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;YACtC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;SAC7B,CAAiB,CAAA;IACpB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,IAAY;QACtB,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAA;QAC7D,CAAC;QAED,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,EAAE,CAAA;YACxC,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;gBAC9C,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,OAAO;aACzB,CAAC,CAAA;YAEF,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;QACnC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,uCAAwC,KAAe,CAAC,OAAO,EAAE,CAAC,CAAA;QACpF,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CAAC,KAAe;QAC9B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,CAAA;
QACX,CAAC;QAED,mDAAmD;QACnD,MAAM,UAAU,GAAa,EAAE,CAAA;QAC/B,MAAM,YAAY,GAAa,EAAE,CAAA;QAEjC,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE;YAC5B,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;gBACrB,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;YAC1B,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAA;QACnE,CAAC;QAED,IAAI,CAAC;YACH,kCAAkC;YAClC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,EAAE,CAAA;YACxC,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC;gBAC9C,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,KAAK,EAAE,UAAU;gBACjB,eAAe,EAAE,OAAO;aACzB,CAAC,CAAA;YAEF,wCAAwC;YACxC,MAAM,OAAO,GAAe,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAA;YAEnD,qCAAqC;YACrC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,IAA6B,EAAE,CAAS,EAAE,EAAE;gBACjE,MAAM,aAAa,GAAG,YAAY,CAAC,CAAC,CAAC,CAAA;gBACrC,OAAO,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,SAAS,CAAA;YACzC,CAAC,CAAC,CAAA;YAEF,yCAAyC;YACzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC;oBAChB,OAAO,CAAC,CAAC,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;gBACjD,CAAC;YACH,CAAC;YAED,OAAO,OAAO,CAAA;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,6CAA8C,KAAe,CAAC,OAAO,EAAE,CAAC,CAAA;QAC1F,CAAC;IACH,CAAC;CACF;AAED;;;;;;;;;;;;;;;GAeG;AACH,MAAM,UAAU,oBAAoB,CAAC,MAA6B;IAChE,OAAO,IAAI,uBAAuB,CAAC,MAAM,CAAC,CAAA;AAC5C,CAAC"}
|
|
@@ -200,7 +200,7 @@ describe('ProcessingQueue', () => {
|
|
|
200
200
|
// With concurrency 3, should be faster than sequential
|
|
201
201
|
// 5 items with 10ms each sequentially = 50ms
|
|
202
202
|
// With concurrency 3: ceil(5/3) * 10ms = 20ms
|
|
203
|
-
expect(duration).toBeLessThan(
|
|
203
|
+
expect(duration).toBeLessThan(50);
|
|
204
204
|
});
|
|
205
205
|
it('should track queue size', async () => {
|
|
206
206
|
const queue = new ProcessingQueue(async (item) => {
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build-time utilities for generating and managing embeddings
|
|
3
|
+
* Used by CLI tools and custom build scripts
|
|
4
|
+
*/
|
|
5
|
+
import type { EmbeddingProvider } from '../providers/types.js';
|
|
6
|
+
import type { EmbeddingsIndex, EmbeddedDocument } from '../config/types.js';
|
|
7
|
+
/**
|
|
8
|
+
* Simple character-based text chunking for build-time generation
|
|
9
|
+
*
|
|
10
|
+
* Simpler than the runtime chunking strategies, optimized for build-time batch processing.
|
|
11
|
+
* Splits text into fixed-size chunks with overlap.
|
|
12
|
+
*
|
|
13
|
+
* @param text - Text to chunk
|
|
14
|
+
* @param chunkSize - Size of each chunk in characters
|
|
15
|
+
* @param overlap - Overlap between chunks in characters
|
|
16
|
+
* @returns Array of text chunks
|
|
17
|
+
*
|
|
18
|
+
* @example
|
|
19
|
+
* ```typescript
|
|
20
|
+
* import { simpleChunkText } from '@opensaas/stack-rag/runtime'
|
|
21
|
+
*
|
|
22
|
+
* const chunks = simpleChunkText("Long document...", 500, 50)
|
|
23
|
+
* ```
|
|
24
|
+
*/
|
|
25
|
+
export declare function simpleChunkText(text: string, chunkSize: number, overlap: number): string[];
|
|
26
|
+
/**
|
|
27
|
+
* Compute SHA256 hash of content for change detection
|
|
28
|
+
*
|
|
29
|
+
* @param content - Content to hash
|
|
30
|
+
* @returns Hexadecimal hash string
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```typescript
|
|
34
|
+
* import { hashContent } from '@opensaas/stack-rag/runtime'
|
|
35
|
+
*
|
|
36
|
+
* const hash = hashContent("document content")
|
|
37
|
+
* ```
|
|
38
|
+
*/
|
|
39
|
+
export declare function hashContent(content: string): string;
|
|
40
|
+
/**
|
|
41
|
+
* Load existing embeddings index from file
|
|
42
|
+
*
|
|
43
|
+
* Used for differential updates - only regenerate embeddings for changed content.
|
|
44
|
+
*
|
|
45
|
+
* @param filePath - Path to embeddings JSON file
|
|
46
|
+
* @returns Loaded index or null if file doesn't exist or can't be loaded
|
|
47
|
+
*
|
|
48
|
+
* @example
|
|
49
|
+
* ```typescript
|
|
50
|
+
* import { loadExistingIndex } from '@opensaas/stack-rag/runtime'
|
|
51
|
+
*
|
|
52
|
+
* const existing = loadExistingIndex('.embeddings/docs.json')
|
|
53
|
+
* if (existing) {
|
|
54
|
+
* console.log(`Found ${Object.keys(existing.documents).length} existing documents`)
|
|
55
|
+
* }
|
|
56
|
+
* ```
|
|
57
|
+
*/
|
|
58
|
+
export declare function loadExistingIndex(filePath: string): EmbeddingsIndex | null;
|
|
59
|
+
/**
|
|
60
|
+
* Generate embeddings for a document with chunking
|
|
61
|
+
*
|
|
62
|
+
* Main utility for build-time embedding generation. Chunks the document,
|
|
63
|
+
* generates embeddings for each chunk, and returns a complete EmbeddedDocument.
|
|
64
|
+
*
|
|
65
|
+
* @param documentId - Unique identifier for the document
|
|
66
|
+
* @param content - Document content (plain text)
|
|
67
|
+
* @param provider - Embedding provider instance
|
|
68
|
+
* @param options - Generation options
|
|
69
|
+
* @returns Complete embedded document ready to be added to index
|
|
70
|
+
*
|
|
71
|
+
* @example
|
|
72
|
+
* ```typescript
|
|
73
|
+
* import { generateDocumentEmbeddings } from '@opensaas/stack-rag/runtime'
|
|
74
|
+
* import { createEmbeddingProvider } from '@opensaas/stack-rag/providers'
|
|
75
|
+
*
|
|
76
|
+
* const provider = createEmbeddingProvider({
|
|
77
|
+
* type: 'openai',
|
|
78
|
+
* apiKey: process.env.OPENAI_API_KEY
|
|
79
|
+
* })
|
|
80
|
+
*
|
|
81
|
+
* const doc = await generateDocumentEmbeddings(
|
|
82
|
+
* 'docs/getting-started',
|
|
83
|
+
* 'Document content here...',
|
|
84
|
+
* provider,
|
|
85
|
+
* {
|
|
86
|
+
* title: 'Getting Started',
|
|
87
|
+
* chunkSize: 500,
|
|
88
|
+
* chunkOverlap: 50,
|
|
89
|
+
* metadata: { section: 'guides' }
|
|
90
|
+
* }
|
|
91
|
+
* )
|
|
92
|
+
* ```
|
|
93
|
+
*/
|
|
94
|
+
export declare function generateDocumentEmbeddings(documentId: string, content: string, provider: EmbeddingProvider, options: {
|
|
95
|
+
title?: string;
|
|
96
|
+
chunkSize: number;
|
|
97
|
+
chunkOverlap: number;
|
|
98
|
+
metadata?: Record<string, unknown>;
|
|
99
|
+
}): Promise<EmbeddedDocument>;
|
|
100
|
+
//# sourceMappingURL=build-time.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"build-time.d.ts","sourceRoot":"","sources":["../../src/runtime/build-time.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAC9D,OAAO,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAkB,MAAM,oBAAoB,CAAA;AAE3F;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAW1F;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAEnD;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAY1E;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AACH,wBAAsB,0BAA0B,CAC9C,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,QAAQ,EAAE,iBAAiB,EAC3B,OAAO,EAAE;IACP,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,SAAS,EAAE,MAAM,CAAA;IACjB,YAAY,EAAE,MAAM,CAAA;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;CACnC,GACA,OAAO,CAAC,gBAAgB,CAAC,CAgF3B"}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build-time utilities for generating and managing embeddings
|
|
3
|
+
* Used by CLI tools and custom build scripts
|
|
4
|
+
*/
|
|
5
|
+
import { readFileSync, existsSync } from 'node:fs';
|
|
6
|
+
import { createHash } from 'node:crypto';
|
|
7
|
+
/**
 * Simple character-based text chunking for build-time generation
 *
 * Splits text into fixed-size chunks with overlap. Each chunk starts
 * `chunkSize - overlap` characters after the previous one; the final chunk
 * may be shorter than `chunkSize`.
 *
 * @param text - Text to chunk
 * @param chunkSize - Size of each chunk in characters (must be greater than 0)
 * @param overlap - Overlap between chunks in characters (must be smaller than `chunkSize`)
 * @returns Array of text chunks (empty for empty text)
 * @throws {RangeError} If `chunkSize` is not greater than 0, or `overlap` is
 *   not smaller than `chunkSize` - either would stop the window from advancing
 *
 * @example
 * ```typescript
 * import { simpleChunkText } from '@opensaas/stack-rag/runtime'
 *
 * const chunks = simpleChunkText("Long document...", 500, 50)
 * ```
 */
export function simpleChunkText(text, chunkSize, overlap) {
    // Written as negated comparisons so NaN is rejected as well.
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be greater than 0 (received ${chunkSize})`);
    }
    if (!(overlap < chunkSize)) {
        throw new RangeError(`overlap must be smaller than chunkSize (received overlap=${overlap}, chunkSize=${chunkSize})`);
    }
    const step = chunkSize - overlap;
    const chunks = [];
    for (let start = 0; start < text.length; start += step) {
        // slice() clamps the end index to text.length for the last chunk.
        chunks.push(text.slice(start, start + chunkSize));
    }
    return chunks;
}
|
|
35
|
+
/**
 * Compute the SHA-256 digest of a piece of content for change detection
 *
 * @param content - Content to hash
 * @returns The digest as a hexadecimal string
 *
 * @example
 * ```typescript
 * import { hashContent } from '@opensaas/stack-rag/runtime'
 *
 * const hash = hashContent("document content")
 * ```
 */
export function hashContent(content) {
    const hasher = createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
}
|
|
51
|
+
/**
 * Load existing embeddings index from file
 *
 * Used for differential updates - only regenerate embeddings for changed content.
 *
 * The file is read as UTF-8 and parsed as JSON. Only a top-level JSON object
 * is accepted; `null`, arrays and primitives are treated as an unloadable
 * file. The object's fields are not validated further here.
 *
 * @param filePath - Path to embeddings JSON file
 * @returns Loaded index, or null if the file doesn't exist or can't be loaded
 *   (a warning with the reason is logged in the latter case)
 *
 * @example
 * ```typescript
 * import { loadExistingIndex } from '@opensaas/stack-rag/runtime'
 *
 * const existing = loadExistingIndex('.embeddings/docs.json')
 * if (existing) {
 *   console.log(`Found ${Object.keys(existing.documents).length} existing documents`)
 * }
 * ```
 */
export function loadExistingIndex(filePath) {
    if (!existsSync(filePath)) {
        return null;
    }
    try {
        const parsed = JSON.parse(readFileSync(filePath, 'utf-8'));
        // Valid JSON is not necessarily an index: reject null, arrays and primitives.
        if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
            throw new TypeError('top-level JSON value is not an object');
        }
        return parsed;
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.warn(`Warning: Could not load existing embeddings from ${filePath}: ${reason}`);
        return null;
    }
}
|
|
82
|
+
/**
 * Generate embeddings for a document with chunking
 *
 * Main utility for build-time embedding generation. Chunks the document with
 * `simpleChunkText`, embeds the optional title plus every content chunk in a
 * single `provider.embedBatch` call, and returns a complete EmbeddedDocument.
 *
 * The title, when given, becomes the first chunk with `chunkIndex: -1` and
 * `isTitle: true`. Content chunks are numbered from 0 and carry the character
 * offsets they were cut from. Keys in `options.metadata` are spread last into
 * each chunk's metadata, so they take precedence over the computed fields.
 *
 * @param documentId - Unique identifier for the document
 * @param content - Document content (plain text)
 * @param provider - Embedding provider instance
 * @param options - Generation options (`title`, `chunkSize`, `chunkOverlap`, `metadata`)
 * @returns Complete embedded document ready to be added to index
 * @throws {Error} If the provider returns a different number of embeddings
 *   than texts submitted
 *
 * @example
 * ```typescript
 * import { generateDocumentEmbeddings } from '@opensaas/stack-rag/runtime'
 * import { createEmbeddingProvider } from '@opensaas/stack-rag/providers'
 *
 * const provider = createEmbeddingProvider({
 *   type: 'openai',
 *   apiKey: process.env.OPENAI_API_KEY
 * })
 *
 * const doc = await generateDocumentEmbeddings(
 *   'docs/getting-started',
 *   'Document content here...',
 *   provider,
 *   {
 *     title: 'Getting Started',
 *     chunkSize: 500,
 *     chunkOverlap: 50,
 *     metadata: { section: 'guides' }
 *   }
 * )
 * ```
 */
export async function generateDocumentEmbeddings(documentId, content, provider, options) {
    const { title, chunkSize, chunkOverlap, metadata = {} } = options;
    // Hash content for differential updates
    const contentHash = hashContent(content);
    // Title (when present) is embedded first, followed by the content chunks
    const contentChunks = simpleChunkText(content, chunkSize, chunkOverlap);
    const texts = title ? [title, ...contentChunks] : [...contentChunks];
    const titleCount = texts.length - contentChunks.length;
    // Generate embeddings in batch for all chunks
    const embeddings = await provider.embedBatch(texts);
    if (embeddings.length !== texts.length) {
        throw new Error(`Embedding provider returned ${embeddings.length} embeddings for ${texts.length} text chunks`);
    }
    // Distance between the start offsets of consecutive content chunks
    const stride = chunkSize - chunkOverlap;
    const chunks = texts.map((text, position) => {
        if (position < titleCount) {
            return {
                text,
                embedding: embeddings[position],
                metadata: {
                    chunkIndex: -1, // Special index for title
                    startOffset: 0,
                    endOffset: 0,
                    isTitle: true,
                    ...metadata,
                },
            };
        }
        const chunkIndex = position - titleCount;
        const startOffset = chunkIndex * stride;
        return {
            text,
            embedding: embeddings[position],
            metadata: {
                chunkIndex,
                startOffset,
                endOffset: Math.min(startOffset + chunkSize, content.length),
                ...metadata,
            },
        };
    });
    // One timestamp so both generatedAt fields of the document always agree
    const generatedAt = new Date().toISOString();
    return {
        id: documentId,
        title,
        chunks,
        embeddingMetadata: {
            model: provider.model,
            provider: provider.type,
            dimensions: provider.dimensions,
            generatedAt,
        },
        generatedAt,
        contentHash,
    };
}
|
|
185
|
+
//# sourceMappingURL=build-time.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"build-time.js","sourceRoot":"","sources":["../../src/runtime/build-time.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAA;AAClD,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAIxC;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY,EAAE,SAAiB,EAAE,OAAe;IAC9E,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;QACpD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAA;QACnC,KAAK,IAAI,SAAS,GAAG,OAAO,CAAA;IAC9B,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,WAAW,CAAC,OAAe;IACzC,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;AAC3D,CAAC;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,iBAAiB,CAAC,QAAgB;IAChD,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC1B,OAAO,IAAI,CAAA;IACb,CAAC;IAED,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;QAC/C,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAoB,CAAA;IAC/C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,IAAI,CAAC,oDAAoD,QAAQ,EAAE,CAAC,CAAA;QAC5E,OAAO,IAAI,CAAA;IACb,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AACH,MAAM,CAAC,KAAK,UAAU,0BAA0B,CAC9C,UAAkB,EAClB,OAAe,EACf,QAA2B,EAC3B,OAKC;IAED,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,YAAY,EAAE,QAAQ,GAAG,EAAE,EAAE,GAAG,OAAO,CAAA;IAEjE,wCAAwC;IACxC,MAAM,WAAW,GAAG,WAAW,CAAC,OAAO,CAAC,CAAA;IAExC,mCAAmC;IACnC,MAAM,aAAa,GAAa,EAAE,CAAA;IAClC,MAAM,UAAU,GAA+B,EAAE,CAAA;IAEjD,wCAAwC;IACxC,IAAI,KAAK,EAAE,CAAC;QACV,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;QACzB,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC1B,CAAC;IAED,oBAAoB;IACpB,MAAM,aAAa,GAAG,eAAe,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAA;IACvE,aAAa,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,CAAA;IACpC,aAAa,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAA;IAEvD,8CAA8C;IAC9C,MAAM,aAAa,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;IAE9D,+BAA+B;IAC/B,MAAM,MAAM,GAAqB,EAAE,CAAA;IAEnC,IAAI,cAAc,GAAG,CAAC,CAAA;IACtB,IA
AI,iBAAiB,GAAG,CAAC,CAAA;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAA;QAE1B,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;YACrB,cAAc;YACd,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,aAAa,CAAC,cAAc,CAAC;gBACnC,SAAS,EAAE,aAAa,CAAC,cAAc,CAAC;gBACxC,QAAQ,EAAE;oBACR,UAAU,EAAE,CAAC,CAAC,EAAE,0BAA0B;oBAC1C,WAAW,EAAE,CAAC;oBACd,SAAS,EAAE,CAAC;oBACZ,OAAO,EAAE,IAAI;oBACb,GAAG,QAAQ;iBACZ;aACF,CAAC,CAAA;QACJ,CAAC;aAAM,CAAC;YACN,gBAAgB;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,aAAa,CAAC,cAAc,CAAC;gBACnC,SAAS,EAAE,aAAa,CAAC,cAAc,CAAC;gBACxC,QAAQ,EAAE;oBACR,UAAU,EAAE,iBAAiB;oBAC7B,WAAW,EAAE,iBAAiB,GAAG,CAAC,SAAS,GAAG,YAAY,CAAC;oBAC3D,SAAS,EAAE,IAAI,CAAC,GAAG,CACjB,CAAC,iBAAiB,GAAG,CAAC,CAAC,GAAG,SAAS,GAAG,iBAAiB,GAAG,YAAY,EACtE,OAAO,CAAC,MAAM,CACf;oBACD,GAAG,QAAQ;iBACZ;aACF,CAAC,CAAA;YACF,iBAAiB,EAAE,CAAA;QACrB,CAAC;QAED,cAAc,EAAE,CAAA;IAClB,CAAC;IAED,OAAO;QACL,EAAE,EAAE,UAAU;QACd,KAAK;QACL,MAAM;QACN,iBAAiB,EAAE;YACjB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,QAAQ,EAAE,QAAQ,CAAC,IAAI;YACvB,UAAU,EAAE,QAAQ,CAAC,UAAU;YAC/B,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACtC;QACD,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACrC,WAAW;KACZ,CAAA;AACH,CAAC"}
|
package/dist/runtime/index.d.ts
CHANGED
|
@@ -11,4 +11,7 @@ export { chunkText, estimateTokenCount, mergeSmallChunks, type ChunkingStrategy,
|
|
|
11
11
|
export { generateEmbedding, generateEmbeddings, shouldRegenerateEmbedding, hashText, validateEmbeddingDimensions, mergeEmbeddings, type GenerateEmbeddingOptions, type GenerateEmbeddingsOptions, type ChunkedEmbedding, } from './embeddings.js';
|
|
12
12
|
export { semanticSearch, findSimilar, type SemanticSearchOptions, type FindSimilarOptions, } from './search.js';
|
|
13
13
|
export { batchProcess, RateLimiter, ProcessingQueue, type BatchProcessOptions, type BatchProgress, type BatchError, type BatchProcessResult, } from './batch.js';
|
|
14
|
+
export { simpleChunkText, hashContent, loadExistingIndex, generateDocumentEmbeddings, } from './build-time.js';
|
|
15
|
+
export { stripMarkdown, extractMarkdownText } from './markdown.js';
|
|
16
|
+
export { createProviderFromEnv, getProviderConfigFromEnv, type ProviderType, } from './provider-helpers.js';
|
|
14
17
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/runtime/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAGH,OAAO,EACL,SAAS,EACT,kBAAkB,EAClB,gBAAgB,EAChB,KAAK,gBAAgB,EACrB,KAAK,eAAe,EACpB,KAAK,SAAS,GACf,MAAM,eAAe,CAAA;AAGtB,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,EACzB,QAAQ,EACR,2BAA2B,EAC3B,eAAe,EACf,KAAK,wBAAwB,EAC7B,KAAK,yBAAyB,EAC9B,KAAK,gBAAgB,GACtB,MAAM,iBAAiB,CAAA;AAGxB,OAAO,EACL,cAAc,EACd,WAAW,EACX,KAAK,qBAAqB,EAC1B,KAAK,kBAAkB,GACxB,MAAM,aAAa,CAAA;AAGpB,OAAO,EACL,YAAY,EACZ,WAAW,EACX,eAAe,EACf,KAAK,mBAAmB,EACxB,KAAK,aAAa,EAClB,KAAK,UAAU,EACf,KAAK,kBAAkB,GACxB,MAAM,YAAY,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/runtime/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAGH,OAAO,EACL,SAAS,EACT,kBAAkB,EAClB,gBAAgB,EAChB,KAAK,gBAAgB,EACrB,KAAK,eAAe,EACpB,KAAK,SAAS,GACf,MAAM,eAAe,CAAA;AAGtB,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,EACzB,QAAQ,EACR,2BAA2B,EAC3B,eAAe,EACf,KAAK,wBAAwB,EAC7B,KAAK,yBAAyB,EAC9B,KAAK,gBAAgB,GACtB,MAAM,iBAAiB,CAAA;AAGxB,OAAO,EACL,cAAc,EACd,WAAW,EACX,KAAK,qBAAqB,EAC1B,KAAK,kBAAkB,GACxB,MAAM,aAAa,CAAA;AAGpB,OAAO,EACL,YAAY,EACZ,WAAW,EACX,eAAe,EACf,KAAK,mBAAmB,EACxB,KAAK,aAAa,EAClB,KAAK,UAAU,EACf,KAAK,kBAAkB,GACxB,MAAM,YAAY,CAAA;AAGnB,OAAO,EACL,eAAe,EACf,WAAW,EACX,iBAAiB,EACjB,0BAA0B,GAC3B,MAAM,iBAAiB,CAAA;AAGxB,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,eAAe,CAAA;AAGlE,OAAO,EACL,qBAAqB,EACrB,wBAAwB,EACxB,KAAK,YAAY,GAClB,MAAM,uBAAuB,CAAA"}
|
package/dist/runtime/index.js
CHANGED
|
@@ -15,4 +15,10 @@ export { generateEmbedding, generateEmbeddings, shouldRegenerateEmbedding, hashT
|
|
|
15
15
|
export { semanticSearch, findSimilar, } from './search.js';
|
|
16
16
|
// Batch processing
|
|
17
17
|
export { batchProcess, RateLimiter, ProcessingQueue, } from './batch.js';
|
|
18
|
+
// Build-time utilities
|
|
19
|
+
export { simpleChunkText, hashContent, loadExistingIndex, generateDocumentEmbeddings, } from './build-time.js';
|
|
20
|
+
// Markdown processing
|
|
21
|
+
export { stripMarkdown, extractMarkdownText } from './markdown.js';
|
|
22
|
+
// Provider helpers
|
|
23
|
+
export { createProviderFromEnv, getProviderConfigFromEnv, } from './provider-helpers.js';
|
|
18
24
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/runtime/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,gBAAgB;AAChB,OAAO,EACL,SAAS,EACT,kBAAkB,EAClB,gBAAgB,GAIjB,MAAM,eAAe,CAAA;AAEtB,uBAAuB;AACvB,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,EACzB,QAAQ,EACR,2BAA2B,EAC3B,eAAe,GAIhB,MAAM,iBAAiB,CAAA;AAExB,kBAAkB;AAClB,OAAO,EACL,cAAc,EACd,WAAW,GAGZ,MAAM,aAAa,CAAA;AAEpB,mBAAmB;AACnB,OAAO,EACL,YAAY,EACZ,WAAW,EACX,eAAe,GAKhB,MAAM,YAAY,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/runtime/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,gBAAgB;AAChB,OAAO,EACL,SAAS,EACT,kBAAkB,EAClB,gBAAgB,GAIjB,MAAM,eAAe,CAAA;AAEtB,uBAAuB;AACvB,OAAO,EACL,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,EACzB,QAAQ,EACR,2BAA2B,EAC3B,eAAe,GAIhB,MAAM,iBAAiB,CAAA;AAExB,kBAAkB;AAClB,OAAO,EACL,cAAc,EACd,WAAW,GAGZ,MAAM,aAAa,CAAA;AAEpB,mBAAmB;AACnB,OAAO,EACL,YAAY,EACZ,WAAW,EACX,eAAe,GAKhB,MAAM,YAAY,CAAA;AAEnB,uBAAuB;AACvB,OAAO,EACL,eAAe,EACf,WAAW,EACX,iBAAiB,EACjB,0BAA0B,GAC3B,MAAM,iBAAiB,CAAA;AAExB,sBAAsB;AACtB,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,eAAe,CAAA;AAElE,mBAAmB;AACnB,OAAO,EACL,qBAAqB,EACrB,wBAAwB,GAEzB,MAAM,uBAAuB,CAAA"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown processing utilities for content preparation
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Strip markdown formatting for cleaner text suitable for embeddings
|
|
6
|
+
*
|
|
7
|
+
* Removes code blocks, formatting markers, links, images, and HTML tags
|
|
8
|
+
* while preserving the actual content.
|
|
9
|
+
*
|
|
10
|
+
* @param markdown - Markdown text to process
|
|
11
|
+
* @returns Plain text with markdown removed
|
|
12
|
+
*
|
|
13
|
+
* @example
|
|
14
|
+
* ```typescript
|
|
15
|
+
* import { stripMarkdown } from '@opensaas/stack-rag/runtime'
|
|
16
|
+
*
|
|
17
|
+
* const markdown = '# Hello\n\nThis is **bold** text with a [link](url).'
|
|
18
|
+
* const plain = stripMarkdown(markdown)
|
|
19
|
+
* // Returns: 'Hello\n\nThis is bold text with a link.'
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
22
|
+
export declare function stripMarkdown(markdown: string): string;
|
|
23
|
+
/**
|
|
24
|
+
* Extract text content from common markdown structures
|
|
25
|
+
*
|
|
26
|
+
* More aggressive than stripMarkdown - extracts only text content,
|
|
27
|
+
* removes all structural elements.
|
|
28
|
+
*
|
|
29
|
+
* @param markdown - Markdown text
|
|
30
|
+
* @returns Extracted plain text
|
|
31
|
+
*/
|
|
32
|
+
export declare function extractMarkdownText(markdown: string): string;
|
|
33
|
+
//# sourceMappingURL=markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../src/runtime/markdown.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CA8BtD;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAuD5D"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown processing utilities for content preparation
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Strip markdown formatting for cleaner text suitable for embeddings
 *
 * Removes fenced code blocks and inline code (including their content),
 * images and HTML tags, and unwraps headings, bold/italic markers and
 * inline links so that only their visible text remains. Runs of blank
 * lines are collapsed to a single blank line and runs of spaces/tabs to a
 * single space.
 *
 * @param markdown - Markdown text to process
 * @returns Plain text with markdown removed
 *
 * @example
 * ```typescript
 * import { stripMarkdown } from '@opensaas/stack-rag/runtime'
 *
 * const markdown = '# Hello\n\nThis is **bold** text with a [link](url).'
 * const plain = stripMarkdown(markdown)
 * // Returns: 'Hello\n\nThis is bold text with a link.'
 * ```
 */
export function stripMarkdown(markdown) {
    let text = markdown;
    // Remove code blocks (fenced first, then inline spans)
    text = text.replace(/```[\s\S]*?```/g, '');
    text = text.replace(/`[^`]+`/g, '');
    // Remove heading markers but keep text
    text = text.replace(/^#+\s+/gm, '');
    // Remove bold/italic markers
    text = text.replace(/\*\*([^*]+)\*\*/g, '$1');
    text = text.replace(/\*([^*]+)\*/g, '$1');
    text = text.replace(/__([^_]+)__/g, '$1');
    text = text.replace(/_([^_]+)_/g, '$1');
    // Remove images. This must happen BEFORE link handling: the link pattern
    // also matches the "[alt](url)" tail of "![alt](url)" and would otherwise
    // leave a stray "!alt" behind.
    text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, '');
    // Remove links but keep text. Empty link text is allowed so that a linked
    // image such as "[![badge](img)](href)" (already reduced to "[](href)" by
    // the image step) disappears completely.
    text = text.replace(/\[([^\]]*)\]\([^)]+\)/g, '$1');
    // Remove HTML tags
    text = text.replace(/<[^>]+>/g, '');
    // Normalize whitespace
    text = text.replace(/\n{3,}/g, '\n\n');
    text = text.replace(/[ \t]+/g, ' ');
    return text.trim();
}
|
|
45
|
+
/**
 * Extract text content from common markdown structures
 *
 * More aggressive than stripMarkdown - extracts only text content,
 * removes all structural elements: a leading YAML frontmatter block, code
 * (fenced and inline), horizontal rules, blockquote/list/heading markers,
 * emphasis and strikethrough markers, images, HTML tags and named HTML
 * entities. Inline and reference-style links are reduced to their text.
 * Each line is trimmed of spaces/tabs and runs of blank lines are collapsed
 * to a single blank line, so paragraph boundaries survive.
 *
 * @param markdown - Markdown text
 * @returns Extracted plain text
 */
export function extractMarkdownText(markdown) {
    let text = markdown;
    // Remove YAML frontmatter. Deliberately NOT multiline: frontmatter only
    // exists at the very start of the document (an optional BOM is tolerated).
    // With the `m` flag a "---" horizontal rule further down would be treated
    // as an opening fence and swallow all content up to the next "---".
    text = text.replace(/^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/, '');
    // Remove code blocks entirely (including content)
    text = text.replace(/```[\s\S]*?```/g, '');
    // Remove inline code
    text = text.replace(/`[^`]+`/g, '');
    // Remove horizontal rules
    text = text.replace(/^[-*_]{3,}$/gm, '');
    // Remove blockquote markers
    text = text.replace(/^>\s+/gm, '');
    // Remove list markers (bulleted, then numbered)
    text = text.replace(/^[\s]*[-*+]\s+/gm, '');
    text = text.replace(/^[\s]*\d+\.\s+/gm, '');
    // Remove heading markers
    text = text.replace(/^#+\s+/gm, '');
    // Remove emphasis markers
    text = text.replace(/\*\*([^*]+)\*\*/g, '$1');
    text = text.replace(/\*([^*]+)\*/g, '$1');
    text = text.replace(/__([^_]+)__/g, '$1');
    text = text.replace(/_([^_]+)_/g, '$1');
    // Remove strikethrough
    text = text.replace(/~~([^~]+)~~/g, '$1');
    // Remove images. Must run BEFORE link handling, otherwise the link
    // pattern consumes "[alt](url)" and leaves "!alt" in the output.
    text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, '');
    // Remove links but keep text (empty text allowed so that a linked image,
    // reduced to "[](href)" by the previous step, is removed completely)
    text = text.replace(/\[([^\]]*)\]\([^)]+\)/g, '$1');
    // Remove reference-style links
    text = text.replace(/\[([^\]]+)\]\[[^\]]*\]/g, '$1');
    // Remove HTML tags
    text = text.replace(/<[^>]+>/g, '');
    // Remove HTML entities
    text = text.replace(/&[a-z]+;/gi, '');
    // Normalize whitespace. The per-line trim only touches spaces and tabs:
    // `\s` also matches "\n", which under the `m` flag deletes the blank line
    // between paragraphs and glues their text together ("a\n\nb" -> "ab").
    text = text.replace(/[ \t]+/g, ' ');
    text = text.replace(/^[ \t]+|[ \t]+$/gm, '');
    text = text.replace(/\n{3,}/g, '\n\n');
    return text.trim();
}
|
|
94
|
+
//# sourceMappingURL=markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/runtime/markdown.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,aAAa,CAAC,QAAgB;IAC5C,IAAI,IAAI,GAAG,QAAQ,CAAA;IAEnB,qBAAqB;IACrB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAA;IAC1C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;IAEnC,wCAAwC;IACxC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;IAEnC,6BAA6B;IAC7B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,IAAI,CAAC,CAAA;IAC7C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAA;IACzC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAA;IACzC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC,CAAA;IAEvC,6BAA6B;IAC7B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAA;IAEnD,gBAAgB;IAChB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAA;IAElD,mBAAmB;IACnB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;IAEnC,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACtC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;IAEnC,OAAO,IAAI,CAAC,IAAI,EAAE,CAAA;AACpB,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAAC,QAAgB;IAClD,IAAI,IAAI,GAAG,QAAQ,CAAA;IAEnB,0BAA0B;IAC1B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC,CAAA;IAE7C,kDAAkD;IAClD,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAA;IAE1C,qBAAqB;IACrB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;IAEnC,0BAA0B;IAC1B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC,CAAA;IAExC,6BAA6B;IAC7B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAA;IAElC,sBAAsB;IACtB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAA;IAC3C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAA;IAE3C,0BAA0B;IAC1B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;IAEnC,0BAA0B;IAC1B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,IAAI,CAAC,CAAA;IAC7C,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAA;IACzC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAA;IACzC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,YAA
Y,EAAE,IAAI,CAAC,CAAA;IAEvC,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAA;IAEzC,6BAA6B;IAC7B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAA;IAEnD,+BAA+B;IAC/B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,IAAI,CAAC,CAAA;IAEpD,gBAAgB;IAChB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAA;IAElD,mBAAmB;IACnB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAA;IAEnC,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAA;IAErC,uBAAuB;IACvB,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACtC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;IACnC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAA;IAEtC,OAAO,IAAI,CAAC,IAAI,EAAE,CAAA;AACpB,CAAC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Helper utilities for working with embedding providers
|
|
3
|
+
* Simplifies provider creation from environment variables
|
|
4
|
+
*/
|
|
5
|
+
import type { EmbeddingProvider } from '../providers/types.js';
|
|
6
|
+
import type { EmbeddingProviderConfig } from '../config/types.js';
|
|
7
|
+
import 'dotenv/config';
|
|
8
|
+
/**
|
|
9
|
+
* Provider type from environment or configuration
|
|
10
|
+
*/
|
|
11
|
+
export type ProviderType = 'openai' | 'ollama';
|
|
12
|
+
/**
|
|
13
|
+
* Create an embedding provider from environment variables
|
|
14
|
+
*
|
|
15
|
+
* Reads configuration from environment variables:
|
|
16
|
+
* - EMBEDDING_PROVIDER: 'openai' or 'ollama' (default: 'openai')
|
|
17
|
+
* - OPENAI_API_KEY: Required if using OpenAI
|
|
18
|
+
* - OLLAMA_BASE_URL: Ollama endpoint (default: 'http://localhost:11434')
|
|
19
|
+
*
|
|
20
|
+
* @param overrides - Optional overrides for environment config
|
|
21
|
+
* @returns Configured embedding provider
|
|
22
|
+
*
|
|
23
|
+
* @example
|
|
24
|
+
* ```typescript
|
|
25
|
+
* import { createProviderFromEnv } from '@opensaas/stack-rag/runtime'
|
|
26
|
+
*
|
|
27
|
+
* // Uses EMBEDDING_PROVIDER and OPENAI_API_KEY from env
|
|
28
|
+
* const provider = createProviderFromEnv()
|
|
29
|
+
*
|
|
30
|
+
* // Override provider type
|
|
31
|
+
* const ollamaProvider = createProviderFromEnv({ provider: 'ollama' })
|
|
32
|
+
* ```
|
|
33
|
+
*/
|
|
34
|
+
export declare function createProviderFromEnv(overrides?: {
|
|
35
|
+
provider?: ProviderType;
|
|
36
|
+
openaiApiKey?: string;
|
|
37
|
+
ollamaBaseUrl?: string;
|
|
38
|
+
model?: string;
|
|
39
|
+
}): EmbeddingProvider;
|
|
40
|
+
/**
|
|
41
|
+
* Get provider configuration from environment
|
|
42
|
+
*
|
|
43
|
+
* Useful for inspecting what provider would be used without creating it.
|
|
44
|
+
*
|
|
45
|
+
* @returns Provider configuration object
|
|
46
|
+
*
|
|
47
|
+
* @example
|
|
48
|
+
* ```typescript
|
|
49
|
+
* import { getProviderConfigFromEnv } from '@opensaas/stack-rag/runtime'
|
|
50
|
+
*
|
|
51
|
+
* const config = getProviderConfigFromEnv()
|
|
52
|
+
* console.log(`Using ${config.type} provider`)
|
|
53
|
+
* ```
|
|
54
|
+
*/
|
|
55
|
+
export declare function getProviderConfigFromEnv(): EmbeddingProviderConfig;
|
|
56
|
+
//# sourceMappingURL=provider-helpers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"provider-helpers.d.ts","sourceRoot":"","sources":["../../src/runtime/provider-helpers.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAC9D,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAA;AACjE,OAAO,eAAe,CAAA;AAEtB;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,QAAQ,GAAG,QAAQ,CAAA;AAE9C;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,qBAAqB,CAAC,SAAS,CAAC,EAAE;IAChD,QAAQ,CAAC,EAAE,YAAY,CAAA;IACvB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,KAAK,CAAC,EAAE,MAAM,CAAA;CACf,GAAG,iBAAiB,CA+BpB;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,wBAAwB,IAAI,uBAAuB,CAwBlE"}
|