modelfusion 0.4.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -4
- package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.cjs +2 -2
- package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.d.ts +1 -1
- package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.js +3 -3
- package/package.json +2 -2
- package/{vector-index/VectorIndexSimilarTextChunkRetriever.cjs → text-chunk/SimilarTextChunksFromVectorIndexRetriever.cjs} +4 -4
- package/{vector-index/VectorIndexSimilarTextChunkRetriever.d.ts → text-chunk/SimilarTextChunksFromVectorIndexRetriever.d.ts} +7 -7
- package/{vector-index/VectorIndexSimilarTextChunkRetriever.js → text-chunk/SimilarTextChunksFromVectorIndexRetriever.js} +2 -2
- package/text-chunk/TextChunk.d.ts +1 -1
- package/text-chunk/index.cjs +3 -0
- package/text-chunk/index.d.ts +3 -0
- package/text-chunk/index.js +3 -0
- package/text-chunk/split/splitOnSeparator.cjs +7 -9
- package/text-chunk/split/splitOnSeparator.d.ts +5 -6
- package/text-chunk/split/splitOnSeparator.js +6 -7
- package/text-chunk/split/splitRecursively.cjs +16 -16
- package/text-chunk/split/splitRecursively.d.ts +13 -17
- package/text-chunk/split/splitRecursively.js +14 -11
- package/text-chunk/split/splitTextChunks.cjs +16 -0
- package/text-chunk/split/splitTextChunks.d.ts +4 -0
- package/text-chunk/split/splitTextChunks.js +11 -0
- package/{vector-index → text-chunk}/upsertTextChunks.cjs +1 -1
- package/{vector-index → text-chunk}/upsertTextChunks.d.ts +2 -2
- package/{vector-index → text-chunk}/upsertTextChunks.js +1 -1
- package/vector-index/index.cjs +0 -3
- package/vector-index/index.d.ts +0 -3
- package/vector-index/index.js +0 -3
- package/vector-index/VectorIndexTextChunkStore.cjs +0 -77
- package/vector-index/VectorIndexTextChunkStore.d.ts +0 -35
- package/vector-index/VectorIndexTextChunkStore.js +0 -73
package/README.md
CHANGED
@@ -317,12 +317,12 @@ const embeddingModel = new OpenAITextEmbeddingModel({
|
|
317
317
|
await upsertTextChunks({
|
318
318
|
vectorIndex,
|
319
319
|
embeddingModel,
|
320
|
-
chunks: texts.map((text) => ({
|
320
|
+
chunks: texts.map((text) => ({ text })),
|
321
321
|
});
|
322
322
|
|
323
323
|
// retrieve text chunks from the vector index - usually done at query time:
|
324
324
|
const { chunks } = await retrieveTextChunks(
|
325
|
-
new
|
325
|
+
new SimilarTextChunksFromVectorIndexRetriever({
|
326
326
|
vectorIndex,
|
327
327
|
embeddingModel,
|
328
328
|
maxResults: 3,
|
@@ -343,9 +343,9 @@ const { chunks } = await retrieveTextChunks(
|
|
343
343
|
- [Transcribe Audio](https://modelfusion.dev/guide/function/transcribe-audio)
|
344
344
|
- [Generate images](https://modelfusion.dev/guide/function/generate-image)
|
345
345
|
- Summarize text
|
346
|
-
- Split text
|
347
346
|
- [Tools](https://modelfusion.dev/guide/tools)
|
348
|
-
- [Text Chunks](https://modelfusion.dev/guide/text-
|
347
|
+
- [Text Chunks](https://modelfusion.dev/guide/text-chunk/)
|
348
|
+
- [Split Text](https://modelfusion.dev/guide/text-chunk/split)
|
349
349
|
- [Run abstraction](https://modelfusion.dev/guide/run/)
|
350
350
|
- [Abort signals](https://modelfusion.dev/guide/run/abort)
|
351
351
|
- [Cost calculation](https://modelfusion.dev/guide/run/cost-calculation)
|
@@ -416,6 +416,12 @@ Examples for the individual functions and objects.
|
|
416
416
|
|
417
417
|
A web chat with an AI assistant, implemented as a Next.js app.
|
418
418
|
|
419
|
+
### [Chat with PDF](https://github.com/lgrammel/modelfusion/tree/main/examples/pdf-chat-terminal)
|
420
|
+
|
421
|
+
> _terminal app_, _PDF parsing_, _in memory vector indices_, _retrieval augmented generation_, _hypothetical document embedding_
|
422
|
+
|
423
|
+
Ask questions about a PDF document and get answers from the document.
|
424
|
+
|
419
425
|
### [Image generator (Next.js)](https://github.com/lgrammel/modelfusion/tree/main/examples/image-generator-next-js)
|
420
426
|
|
421
427
|
> _Next.js app_, _Stability AI image generation_
|
package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.cjs
CHANGED
@@ -13,9 +13,9 @@ async function summarizeRecursivelyWithTextGenerationAndTokenSplitting({ text, m
|
|
13
13
|
(model.maxCompletionTokens ?? model.contextWindowSize / 4), join, }, options) {
|
14
14
|
const emptyPromptTokens = await model.countPromptTokens(await prompt({ text: "" }));
|
15
15
|
return (0, summarizeRecursively_js_1.summarizeRecursively)({
|
16
|
-
split: (0, splitRecursively_js_1.
|
16
|
+
split: (0, splitRecursively_js_1.splitAtToken)({
|
17
17
|
tokenizer: model.tokenizer,
|
18
|
-
|
18
|
+
maxTokensPerChunk: tokenLimit - emptyPromptTokens,
|
19
19
|
}),
|
20
20
|
summarize: async (input) => {
|
21
21
|
const { text } = await (0, generateText_js_1.generateText)(model, await prompt(input), options);
|
package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.d.ts
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
import {
|
1
|
+
import { TextGenerationModel, TextGenerationModelSettings } from "../../model-function/generate-text/TextGenerationModel.js";
|
2
2
|
import { FullTokenizer } from "../../model-function/tokenize-text/Tokenizer.js";
|
3
3
|
import { Run } from "../../run/Run.js";
|
4
4
|
/**
|
package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.js
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
import { generateText } from "../../model-function/generate-text/generateText.js";
|
2
|
-
import {
|
2
|
+
import { splitAtToken } from "../../text-chunk/split/splitRecursively.js";
|
3
3
|
import { summarizeRecursively } from "./summarizeRecursively.js";
|
4
4
|
/**
|
5
5
|
* Recursively summarizes a text using a text generation model, e.g. for summarization or text extraction.
|
@@ -10,9 +10,9 @@ export async function summarizeRecursivelyWithTextGenerationAndTokenSplitting({
|
|
10
10
|
(model.maxCompletionTokens ?? model.contextWindowSize / 4), join, }, options) {
|
11
11
|
const emptyPromptTokens = await model.countPromptTokens(await prompt({ text: "" }));
|
12
12
|
return summarizeRecursively({
|
13
|
-
split:
|
13
|
+
split: splitAtToken({
|
14
14
|
tokenizer: model.tokenizer,
|
15
|
-
|
15
|
+
maxTokensPerChunk: tokenLimit - emptyPromptTokens,
|
16
16
|
}),
|
17
17
|
summarize: async (input) => {
|
18
18
|
const { text } = await generateText(model, await prompt(input), options);
|
package/package.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
{
|
2
2
|
"name": "modelfusion",
|
3
3
|
"description": "Build AI applications, chatbots, and agents with JavaScript and TypeScript.",
|
4
|
-
"version": "0.
|
4
|
+
"version": "0.6.0",
|
5
5
|
"author": "Lars Grammel",
|
6
6
|
"license": "MIT",
|
7
7
|
"keywords": [
|
@@ -65,7 +65,7 @@
|
|
65
65
|
"@typescript-eslint/parser": "^6.1.0",
|
66
66
|
"copyfiles": "2.4.1",
|
67
67
|
"eslint": "^8.45.0",
|
68
|
-
"eslint-config-prettier": "
|
68
|
+
"eslint-config-prettier": "9.0.0",
|
69
69
|
"husky": "^8.0.3",
|
70
70
|
"lint-staged": "13.2.3",
|
71
71
|
"prettier": "3.0.1",
|
@@ -1,8 +1,8 @@
|
|
1
1
|
"use strict";
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.
|
3
|
+
exports.SimilarTextChunksFromVectorIndexRetriever = void 0;
|
4
4
|
const embedText_js_1 = require("../model-function/embed-text/embedText.cjs");
|
5
|
-
class
|
5
|
+
class SimilarTextChunksFromVectorIndexRetriever {
|
6
6
|
constructor({ vectorIndex, embeddingModel, maxResults, similarityThreshold, }) {
|
7
7
|
Object.defineProperty(this, "vectorIndex", {
|
8
8
|
enumerable: true,
|
@@ -48,10 +48,10 @@ class VectorIndexSimilarTextChunkRetriever {
|
|
48
48
|
return queryResult.map((item) => item.data);
|
49
49
|
}
|
50
50
|
withSettings(additionalSettings) {
|
51
|
-
return new
|
51
|
+
return new SimilarTextChunksFromVectorIndexRetriever(Object.assign({}, this.settings, additionalSettings, {
|
52
52
|
vectorIndex: this.vectorIndex,
|
53
53
|
embeddingModel: this.embeddingModel,
|
54
54
|
}));
|
55
55
|
}
|
56
56
|
}
|
57
|
-
exports.
|
57
|
+
exports.SimilarTextChunksFromVectorIndexRetriever = SimilarTextChunksFromVectorIndexRetriever;
|
@@ -1,20 +1,20 @@
|
|
1
1
|
import { FunctionOptions } from "../model-function/FunctionOptions.js";
|
2
2
|
import { TextEmbeddingModel, TextEmbeddingModelSettings } from "../model-function/embed-text/TextEmbeddingModel.js";
|
3
|
-
import { TextChunk } from "
|
4
|
-
import { TextChunkRetriever, TextChunkRetrieverSettings } from "
|
5
|
-
import { VectorIndex } from "
|
6
|
-
export interface
|
3
|
+
import { TextChunk } from "./TextChunk.js";
|
4
|
+
import { TextChunkRetriever, TextChunkRetrieverSettings } from "./retrieve-text-chunks/TextChunkRetriever.js";
|
5
|
+
import { VectorIndex } from "../vector-index/VectorIndex.js";
|
6
|
+
export interface SimilarTextChunksFromVectorIndexRetrieverSettings {
|
7
7
|
maxResults?: number;
|
8
8
|
similarityThreshold?: number;
|
9
9
|
}
|
10
|
-
export declare class
|
10
|
+
export declare class SimilarTextChunksFromVectorIndexRetriever<CHUNK extends TextChunk, INDEX, SETTINGS extends TextEmbeddingModelSettings> implements TextChunkRetriever<CHUNK, string, SimilarTextChunksFromVectorIndexRetrieverSettings> {
|
11
11
|
private readonly vectorIndex;
|
12
12
|
private readonly embeddingModel;
|
13
13
|
private readonly settings;
|
14
14
|
constructor({ vectorIndex, embeddingModel, maxResults, similarityThreshold, }: {
|
15
15
|
vectorIndex: VectorIndex<CHUNK, INDEX>;
|
16
16
|
embeddingModel: TextEmbeddingModel<unknown, SETTINGS>;
|
17
|
-
} &
|
17
|
+
} & SimilarTextChunksFromVectorIndexRetrieverSettings);
|
18
18
|
retrieveTextChunks(query: string, options?: FunctionOptions<TextChunkRetrieverSettings>): Promise<CHUNK[]>;
|
19
|
-
withSettings(additionalSettings: Partial<
|
19
|
+
withSettings(additionalSettings: Partial<SimilarTextChunksFromVectorIndexRetrieverSettings>): this;
|
20
20
|
}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import { embedText } from "../model-function/embed-text/embedText.js";
|
2
|
-
export class
|
2
|
+
export class SimilarTextChunksFromVectorIndexRetriever {
|
3
3
|
constructor({ vectorIndex, embeddingModel, maxResults, similarityThreshold, }) {
|
4
4
|
Object.defineProperty(this, "vectorIndex", {
|
5
5
|
enumerable: true,
|
@@ -45,7 +45,7 @@ export class VectorIndexSimilarTextChunkRetriever {
|
|
45
45
|
return queryResult.map((item) => item.data);
|
46
46
|
}
|
47
47
|
withSettings(additionalSettings) {
|
48
|
-
return new
|
48
|
+
return new SimilarTextChunksFromVectorIndexRetriever(Object.assign({}, this.settings, additionalSettings, {
|
49
49
|
vectorIndex: this.vectorIndex,
|
50
50
|
embeddingModel: this.embeddingModel,
|
51
51
|
}));
|
package/text-chunk/index.cjs
CHANGED
@@ -14,9 +14,12 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
15
15
|
};
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
17
|
+
__exportStar(require("./SimilarTextChunksFromVectorIndexRetriever.cjs"), exports);
|
17
18
|
__exportStar(require("./TextChunk.cjs"), exports);
|
18
19
|
__exportStar(require("./retrieve-text-chunks/TextChunkRetriever.cjs"), exports);
|
19
20
|
__exportStar(require("./retrieve-text-chunks/retrieveTextChunks.cjs"), exports);
|
20
21
|
__exportStar(require("./split/SplitFunction.cjs"), exports);
|
21
22
|
__exportStar(require("./split/splitOnSeparator.cjs"), exports);
|
22
23
|
__exportStar(require("./split/splitRecursively.cjs"), exports);
|
24
|
+
__exportStar(require("./split/splitTextChunks.cjs"), exports);
|
25
|
+
__exportStar(require("./upsertTextChunks.cjs"), exports);
|
package/text-chunk/index.d.ts
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
export * from "./SimilarTextChunksFromVectorIndexRetriever.js";
|
1
2
|
export * from "./TextChunk.js";
|
2
3
|
export * from "./retrieve-text-chunks/TextChunkRetriever.js";
|
3
4
|
export * from "./retrieve-text-chunks/retrieveTextChunks.js";
|
4
5
|
export * from "./split/SplitFunction.js";
|
5
6
|
export * from "./split/splitOnSeparator.js";
|
6
7
|
export * from "./split/splitRecursively.js";
|
8
|
+
export * from "./split/splitTextChunks.js";
|
9
|
+
export * from "./upsertTextChunks.js";
|
package/text-chunk/index.js
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
export * from "./SimilarTextChunksFromVectorIndexRetriever.js";
|
1
2
|
export * from "./TextChunk.js";
|
2
3
|
export * from "./retrieve-text-chunks/TextChunkRetriever.js";
|
3
4
|
export * from "./retrieve-text-chunks/retrieveTextChunks.js";
|
4
5
|
export * from "./split/SplitFunction.js";
|
5
6
|
export * from "./split/splitOnSeparator.js";
|
6
7
|
export * from "./split/splitRecursively.js";
|
8
|
+
export * from "./split/splitTextChunks.js";
|
9
|
+
export * from "./upsertTextChunks.js";
|
@@ -1,12 +1,10 @@
|
|
1
1
|
"use strict";
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
exports.splitOnSeparator = void 0;
|
4
|
+
/**
|
5
|
+
* Splits text on a separator string.
|
6
|
+
*/
|
7
|
+
function splitOnSeparator({ separator, }) {
|
8
|
+
return async ({ text }) => text.split(separator);
|
9
|
+
}
|
7
10
|
exports.splitOnSeparator = splitOnSeparator;
|
8
|
-
const splitOnSeparatorAsSplitFunction = ({ separator }) => async ({ text }) => (0, exports.splitOnSeparator)({
|
9
|
-
separator,
|
10
|
-
text,
|
11
|
-
});
|
12
|
-
exports.splitOnSeparatorAsSplitFunction = splitOnSeparatorAsSplitFunction;
|
@@ -1,8 +1,7 @@
|
|
1
1
|
import { SplitFunction } from "./SplitFunction.js";
|
2
|
-
|
2
|
+
/**
|
3
|
+
* Splits text on a separator string.
|
4
|
+
*/
|
5
|
+
export declare function splitOnSeparator({ separator, }: {
|
3
6
|
separator: string;
|
4
|
-
|
5
|
-
}) => Promise<string[]>;
|
6
|
-
export declare const splitOnSeparatorAsSplitFunction: ({ separator }: {
|
7
|
-
separator: string;
|
8
|
-
}) => SplitFunction;
|
7
|
+
}): SplitFunction;
|
@@ -1,7 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
export
|
5
|
-
separator
|
6
|
-
|
7
|
-
});
|
1
|
+
/**
|
2
|
+
* Splits text on a separator string.
|
3
|
+
*/
|
4
|
+
export function splitOnSeparator({ separator, }) {
|
5
|
+
return async ({ text }) => text.split(separator);
|
6
|
+
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
"use strict";
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.
|
3
|
+
exports.splitAtToken = exports.splitAtCharacter = void 0;
|
4
4
|
// when segments is a string, it splits by character, otherwise according to the provided segments
|
5
5
|
function splitRecursively({ maxChunkSize, segments, }) {
|
6
6
|
if (segments.length < maxChunkSize) {
|
@@ -20,22 +20,22 @@ function splitRecursively({ maxChunkSize, segments, }) {
|
|
20
20
|
}),
|
21
21
|
];
|
22
22
|
}
|
23
|
-
|
24
|
-
|
25
|
-
|
23
|
+
/**
|
24
|
+
* Splits text recursively until the resulting chunks are smaller than the `maxCharactersPerChunk`.
|
25
|
+
* The text is recursively split in the middle, so that all chunks are roughly the same size.
|
26
|
+
*/
|
27
|
+
const splitAtCharacter = ({ maxCharactersPerChunk, }) => async ({ text }) => splitRecursively({
|
28
|
+
maxChunkSize: maxCharactersPerChunk,
|
26
29
|
segments: text,
|
27
30
|
});
|
28
|
-
exports.
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
31
|
+
exports.splitAtCharacter = splitAtCharacter;
|
32
|
+
/**
|
33
|
+
* Splits text recursively until the resulting chunks are smaller than the `maxTokensPerChunk`,
|
34
|
+
* while respecting the token boundaries.
|
35
|
+
* The text is recursively split in the middle, so that all chunks are roughly the same size.
|
36
|
+
*/
|
37
|
+
const splitAtToken = ({ tokenizer, maxTokensPerChunk, }) => async ({ text }) => splitRecursively({
|
38
|
+
maxChunkSize: maxTokensPerChunk,
|
33
39
|
segments: (await tokenizer.tokenizeWithTexts(text)).tokenTexts,
|
34
40
|
});
|
35
|
-
exports.
|
36
|
-
const splitRecursivelyAtTokenAsSplitFunction = ({ tokenizer, maxChunkSize, }) => async ({ text }) => (0, exports.splitRecursivelyAtToken)({
|
37
|
-
tokenizer,
|
38
|
-
maxChunkSize,
|
39
|
-
text,
|
40
|
-
});
|
41
|
-
exports.splitRecursivelyAtTokenAsSplitFunction = splitRecursivelyAtTokenAsSplitFunction;
|
41
|
+
exports.splitAtToken = splitAtToken;
|
@@ -1,22 +1,18 @@
|
|
1
1
|
import { FullTokenizer } from "../../model-function/tokenize-text/Tokenizer.js";
|
2
2
|
import { SplitFunction } from "./SplitFunction.js";
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
export declare const
|
8
|
-
|
9
|
-
text: string;
|
10
|
-
}) => Promise<string[]>;
|
11
|
-
export declare const splitRecursivelyAtCharacterAsSplitFunction: ({ maxChunkSize }: {
|
12
|
-
maxChunkSize: number;
|
3
|
+
/**
|
4
|
+
* Splits text recursively until the resulting chunks are smaller than the `maxCharactersPerChunk`.
|
5
|
+
* The text is recursively split in the middle, so that all chunks are roughly the same size.
|
6
|
+
*/
|
7
|
+
export declare const splitAtCharacter: ({ maxCharactersPerChunk, }: {
|
8
|
+
maxCharactersPerChunk: number;
|
13
9
|
}) => SplitFunction;
|
14
|
-
|
10
|
+
/**
|
11
|
+
* Splits text recursively until the resulting chunks are smaller than the `maxTokensPerChunk`,
|
12
|
+
* while respecting the token boundaries.
|
13
|
+
* The text is recursively split in the middle, so that all chunks are roughly the same size.
|
14
|
+
*/
|
15
|
+
export declare const splitAtToken: ({ tokenizer, maxTokensPerChunk, }: {
|
15
16
|
tokenizer: FullTokenizer;
|
16
|
-
|
17
|
-
text: string;
|
18
|
-
}) => Promise<string[]>;
|
19
|
-
export declare const splitRecursivelyAtTokenAsSplitFunction: ({ tokenizer, maxChunkSize, }: {
|
20
|
-
tokenizer: FullTokenizer;
|
21
|
-
maxChunkSize: number;
|
17
|
+
maxTokensPerChunk: number;
|
22
18
|
}) => SplitFunction;
|
@@ -1,5 +1,5 @@
|
|
1
1
|
// when segments is a string, it splits by character, otherwise according to the provided segments
|
2
|
-
|
2
|
+
function splitRecursively({ maxChunkSize, segments, }) {
|
3
3
|
if (segments.length < maxChunkSize) {
|
4
4
|
return Array.isArray(segments) ? [segments.join("")] : [segments];
|
5
5
|
}
|
@@ -17,17 +17,20 @@ export function splitRecursively({ maxChunkSize, segments, }) {
|
|
17
17
|
}),
|
18
18
|
];
|
19
19
|
}
|
20
|
-
|
21
|
-
|
20
|
+
/**
|
21
|
+
* Splits text recursively until the resulting chunks are smaller than the `maxCharactersPerChunk`.
|
22
|
+
* The text is recursively split in the middle, so that all chunks are roughly the same size.
|
23
|
+
*/
|
24
|
+
export const splitAtCharacter = ({ maxCharactersPerChunk, }) => async ({ text }) => splitRecursively({
|
25
|
+
maxChunkSize: maxCharactersPerChunk,
|
22
26
|
segments: text,
|
23
27
|
});
|
24
|
-
|
25
|
-
|
26
|
-
|
28
|
+
/**
|
29
|
+
* Splits text recursively until the resulting chunks are smaller than the `maxTokensPerChunk`,
|
30
|
+
* while respecting the token boundaries.
|
31
|
+
* The text is recursively split in the middle, so that all chunks are roughly the same size.
|
32
|
+
*/
|
33
|
+
export const splitAtToken = ({ tokenizer, maxTokensPerChunk, }) => async ({ text }) => splitRecursively({
|
34
|
+
maxChunkSize: maxTokensPerChunk,
|
27
35
|
segments: (await tokenizer.tokenizeWithTexts(text)).tokenTexts,
|
28
36
|
});
|
29
|
-
export const splitRecursivelyAtTokenAsSplitFunction = ({ tokenizer, maxChunkSize, }) => async ({ text }) => splitRecursivelyAtToken({
|
30
|
-
tokenizer,
|
31
|
-
maxChunkSize,
|
32
|
-
text,
|
33
|
-
});
|
@@ -0,0 +1,16 @@
|
|
1
|
+
"use strict";
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
+
exports.splitTextChunk = exports.splitTextChunks = void 0;
|
4
|
+
async function splitTextChunks(splitFunction, inputs) {
|
5
|
+
const pageChunks = await Promise.all(inputs.map((input) => splitTextChunk(splitFunction, input)));
|
6
|
+
return pageChunks.flat();
|
7
|
+
}
|
8
|
+
exports.splitTextChunks = splitTextChunks;
|
9
|
+
async function splitTextChunk(splitFunction, input) {
|
10
|
+
const parts = await splitFunction(input);
|
11
|
+
return parts.map((text) => ({
|
12
|
+
...input,
|
13
|
+
text,
|
14
|
+
}));
|
15
|
+
}
|
16
|
+
exports.splitTextChunk = splitTextChunk;
|
@@ -0,0 +1,4 @@
|
|
1
|
+
import { TextChunk } from "../TextChunk.js";
|
2
|
+
import { SplitFunction } from "./SplitFunction.js";
|
3
|
+
export declare function splitTextChunks<CHUNK extends TextChunk>(splitFunction: SplitFunction, inputs: CHUNK[]): Promise<CHUNK[]>;
|
4
|
+
export declare function splitTextChunk<CHUNK extends TextChunk>(splitFunction: SplitFunction, input: CHUNK): Promise<CHUNK[]>;
|
@@ -0,0 +1,11 @@
|
|
1
|
+
export async function splitTextChunks(splitFunction, inputs) {
|
2
|
+
const pageChunks = await Promise.all(inputs.map((input) => splitTextChunk(splitFunction, input)));
|
3
|
+
return pageChunks.flat();
|
4
|
+
}
|
5
|
+
export async function splitTextChunk(splitFunction, input) {
|
6
|
+
const parts = await splitFunction(input);
|
7
|
+
return parts.map((text) => ({
|
8
|
+
...input,
|
9
|
+
text,
|
10
|
+
}));
|
11
|
+
}
|
@@ -5,7 +5,7 @@ const nanoid_1 = require("nanoid");
|
|
5
5
|
const embedText_js_1 = require("../model-function/embed-text/embedText.cjs");
|
6
6
|
async function upsertTextChunks({ vectorIndex, embeddingModel, generateId = nanoid_1.nanoid, chunks, ids, }, options) {
|
7
7
|
// many embedding models support bulk embedding, so we first embed all texts:
|
8
|
-
const { embeddings } = await (0, embedText_js_1.embedTexts)(embeddingModel, chunks.map((chunk) => chunk.
|
8
|
+
const { embeddings } = await (0, embedText_js_1.embedTexts)(embeddingModel, chunks.map((chunk) => chunk.text), options);
|
9
9
|
await vectorIndex.upsertMany(chunks.map((chunk, i) => ({
|
10
10
|
id: ids?.[i] ?? generateId(),
|
11
11
|
vector: embeddings[i],
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import { FunctionOptions } from "../model-function/FunctionOptions.js";
|
2
2
|
import { TextEmbeddingModel, TextEmbeddingModelSettings } from "../model-function/embed-text/TextEmbeddingModel.js";
|
3
|
-
import { TextChunk } from "
|
4
|
-
import { VectorIndex } from "
|
3
|
+
import { TextChunk } from "./TextChunk.js";
|
4
|
+
import { VectorIndex } from "../vector-index/VectorIndex.js";
|
5
5
|
export declare function upsertTextChunks<CHUNK extends TextChunk, SETTINGS extends TextEmbeddingModelSettings>({ vectorIndex, embeddingModel, generateId, chunks, ids, }: {
|
6
6
|
vectorIndex: VectorIndex<CHUNK, unknown>;
|
7
7
|
embeddingModel: TextEmbeddingModel<unknown, SETTINGS>;
|
@@ -2,7 +2,7 @@ import { nanoid as createId } from "nanoid";
|
|
2
2
|
import { embedTexts } from "../model-function/embed-text/embedText.js";
|
3
3
|
export async function upsertTextChunks({ vectorIndex, embeddingModel, generateId = createId, chunks, ids, }, options) {
|
4
4
|
// many embedding models support bulk embedding, so we first embed all texts:
|
5
|
-
const { embeddings } = await embedTexts(embeddingModel, chunks.map((chunk) => chunk.
|
5
|
+
const { embeddings } = await embedTexts(embeddingModel, chunks.map((chunk) => chunk.text), options);
|
6
6
|
await vectorIndex.upsertMany(chunks.map((chunk, i) => ({
|
7
7
|
id: ids?.[i] ?? generateId(),
|
8
8
|
vector: embeddings[i],
|
package/vector-index/index.cjs
CHANGED
@@ -15,8 +15,5 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
15
15
|
};
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
17
17
|
__exportStar(require("./VectorIndex.cjs"), exports);
|
18
|
-
__exportStar(require("./VectorIndexSimilarTextChunkRetriever.cjs"), exports);
|
19
|
-
__exportStar(require("./VectorIndexTextChunkStore.cjs"), exports);
|
20
18
|
__exportStar(require("./memory/MemoryVectorIndex.cjs"), exports);
|
21
19
|
__exportStar(require("./pinecone/PineconeVectorIndex.cjs"), exports);
|
22
|
-
__exportStar(require("./upsertTextChunks.cjs"), exports);
|
package/vector-index/index.d.ts
CHANGED
@@ -1,6 +1,3 @@
|
|
1
1
|
export * from "./VectorIndex.js";
|
2
|
-
export * from "./VectorIndexSimilarTextChunkRetriever.js";
|
3
|
-
export * from "./VectorIndexTextChunkStore.js";
|
4
2
|
export * from "./memory/MemoryVectorIndex.js";
|
5
3
|
export * from "./pinecone/PineconeVectorIndex.js";
|
6
|
-
export * from "./upsertTextChunks.js";
|
package/vector-index/index.js
CHANGED
@@ -1,6 +1,3 @@
|
|
1
1
|
export * from "./VectorIndex.js";
|
2
|
-
export * from "./VectorIndexSimilarTextChunkRetriever.js";
|
3
|
-
export * from "./VectorIndexTextChunkStore.js";
|
4
2
|
export * from "./memory/MemoryVectorIndex.js";
|
5
3
|
export * from "./pinecone/PineconeVectorIndex.js";
|
6
|
-
export * from "./upsertTextChunks.js";
|
@@ -1,77 +0,0 @@
|
|
1
|
-
"use strict";
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.VectorIndexTextChunkStore = void 0;
|
4
|
-
const nanoid_1 = require("nanoid");
|
5
|
-
const embedText_js_1 = require("../model-function/embed-text/embedText.cjs");
|
6
|
-
class VectorIndexTextChunkStore {
|
7
|
-
constructor({ index, generateId = nanoid_1.nanoid, embeddingModel, queryFunctionId, upsertFunctionId, }) {
|
8
|
-
Object.defineProperty(this, "_index", {
|
9
|
-
enumerable: true,
|
10
|
-
configurable: true,
|
11
|
-
writable: true,
|
12
|
-
value: void 0
|
13
|
-
});
|
14
|
-
Object.defineProperty(this, "generateId", {
|
15
|
-
enumerable: true,
|
16
|
-
configurable: true,
|
17
|
-
writable: true,
|
18
|
-
value: void 0
|
19
|
-
});
|
20
|
-
Object.defineProperty(this, "embeddingModel", {
|
21
|
-
enumerable: true,
|
22
|
-
configurable: true,
|
23
|
-
writable: true,
|
24
|
-
value: void 0
|
25
|
-
});
|
26
|
-
Object.defineProperty(this, "queryFunctionId", {
|
27
|
-
enumerable: true,
|
28
|
-
configurable: true,
|
29
|
-
writable: true,
|
30
|
-
value: void 0
|
31
|
-
});
|
32
|
-
Object.defineProperty(this, "upsertFunctionId", {
|
33
|
-
enumerable: true,
|
34
|
-
configurable: true,
|
35
|
-
writable: true,
|
36
|
-
value: void 0
|
37
|
-
});
|
38
|
-
this._index = index;
|
39
|
-
this.generateId = generateId;
|
40
|
-
this.embeddingModel = embeddingModel;
|
41
|
-
this.queryFunctionId = queryFunctionId;
|
42
|
-
this.upsertFunctionId = upsertFunctionId;
|
43
|
-
}
|
44
|
-
async upsertChunk({ id = this.generateId(), chunk, }, options) {
|
45
|
-
this.upsertManyChunks({
|
46
|
-
ids: [id],
|
47
|
-
chunks: [chunk],
|
48
|
-
}, options);
|
49
|
-
}
|
50
|
-
async upsertManyChunks({ ids, chunks, }, options) {
|
51
|
-
const { embeddings } = await (0, embedText_js_1.embedTexts)(this.embeddingModel, chunks.map((chunk) => chunk.content), {
|
52
|
-
functionId: this.upsertFunctionId,
|
53
|
-
run: options?.run,
|
54
|
-
});
|
55
|
-
this._index.upsertMany(embeddings.map((embedding, i) => ({
|
56
|
-
id: ids?.[i] ?? this.generateId(),
|
57
|
-
vector: embedding,
|
58
|
-
data: chunks[i],
|
59
|
-
})));
|
60
|
-
}
|
61
|
-
async retrieveSimilarTextChunks(queryText, options) {
|
62
|
-
const { embedding } = await (0, embedText_js_1.embedText)(this.embeddingModel, queryText, {
|
63
|
-
functionId: this.queryFunctionId,
|
64
|
-
run: options?.run,
|
65
|
-
});
|
66
|
-
const queryResult = await this._index.queryByVector({
|
67
|
-
queryVector: embedding,
|
68
|
-
maxResults: 1,
|
69
|
-
similarityThreshold: undefined,
|
70
|
-
});
|
71
|
-
return queryResult.map((item) => item.data);
|
72
|
-
}
|
73
|
-
get index() {
|
74
|
-
return this._index.asIndex();
|
75
|
-
}
|
76
|
-
}
|
77
|
-
exports.VectorIndexTextChunkStore = VectorIndexTextChunkStore;
|
@@ -1,35 +0,0 @@
|
|
1
|
-
import { TextEmbeddingModel, TextEmbeddingModelSettings } from "../model-function/embed-text/TextEmbeddingModel.js";
|
2
|
-
import { Run } from "../run/Run.js";
|
3
|
-
import { TextChunk } from "../text-chunk/TextChunk.js";
|
4
|
-
import { TextChunkRetrieverSettings } from "../text-chunk/retrieve-text-chunks/TextChunkRetriever.js";
|
5
|
-
import { VectorIndex } from "./VectorIndex.js";
|
6
|
-
import { FunctionOptions } from "../model-function/FunctionOptions.js";
|
7
|
-
export declare class VectorIndexTextChunkStore<CHUNK extends TextChunk, INDEX, MODEL extends TextEmbeddingModel<unknown, TextEmbeddingModelSettings>> {
|
8
|
-
private readonly _index;
|
9
|
-
private readonly generateId;
|
10
|
-
private readonly embeddingModel;
|
11
|
-
private readonly queryFunctionId?;
|
12
|
-
private readonly upsertFunctionId?;
|
13
|
-
constructor({ index, generateId, embeddingModel, queryFunctionId, upsertFunctionId, }: {
|
14
|
-
index: VectorIndex<CHUNK, INDEX>;
|
15
|
-
generateId?: () => string;
|
16
|
-
embeddingModel: MODEL;
|
17
|
-
queryFunctionId?: string;
|
18
|
-
upsertFunctionId?: string;
|
19
|
-
});
|
20
|
-
upsertChunk({ id, chunk, }: {
|
21
|
-
id?: string;
|
22
|
-
keyText: string;
|
23
|
-
chunk: CHUNK;
|
24
|
-
}, options?: {
|
25
|
-
run?: Run;
|
26
|
-
}): Promise<void>;
|
27
|
-
upsertManyChunks({ ids, chunks, }: {
|
28
|
-
ids?: Array<string | undefined>;
|
29
|
-
chunks: CHUNK[];
|
30
|
-
}, options?: {
|
31
|
-
run?: Run;
|
32
|
-
}): Promise<void>;
|
33
|
-
retrieveSimilarTextChunks(queryText: string, options?: FunctionOptions<TextChunkRetrieverSettings> | undefined): Promise<CHUNK[]>;
|
34
|
-
get index(): INDEX;
|
35
|
-
}
|
@@ -1,73 +0,0 @@
|
|
1
|
-
import { nanoid as createId } from "nanoid";
|
2
|
-
import { embedText, embedTexts, } from "../model-function/embed-text/embedText.js";
|
3
|
-
export class VectorIndexTextChunkStore {
|
4
|
-
constructor({ index, generateId = createId, embeddingModel, queryFunctionId, upsertFunctionId, }) {
|
5
|
-
Object.defineProperty(this, "_index", {
|
6
|
-
enumerable: true,
|
7
|
-
configurable: true,
|
8
|
-
writable: true,
|
9
|
-
value: void 0
|
10
|
-
});
|
11
|
-
Object.defineProperty(this, "generateId", {
|
12
|
-
enumerable: true,
|
13
|
-
configurable: true,
|
14
|
-
writable: true,
|
15
|
-
value: void 0
|
16
|
-
});
|
17
|
-
Object.defineProperty(this, "embeddingModel", {
|
18
|
-
enumerable: true,
|
19
|
-
configurable: true,
|
20
|
-
writable: true,
|
21
|
-
value: void 0
|
22
|
-
});
|
23
|
-
Object.defineProperty(this, "queryFunctionId", {
|
24
|
-
enumerable: true,
|
25
|
-
configurable: true,
|
26
|
-
writable: true,
|
27
|
-
value: void 0
|
28
|
-
});
|
29
|
-
Object.defineProperty(this, "upsertFunctionId", {
|
30
|
-
enumerable: true,
|
31
|
-
configurable: true,
|
32
|
-
writable: true,
|
33
|
-
value: void 0
|
34
|
-
});
|
35
|
-
this._index = index;
|
36
|
-
this.generateId = generateId;
|
37
|
-
this.embeddingModel = embeddingModel;
|
38
|
-
this.queryFunctionId = queryFunctionId;
|
39
|
-
this.upsertFunctionId = upsertFunctionId;
|
40
|
-
}
|
41
|
-
async upsertChunk({ id = this.generateId(), chunk, }, options) {
|
42
|
-
this.upsertManyChunks({
|
43
|
-
ids: [id],
|
44
|
-
chunks: [chunk],
|
45
|
-
}, options);
|
46
|
-
}
|
47
|
-
async upsertManyChunks({ ids, chunks, }, options) {
|
48
|
-
const { embeddings } = await embedTexts(this.embeddingModel, chunks.map((chunk) => chunk.content), {
|
49
|
-
functionId: this.upsertFunctionId,
|
50
|
-
run: options?.run,
|
51
|
-
});
|
52
|
-
this._index.upsertMany(embeddings.map((embedding, i) => ({
|
53
|
-
id: ids?.[i] ?? this.generateId(),
|
54
|
-
vector: embedding,
|
55
|
-
data: chunks[i],
|
56
|
-
})));
|
57
|
-
}
|
58
|
-
async retrieveSimilarTextChunks(queryText, options) {
|
59
|
-
const { embedding } = await embedText(this.embeddingModel, queryText, {
|
60
|
-
functionId: this.queryFunctionId,
|
61
|
-
run: options?.run,
|
62
|
-
});
|
63
|
-
const queryResult = await this._index.queryByVector({
|
64
|
-
queryVector: embedding,
|
65
|
-
maxResults: 1,
|
66
|
-
similarityThreshold: undefined,
|
67
|
-
});
|
68
|
-
return queryResult.map((item) => item.data);
|
69
|
-
}
|
70
|
-
get index() {
|
71
|
-
return this._index.asIndex();
|
72
|
-
}
|
73
|
-
}
|