modelfusion 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -2
- package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.cjs +5 -6
- package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.d.ts +3 -3
- package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.js +6 -7
- package/model-function/generate-text/streamText.d.ts +1 -1
- package/package.json +1 -1
- package/prompt/chat/trimChatPrompt.cjs +2 -1
- package/prompt/chat/trimChatPrompt.d.ts +1 -2
- package/prompt/chat/trimChatPrompt.js +2 -1
- package/{vector-index/VectorIndexSimilarTextChunkRetriever.cjs → text-chunk/SimilarTextChunksFromVectorIndexRetriever.cjs} +4 -4
- package/{vector-index/VectorIndexSimilarTextChunkRetriever.d.ts → text-chunk/SimilarTextChunksFromVectorIndexRetriever.d.ts} +7 -7
- package/{vector-index/VectorIndexSimilarTextChunkRetriever.js → text-chunk/SimilarTextChunksFromVectorIndexRetriever.js} +2 -2
- package/text-chunk/TextChunk.d.ts +1 -1
- package/text-chunk/index.cjs +3 -0
- package/text-chunk/index.d.ts +3 -0
- package/text-chunk/index.js +3 -0
- package/text-chunk/split/splitRecursively.cjs +3 -12
- package/text-chunk/split/splitRecursively.d.ts +2 -15
- package/text-chunk/split/splitRecursively.js +3 -9
- package/text-chunk/split/splitTextChunks.cjs +14 -0
- package/text-chunk/split/splitTextChunks.d.ts +3 -0
- package/text-chunk/split/splitTextChunks.js +10 -0
- package/{vector-index → text-chunk}/upsertTextChunks.cjs +1 -1
- package/{vector-index → text-chunk}/upsertTextChunks.d.ts +2 -2
- package/{vector-index → text-chunk}/upsertTextChunks.js +1 -1
- package/vector-index/index.cjs +0 -3
- package/vector-index/index.d.ts +0 -3
- package/vector-index/index.js +0 -3
- package/vector-index/VectorIndexTextChunkStore.cjs +0 -77
- package/vector-index/VectorIndexTextChunkStore.d.ts +0 -35
- package/vector-index/VectorIndexTextChunkStore.js +0 -73
package/README.md
CHANGED
@@ -317,12 +317,12 @@ const embeddingModel = new OpenAITextEmbeddingModel({
|
|
317
317
|
await upsertTextChunks({
|
318
318
|
vectorIndex,
|
319
319
|
embeddingModel,
|
320
|
-
chunks: texts.map((text) => ({
|
320
|
+
chunks: texts.map((text) => ({ text })),
|
321
321
|
});
|
322
322
|
|
323
323
|
// retrieve text chunks from the vector index - usually done at query time:
|
324
324
|
const { chunks } = await retrieveTextChunks(
|
325
|
-
new
|
325
|
+
new SimilarTextChunksFromVectorIndexRetriever({
|
326
326
|
vectorIndex,
|
327
327
|
embeddingModel,
|
328
328
|
maxResults: 3,
|
@@ -440,6 +440,12 @@ TypeScript implementation of the classic [BabyAGI](https://github.com/yoheinakaj
|
|
440
440
|
|
441
441
|
Small agent that solves middle school math problems. It uses a calculator tool to solve the problems.
|
442
442
|
|
443
|
+
### [Chat with PDF](https://github.com/lgrammel/modelfusion/tree/main/examples/pdf-chat-terminal)
|
444
|
+
|
445
|
+
> _terminal app_, _PDF parsing_, _in memory vector indices_, _retrieval augmented generation_, _hypothetical document embedding_
|
446
|
+
|
447
|
+
Ask questions about a PDF document and get answers from the document.
|
448
|
+
|
443
449
|
### [PDF to Tweet](https://github.com/lgrammel/modelfusion/tree/main/examples/pdf-to-tweet)
|
444
450
|
|
445
451
|
> _terminal app_, _PDF parsing_, _recursive information extraction_, _in memory vector index_, _style example retrieval_, _OpenAI GPT-4_, _cost calculation_
|
package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.cjs
CHANGED
@@ -9,17 +9,16 @@ const summarizeRecursively_js_1 = require("./summarizeRecursively.cjs");
|
|
9
9
|
* It automatically splits the text into optimal chunks that are small enough to be processed by the model,
|
10
10
|
* while leaving enough space for the model to generate text.
|
11
11
|
*/
|
12
|
-
async function summarizeRecursivelyWithTextGenerationAndTokenSplitting({ text, model, prompt,
|
12
|
+
async function summarizeRecursivelyWithTextGenerationAndTokenSplitting({ text, model, prompt, tokenLimit = model.contextWindowSize -
|
13
|
+
(model.maxCompletionTokens ?? model.contextWindowSize / 4), join, }, options) {
|
13
14
|
const emptyPromptTokens = await model.countPromptTokens(await prompt({ text: "" }));
|
14
15
|
return (0, summarizeRecursively_js_1.summarizeRecursively)({
|
15
|
-
split: (0, splitRecursively_js_1.
|
16
|
+
split: (0, splitRecursively_js_1.splitRecursivelyAtToken)({
|
16
17
|
tokenizer: model.tokenizer,
|
17
|
-
maxChunkSize:
|
18
|
-
reservedCompletionTokens -
|
19
|
-
emptyPromptTokens,
|
18
|
+
maxChunkSize: tokenLimit - emptyPromptTokens,
|
20
19
|
}),
|
21
20
|
summarize: async (input) => {
|
22
|
-
const { text } = await (0, generateText_js_1.generateText)(model
|
21
|
+
const { text } = await (0, generateText_js_1.generateText)(model, await prompt(input), options);
|
23
22
|
return text;
|
24
23
|
},
|
25
24
|
join,
|
package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.d.ts
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
import {
|
1
|
+
import { TextGenerationModel, TextGenerationModelSettings } from "../../model-function/generate-text/TextGenerationModel.js";
|
2
2
|
import { FullTokenizer } from "../../model-function/tokenize-text/Tokenizer.js";
|
3
3
|
import { Run } from "../../run/Run.js";
|
4
4
|
/**
|
@@ -6,7 +6,7 @@ import { Run } from "../../run/Run.js";
|
|
6
6
|
* It automatically splits the text into optimal chunks that are small enough to be processed by the model,
|
7
7
|
* while leaving enough space for the model to generate text.
|
8
8
|
*/
|
9
|
-
export declare function summarizeRecursivelyWithTextGenerationAndTokenSplitting<PROMPT>({ text, model, prompt,
|
9
|
+
export declare function summarizeRecursivelyWithTextGenerationAndTokenSplitting<PROMPT>({ text, model, prompt, tokenLimit, join, }: {
|
10
10
|
text: string;
|
11
11
|
model: TextGenerationModel<PROMPT, any, any, TextGenerationModelSettings> & {
|
12
12
|
contextWindowSize: number;
|
@@ -16,7 +16,7 @@ export declare function summarizeRecursivelyWithTextGenerationAndTokenSplitting<
|
|
16
16
|
prompt: (input: {
|
17
17
|
text: string;
|
18
18
|
}) => Promise<PROMPT>;
|
19
|
-
|
19
|
+
tokenLimit?: number;
|
20
20
|
join?: (texts: Array<string>) => string;
|
21
21
|
}, options?: {
|
22
22
|
functionId?: string;
|
package/composed-function/summarize/summarizeRecursivelyWithTextGenerationAndTokenSplitting.js
CHANGED
@@ -1,22 +1,21 @@
|
|
1
1
|
import { generateText } from "../../model-function/generate-text/generateText.js";
|
2
|
-
import {
|
2
|
+
import { splitRecursivelyAtToken } from "../../text-chunk/split/splitRecursively.js";
|
3
3
|
import { summarizeRecursively } from "./summarizeRecursively.js";
|
4
4
|
/**
|
5
5
|
* Recursively summarizes a text using a text generation model, e.g. for summarization or text extraction.
|
6
6
|
* It automatically splits the text into optimal chunks that are small enough to be processed by the model,
|
7
7
|
* while leaving enough space for the model to generate text.
|
8
8
|
*/
|
9
|
-
export async function summarizeRecursivelyWithTextGenerationAndTokenSplitting({ text, model, prompt,
|
9
|
+
export async function summarizeRecursivelyWithTextGenerationAndTokenSplitting({ text, model, prompt, tokenLimit = model.contextWindowSize -
|
10
|
+
(model.maxCompletionTokens ?? model.contextWindowSize / 4), join, }, options) {
|
10
11
|
const emptyPromptTokens = await model.countPromptTokens(await prompt({ text: "" }));
|
11
12
|
return summarizeRecursively({
|
12
|
-
split:
|
13
|
+
split: splitRecursivelyAtToken({
|
13
14
|
tokenizer: model.tokenizer,
|
14
|
-
maxChunkSize:
|
15
|
-
reservedCompletionTokens -
|
16
|
-
emptyPromptTokens,
|
15
|
+
maxChunkSize: tokenLimit - emptyPromptTokens,
|
17
16
|
}),
|
18
17
|
summarize: async (input) => {
|
19
|
-
const { text } = await generateText(model
|
18
|
+
const { text } = await generateText(model, await prompt(input), options);
|
20
19
|
return text;
|
21
20
|
},
|
22
21
|
join,
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import { FunctionOptions } from "../FunctionOptions.js";
|
2
|
+
import { CallMetadata } from "../executeCall.js";
|
2
3
|
import { DeltaEvent } from "./DeltaEvent.js";
|
3
4
|
import { TextGenerationModel, TextGenerationModelSettings } from "./TextGenerationModel.js";
|
4
|
-
import { CallMetadata } from "model-function/executeCall.js";
|
5
5
|
export declare function streamText<PROMPT, FULL_DELTA, SETTINGS extends TextGenerationModelSettings>(model: TextGenerationModel<PROMPT, unknown, FULL_DELTA, SETTINGS> & {
|
6
6
|
generateDeltaStreamResponse: (prompt: PROMPT, options: FunctionOptions<SETTINGS>) => PromiseLike<AsyncIterable<DeltaEvent<FULL_DELTA>>>;
|
7
7
|
extractTextDelta: (fullDelta: FULL_DELTA) => string | undefined;
|
package/package.json
CHANGED
@@ -12,7 +12,8 @@ const validateChatPrompt_js_1 = require("./validateChatPrompt.cjs");
|
|
12
12
|
*
|
13
13
|
* @see https://modelfusion.dev/guide/function/generate-text/prompt-mapping#limiting-the-chat-length
|
14
14
|
*/
|
15
|
-
async function trimChatPrompt({ prompt, model, tokenLimit = model.contextWindowSize -
|
15
|
+
async function trimChatPrompt({ prompt, model, tokenLimit = model.contextWindowSize -
|
16
|
+
(model.maxCompletionTokens ?? model.contextWindowSize / 4), }) {
|
16
17
|
(0, validateChatPrompt_js_1.validateChatPrompt)(prompt);
|
17
18
|
const startsWithSystemMessage = "system" in prompt[0];
|
18
19
|
const systemMessage = startsWithSystemMessage ? [prompt[0]] : [];
|
@@ -1,4 +1,4 @@
|
|
1
|
-
import { TextGenerationModel } from "model-function/generate-text/TextGenerationModel.js";
|
1
|
+
import { TextGenerationModel } from "../../model-function/generate-text/TextGenerationModel.js";
|
2
2
|
import { ChatPrompt } from "./ChatPrompt.js";
|
3
3
|
/**
|
4
4
|
* Keeps only the most recent messages in the prompt, while leaving enough space for the completion.
|
@@ -14,7 +14,6 @@ export declare function trimChatPrompt({ prompt, model, tokenLimit, }: {
|
|
14
14
|
prompt: ChatPrompt;
|
15
15
|
model: TextGenerationModel<ChatPrompt, any, any, any> & {
|
16
16
|
contextWindowSize: number;
|
17
|
-
maxCompletionTokens: number;
|
18
17
|
countPromptTokens: (prompt: ChatPrompt) => PromiseLike<number>;
|
19
18
|
};
|
20
19
|
tokenLimit?: number;
|
@@ -9,7 +9,8 @@ import { validateChatPrompt } from "./validateChatPrompt.js";
|
|
9
9
|
*
|
10
10
|
* @see https://modelfusion.dev/guide/function/generate-text/prompt-mapping#limiting-the-chat-length
|
11
11
|
*/
|
12
|
-
export async function trimChatPrompt({ prompt, model, tokenLimit = model.contextWindowSize -
|
12
|
+
export async function trimChatPrompt({ prompt, model, tokenLimit = model.contextWindowSize -
|
13
|
+
(model.maxCompletionTokens ?? model.contextWindowSize / 4), }) {
|
13
14
|
validateChatPrompt(prompt);
|
14
15
|
const startsWithSystemMessage = "system" in prompt[0];
|
15
16
|
const systemMessage = startsWithSystemMessage ? [prompt[0]] : [];
|
@@ -1,8 +1,8 @@
|
|
1
1
|
"use strict";
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.
|
3
|
+
exports.SimilarTextChunksFromVectorIndexRetriever = void 0;
|
4
4
|
const embedText_js_1 = require("../model-function/embed-text/embedText.cjs");
|
5
|
-
class
|
5
|
+
class SimilarTextChunksFromVectorIndexRetriever {
|
6
6
|
constructor({ vectorIndex, embeddingModel, maxResults, similarityThreshold, }) {
|
7
7
|
Object.defineProperty(this, "vectorIndex", {
|
8
8
|
enumerable: true,
|
@@ -48,10 +48,10 @@ class VectorIndexSimilarTextChunkRetriever {
|
|
48
48
|
return queryResult.map((item) => item.data);
|
49
49
|
}
|
50
50
|
withSettings(additionalSettings) {
|
51
|
-
return new
|
51
|
+
return new SimilarTextChunksFromVectorIndexRetriever(Object.assign({}, this.settings, additionalSettings, {
|
52
52
|
vectorIndex: this.vectorIndex,
|
53
53
|
embeddingModel: this.embeddingModel,
|
54
54
|
}));
|
55
55
|
}
|
56
56
|
}
|
57
|
-
exports.
|
57
|
+
exports.SimilarTextChunksFromVectorIndexRetriever = SimilarTextChunksFromVectorIndexRetriever;
|
@@ -1,20 +1,20 @@
|
|
1
1
|
import { FunctionOptions } from "../model-function/FunctionOptions.js";
|
2
2
|
import { TextEmbeddingModel, TextEmbeddingModelSettings } from "../model-function/embed-text/TextEmbeddingModel.js";
|
3
|
-
import { TextChunk } from "
|
4
|
-
import { TextChunkRetriever, TextChunkRetrieverSettings } from "
|
5
|
-
import { VectorIndex } from "
|
6
|
-
export interface
|
3
|
+
import { TextChunk } from "./TextChunk.js";
|
4
|
+
import { TextChunkRetriever, TextChunkRetrieverSettings } from "./retrieve-text-chunks/TextChunkRetriever.js";
|
5
|
+
import { VectorIndex } from "../vector-index/VectorIndex.js";
|
6
|
+
export interface SimilarTextChunksFromVectorIndexRetrieverSettings {
|
7
7
|
maxResults?: number;
|
8
8
|
similarityThreshold?: number;
|
9
9
|
}
|
10
|
-
export declare class
|
10
|
+
export declare class SimilarTextChunksFromVectorIndexRetriever<CHUNK extends TextChunk, INDEX, SETTINGS extends TextEmbeddingModelSettings> implements TextChunkRetriever<CHUNK, string, SimilarTextChunksFromVectorIndexRetrieverSettings> {
|
11
11
|
private readonly vectorIndex;
|
12
12
|
private readonly embeddingModel;
|
13
13
|
private readonly settings;
|
14
14
|
constructor({ vectorIndex, embeddingModel, maxResults, similarityThreshold, }: {
|
15
15
|
vectorIndex: VectorIndex<CHUNK, INDEX>;
|
16
16
|
embeddingModel: TextEmbeddingModel<unknown, SETTINGS>;
|
17
|
-
} &
|
17
|
+
} & SimilarTextChunksFromVectorIndexRetrieverSettings);
|
18
18
|
retrieveTextChunks(query: string, options?: FunctionOptions<TextChunkRetrieverSettings>): Promise<CHUNK[]>;
|
19
|
-
withSettings(additionalSettings: Partial<
|
19
|
+
withSettings(additionalSettings: Partial<SimilarTextChunksFromVectorIndexRetrieverSettings>): this;
|
20
20
|
}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import { embedText } from "../model-function/embed-text/embedText.js";
|
2
|
-
export class
|
2
|
+
export class SimilarTextChunksFromVectorIndexRetriever {
|
3
3
|
constructor({ vectorIndex, embeddingModel, maxResults, similarityThreshold, }) {
|
4
4
|
Object.defineProperty(this, "vectorIndex", {
|
5
5
|
enumerable: true,
|
@@ -45,7 +45,7 @@ export class VectorIndexSimilarTextChunkRetriever {
|
|
45
45
|
return queryResult.map((item) => item.data);
|
46
46
|
}
|
47
47
|
withSettings(additionalSettings) {
|
48
|
-
return new
|
48
|
+
return new SimilarTextChunksFromVectorIndexRetriever(Object.assign({}, this.settings, additionalSettings, {
|
49
49
|
vectorIndex: this.vectorIndex,
|
50
50
|
embeddingModel: this.embeddingModel,
|
51
51
|
}));
|
package/text-chunk/index.cjs
CHANGED
@@ -14,9 +14,12 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
15
15
|
};
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
17
|
+
__exportStar(require("./SimilarTextChunksFromVectorIndexRetriever.cjs"), exports);
|
17
18
|
__exportStar(require("./TextChunk.cjs"), exports);
|
18
19
|
__exportStar(require("./retrieve-text-chunks/TextChunkRetriever.cjs"), exports);
|
19
20
|
__exportStar(require("./retrieve-text-chunks/retrieveTextChunks.cjs"), exports);
|
20
21
|
__exportStar(require("./split/SplitFunction.cjs"), exports);
|
21
22
|
__exportStar(require("./split/splitOnSeparator.cjs"), exports);
|
22
23
|
__exportStar(require("./split/splitRecursively.cjs"), exports);
|
24
|
+
__exportStar(require("./split/splitTextChunks.cjs"), exports);
|
25
|
+
__exportStar(require("./upsertTextChunks.cjs"), exports);
|
package/text-chunk/index.d.ts
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
export * from "./SimilarTextChunksFromVectorIndexRetriever.js";
|
1
2
|
export * from "./TextChunk.js";
|
2
3
|
export * from "./retrieve-text-chunks/TextChunkRetriever.js";
|
3
4
|
export * from "./retrieve-text-chunks/retrieveTextChunks.js";
|
4
5
|
export * from "./split/SplitFunction.js";
|
5
6
|
export * from "./split/splitOnSeparator.js";
|
6
7
|
export * from "./split/splitRecursively.js";
|
8
|
+
export * from "./split/splitTextChunks.js";
|
9
|
+
export * from "./upsertTextChunks.js";
|
package/text-chunk/index.js
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
export * from "./SimilarTextChunksFromVectorIndexRetriever.js";
|
1
2
|
export * from "./TextChunk.js";
|
2
3
|
export * from "./retrieve-text-chunks/TextChunkRetriever.js";
|
3
4
|
export * from "./retrieve-text-chunks/retrieveTextChunks.js";
|
4
5
|
export * from "./split/SplitFunction.js";
|
5
6
|
export * from "./split/splitOnSeparator.js";
|
6
7
|
export * from "./split/splitRecursively.js";
|
8
|
+
export * from "./split/splitTextChunks.js";
|
9
|
+
export * from "./upsertTextChunks.js";
|
@@ -1,6 +1,6 @@
|
|
1
1
|
"use strict";
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.
|
3
|
+
exports.splitRecursivelyAtToken = exports.splitRecursivelyAtCharacter = void 0;
|
4
4
|
// when segments is a string, it splits by character, otherwise according to the provided segments
|
5
5
|
function splitRecursively({ maxChunkSize, segments, }) {
|
6
6
|
if (segments.length < maxChunkSize) {
|
@@ -20,22 +20,13 @@ function splitRecursively({ maxChunkSize, segments, }) {
|
|
20
20
|
}),
|
21
21
|
];
|
22
22
|
}
|
23
|
-
|
24
|
-
const splitRecursivelyAtCharacter = async ({ maxChunkSize, text, }) => splitRecursively({
|
23
|
+
const splitRecursivelyAtCharacter = ({ maxChunkSize }) => async ({ text }) => splitRecursively({
|
25
24
|
maxChunkSize,
|
26
25
|
segments: text,
|
27
26
|
});
|
28
27
|
exports.splitRecursivelyAtCharacter = splitRecursivelyAtCharacter;
|
29
|
-
const
|
30
|
-
exports.splitRecursivelyAtCharacterAsSplitFunction = splitRecursivelyAtCharacterAsSplitFunction;
|
31
|
-
const splitRecursivelyAtToken = async ({ tokenizer, maxChunkSize, text, }) => splitRecursively({
|
28
|
+
const splitRecursivelyAtToken = ({ tokenizer, maxChunkSize, }) => async ({ text }) => splitRecursively({
|
32
29
|
maxChunkSize,
|
33
30
|
segments: (await tokenizer.tokenizeWithTexts(text)).tokenTexts,
|
34
31
|
});
|
35
32
|
exports.splitRecursivelyAtToken = splitRecursivelyAtToken;
|
36
|
-
const splitRecursivelyAtTokenAsSplitFunction = ({ tokenizer, maxChunkSize, }) => async ({ text }) => (0, exports.splitRecursivelyAtToken)({
|
37
|
-
tokenizer,
|
38
|
-
maxChunkSize,
|
39
|
-
text,
|
40
|
-
});
|
41
|
-
exports.splitRecursivelyAtTokenAsSplitFunction = splitRecursivelyAtTokenAsSplitFunction;
|
@@ -1,22 +1,9 @@
|
|
1
1
|
import { FullTokenizer } from "../../model-function/tokenize-text/Tokenizer.js";
|
2
2
|
import { SplitFunction } from "./SplitFunction.js";
|
3
|
-
export declare
|
4
|
-
maxChunkSize: number;
|
5
|
-
segments: string | Array<string>;
|
6
|
-
}): Array<string>;
|
7
|
-
export declare const splitRecursivelyAtCharacter: ({ maxChunkSize, text, }: {
|
8
|
-
maxChunkSize: number;
|
9
|
-
text: string;
|
10
|
-
}) => Promise<string[]>;
|
11
|
-
export declare const splitRecursivelyAtCharacterAsSplitFunction: ({ maxChunkSize }: {
|
3
|
+
export declare const splitRecursivelyAtCharacter: ({ maxChunkSize }: {
|
12
4
|
maxChunkSize: number;
|
13
5
|
}) => SplitFunction;
|
14
|
-
export declare const splitRecursivelyAtToken: ({ tokenizer, maxChunkSize,
|
15
|
-
tokenizer: FullTokenizer;
|
16
|
-
maxChunkSize: number;
|
17
|
-
text: string;
|
18
|
-
}) => Promise<string[]>;
|
19
|
-
export declare const splitRecursivelyAtTokenAsSplitFunction: ({ tokenizer, maxChunkSize, }: {
|
6
|
+
export declare const splitRecursivelyAtToken: ({ tokenizer, maxChunkSize, }: {
|
20
7
|
tokenizer: FullTokenizer;
|
21
8
|
maxChunkSize: number;
|
22
9
|
}) => SplitFunction;
|
@@ -1,5 +1,5 @@
|
|
1
1
|
// when segments is a string, it splits by character, otherwise according to the provided segments
|
2
|
-
|
2
|
+
function splitRecursively({ maxChunkSize, segments, }) {
|
3
3
|
if (segments.length < maxChunkSize) {
|
4
4
|
return Array.isArray(segments) ? [segments.join("")] : [segments];
|
5
5
|
}
|
@@ -17,17 +17,11 @@ export function splitRecursively({ maxChunkSize, segments, }) {
|
|
17
17
|
}),
|
18
18
|
];
|
19
19
|
}
|
20
|
-
export const splitRecursivelyAtCharacter = async ({
|
20
|
+
export const splitRecursivelyAtCharacter = ({ maxChunkSize }) => async ({ text }) => splitRecursively({
|
21
21
|
maxChunkSize,
|
22
22
|
segments: text,
|
23
23
|
});
|
24
|
-
export const
|
25
|
-
export const splitRecursivelyAtToken = async ({ tokenizer, maxChunkSize, text, }) => splitRecursively({
|
24
|
+
export const splitRecursivelyAtToken = ({ tokenizer, maxChunkSize, }) => async ({ text }) => splitRecursively({
|
26
25
|
maxChunkSize,
|
27
26
|
segments: (await tokenizer.tokenizeWithTexts(text)).tokenTexts,
|
28
27
|
});
|
29
|
-
export const splitRecursivelyAtTokenAsSplitFunction = ({ tokenizer, maxChunkSize, }) => async ({ text }) => splitRecursivelyAtToken({
|
30
|
-
tokenizer,
|
31
|
-
maxChunkSize,
|
32
|
-
text,
|
33
|
-
});
|
@@ -0,0 +1,14 @@
|
|
1
|
+
"use strict";
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
+
exports.splitTextChunks = void 0;
|
4
|
+
async function splitTextChunks(splitFunction, inputs) {
|
5
|
+
const pageChunks = await Promise.all(inputs.map(async (input) => {
|
6
|
+
const parts = await splitFunction(input);
|
7
|
+
return parts.map((text) => ({
|
8
|
+
...input,
|
9
|
+
text,
|
10
|
+
}));
|
11
|
+
}));
|
12
|
+
return pageChunks.flat();
|
13
|
+
}
|
14
|
+
exports.splitTextChunks = splitTextChunks;
|
@@ -0,0 +1,10 @@
|
|
1
|
+
export async function splitTextChunks(splitFunction, inputs) {
|
2
|
+
const pageChunks = await Promise.all(inputs.map(async (input) => {
|
3
|
+
const parts = await splitFunction(input);
|
4
|
+
return parts.map((text) => ({
|
5
|
+
...input,
|
6
|
+
text,
|
7
|
+
}));
|
8
|
+
}));
|
9
|
+
return pageChunks.flat();
|
10
|
+
}
|
@@ -5,7 +5,7 @@ const nanoid_1 = require("nanoid");
|
|
5
5
|
const embedText_js_1 = require("../model-function/embed-text/embedText.cjs");
|
6
6
|
async function upsertTextChunks({ vectorIndex, embeddingModel, generateId = nanoid_1.nanoid, chunks, ids, }, options) {
|
7
7
|
// many embedding models support bulk embedding, so we first embed all texts:
|
8
|
-
const { embeddings } = await (0, embedText_js_1.embedTexts)(embeddingModel, chunks.map((chunk) => chunk.
|
8
|
+
const { embeddings } = await (0, embedText_js_1.embedTexts)(embeddingModel, chunks.map((chunk) => chunk.text), options);
|
9
9
|
await vectorIndex.upsertMany(chunks.map((chunk, i) => ({
|
10
10
|
id: ids?.[i] ?? generateId(),
|
11
11
|
vector: embeddings[i],
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import { FunctionOptions } from "../model-function/FunctionOptions.js";
|
2
2
|
import { TextEmbeddingModel, TextEmbeddingModelSettings } from "../model-function/embed-text/TextEmbeddingModel.js";
|
3
|
-
import { TextChunk } from "
|
4
|
-
import { VectorIndex } from "
|
3
|
+
import { TextChunk } from "./TextChunk.js";
|
4
|
+
import { VectorIndex } from "../vector-index/VectorIndex.js";
|
5
5
|
export declare function upsertTextChunks<CHUNK extends TextChunk, SETTINGS extends TextEmbeddingModelSettings>({ vectorIndex, embeddingModel, generateId, chunks, ids, }: {
|
6
6
|
vectorIndex: VectorIndex<CHUNK, unknown>;
|
7
7
|
embeddingModel: TextEmbeddingModel<unknown, SETTINGS>;
|
@@ -2,7 +2,7 @@ import { nanoid as createId } from "nanoid";
|
|
2
2
|
import { embedTexts } from "../model-function/embed-text/embedText.js";
|
3
3
|
export async function upsertTextChunks({ vectorIndex, embeddingModel, generateId = createId, chunks, ids, }, options) {
|
4
4
|
// many embedding models support bulk embedding, so we first embed all texts:
|
5
|
-
const { embeddings } = await embedTexts(embeddingModel, chunks.map((chunk) => chunk.
|
5
|
+
const { embeddings } = await embedTexts(embeddingModel, chunks.map((chunk) => chunk.text), options);
|
6
6
|
await vectorIndex.upsertMany(chunks.map((chunk, i) => ({
|
7
7
|
id: ids?.[i] ?? generateId(),
|
8
8
|
vector: embeddings[i],
|
package/vector-index/index.cjs
CHANGED
@@ -15,8 +15,5 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
15
15
|
};
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
17
17
|
__exportStar(require("./VectorIndex.cjs"), exports);
|
18
|
-
__exportStar(require("./VectorIndexSimilarTextChunkRetriever.cjs"), exports);
|
19
|
-
__exportStar(require("./VectorIndexTextChunkStore.cjs"), exports);
|
20
18
|
__exportStar(require("./memory/MemoryVectorIndex.cjs"), exports);
|
21
19
|
__exportStar(require("./pinecone/PineconeVectorIndex.cjs"), exports);
|
22
|
-
__exportStar(require("./upsertTextChunks.cjs"), exports);
|
package/vector-index/index.d.ts
CHANGED
@@ -1,6 +1,3 @@
|
|
1
1
|
export * from "./VectorIndex.js";
|
2
|
-
export * from "./VectorIndexSimilarTextChunkRetriever.js";
|
3
|
-
export * from "./VectorIndexTextChunkStore.js";
|
4
2
|
export * from "./memory/MemoryVectorIndex.js";
|
5
3
|
export * from "./pinecone/PineconeVectorIndex.js";
|
6
|
-
export * from "./upsertTextChunks.js";
|
package/vector-index/index.js
CHANGED
@@ -1,6 +1,3 @@
|
|
1
1
|
export * from "./VectorIndex.js";
|
2
|
-
export * from "./VectorIndexSimilarTextChunkRetriever.js";
|
3
|
-
export * from "./VectorIndexTextChunkStore.js";
|
4
2
|
export * from "./memory/MemoryVectorIndex.js";
|
5
3
|
export * from "./pinecone/PineconeVectorIndex.js";
|
6
|
-
export * from "./upsertTextChunks.js";
|
@@ -1,77 +0,0 @@
|
|
1
|
-
"use strict";
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
3
|
-
exports.VectorIndexTextChunkStore = void 0;
|
4
|
-
const nanoid_1 = require("nanoid");
|
5
|
-
const embedText_js_1 = require("../model-function/embed-text/embedText.cjs");
|
6
|
-
class VectorIndexTextChunkStore {
|
7
|
-
constructor({ index, generateId = nanoid_1.nanoid, embeddingModel, queryFunctionId, upsertFunctionId, }) {
|
8
|
-
Object.defineProperty(this, "_index", {
|
9
|
-
enumerable: true,
|
10
|
-
configurable: true,
|
11
|
-
writable: true,
|
12
|
-
value: void 0
|
13
|
-
});
|
14
|
-
Object.defineProperty(this, "generateId", {
|
15
|
-
enumerable: true,
|
16
|
-
configurable: true,
|
17
|
-
writable: true,
|
18
|
-
value: void 0
|
19
|
-
});
|
20
|
-
Object.defineProperty(this, "embeddingModel", {
|
21
|
-
enumerable: true,
|
22
|
-
configurable: true,
|
23
|
-
writable: true,
|
24
|
-
value: void 0
|
25
|
-
});
|
26
|
-
Object.defineProperty(this, "queryFunctionId", {
|
27
|
-
enumerable: true,
|
28
|
-
configurable: true,
|
29
|
-
writable: true,
|
30
|
-
value: void 0
|
31
|
-
});
|
32
|
-
Object.defineProperty(this, "upsertFunctionId", {
|
33
|
-
enumerable: true,
|
34
|
-
configurable: true,
|
35
|
-
writable: true,
|
36
|
-
value: void 0
|
37
|
-
});
|
38
|
-
this._index = index;
|
39
|
-
this.generateId = generateId;
|
40
|
-
this.embeddingModel = embeddingModel;
|
41
|
-
this.queryFunctionId = queryFunctionId;
|
42
|
-
this.upsertFunctionId = upsertFunctionId;
|
43
|
-
}
|
44
|
-
async upsertChunk({ id = this.generateId(), chunk, }, options) {
|
45
|
-
this.upsertManyChunks({
|
46
|
-
ids: [id],
|
47
|
-
chunks: [chunk],
|
48
|
-
}, options);
|
49
|
-
}
|
50
|
-
async upsertManyChunks({ ids, chunks, }, options) {
|
51
|
-
const { embeddings } = await (0, embedText_js_1.embedTexts)(this.embeddingModel, chunks.map((chunk) => chunk.content), {
|
52
|
-
functionId: this.upsertFunctionId,
|
53
|
-
run: options?.run,
|
54
|
-
});
|
55
|
-
this._index.upsertMany(embeddings.map((embedding, i) => ({
|
56
|
-
id: ids?.[i] ?? this.generateId(),
|
57
|
-
vector: embedding,
|
58
|
-
data: chunks[i],
|
59
|
-
})));
|
60
|
-
}
|
61
|
-
async retrieveSimilarTextChunks(queryText, options) {
|
62
|
-
const { embedding } = await (0, embedText_js_1.embedText)(this.embeddingModel, queryText, {
|
63
|
-
functionId: this.queryFunctionId,
|
64
|
-
run: options?.run,
|
65
|
-
});
|
66
|
-
const queryResult = await this._index.queryByVector({
|
67
|
-
queryVector: embedding,
|
68
|
-
maxResults: 1,
|
69
|
-
similarityThreshold: undefined,
|
70
|
-
});
|
71
|
-
return queryResult.map((item) => item.data);
|
72
|
-
}
|
73
|
-
get index() {
|
74
|
-
return this._index.asIndex();
|
75
|
-
}
|
76
|
-
}
|
77
|
-
exports.VectorIndexTextChunkStore = VectorIndexTextChunkStore;
|
@@ -1,35 +0,0 @@
|
|
1
|
-
import { TextEmbeddingModel, TextEmbeddingModelSettings } from "../model-function/embed-text/TextEmbeddingModel.js";
|
2
|
-
import { Run } from "../run/Run.js";
|
3
|
-
import { TextChunk } from "../text-chunk/TextChunk.js";
|
4
|
-
import { TextChunkRetrieverSettings } from "../text-chunk/retrieve-text-chunks/TextChunkRetriever.js";
|
5
|
-
import { VectorIndex } from "./VectorIndex.js";
|
6
|
-
import { FunctionOptions } from "../model-function/FunctionOptions.js";
|
7
|
-
export declare class VectorIndexTextChunkStore<CHUNK extends TextChunk, INDEX, MODEL extends TextEmbeddingModel<unknown, TextEmbeddingModelSettings>> {
|
8
|
-
private readonly _index;
|
9
|
-
private readonly generateId;
|
10
|
-
private readonly embeddingModel;
|
11
|
-
private readonly queryFunctionId?;
|
12
|
-
private readonly upsertFunctionId?;
|
13
|
-
constructor({ index, generateId, embeddingModel, queryFunctionId, upsertFunctionId, }: {
|
14
|
-
index: VectorIndex<CHUNK, INDEX>;
|
15
|
-
generateId?: () => string;
|
16
|
-
embeddingModel: MODEL;
|
17
|
-
queryFunctionId?: string;
|
18
|
-
upsertFunctionId?: string;
|
19
|
-
});
|
20
|
-
upsertChunk({ id, chunk, }: {
|
21
|
-
id?: string;
|
22
|
-
keyText: string;
|
23
|
-
chunk: CHUNK;
|
24
|
-
}, options?: {
|
25
|
-
run?: Run;
|
26
|
-
}): Promise<void>;
|
27
|
-
upsertManyChunks({ ids, chunks, }: {
|
28
|
-
ids?: Array<string | undefined>;
|
29
|
-
chunks: CHUNK[];
|
30
|
-
}, options?: {
|
31
|
-
run?: Run;
|
32
|
-
}): Promise<void>;
|
33
|
-
retrieveSimilarTextChunks(queryText: string, options?: FunctionOptions<TextChunkRetrieverSettings> | undefined): Promise<CHUNK[]>;
|
34
|
-
get index(): INDEX;
|
35
|
-
}
|
@@ -1,73 +0,0 @@
|
|
1
|
-
import { nanoid as createId } from "nanoid";
|
2
|
-
import { embedText, embedTexts, } from "../model-function/embed-text/embedText.js";
|
3
|
-
export class VectorIndexTextChunkStore {
|
4
|
-
constructor({ index, generateId = createId, embeddingModel, queryFunctionId, upsertFunctionId, }) {
|
5
|
-
Object.defineProperty(this, "_index", {
|
6
|
-
enumerable: true,
|
7
|
-
configurable: true,
|
8
|
-
writable: true,
|
9
|
-
value: void 0
|
10
|
-
});
|
11
|
-
Object.defineProperty(this, "generateId", {
|
12
|
-
enumerable: true,
|
13
|
-
configurable: true,
|
14
|
-
writable: true,
|
15
|
-
value: void 0
|
16
|
-
});
|
17
|
-
Object.defineProperty(this, "embeddingModel", {
|
18
|
-
enumerable: true,
|
19
|
-
configurable: true,
|
20
|
-
writable: true,
|
21
|
-
value: void 0
|
22
|
-
});
|
23
|
-
Object.defineProperty(this, "queryFunctionId", {
|
24
|
-
enumerable: true,
|
25
|
-
configurable: true,
|
26
|
-
writable: true,
|
27
|
-
value: void 0
|
28
|
-
});
|
29
|
-
Object.defineProperty(this, "upsertFunctionId", {
|
30
|
-
enumerable: true,
|
31
|
-
configurable: true,
|
32
|
-
writable: true,
|
33
|
-
value: void 0
|
34
|
-
});
|
35
|
-
this._index = index;
|
36
|
-
this.generateId = generateId;
|
37
|
-
this.embeddingModel = embeddingModel;
|
38
|
-
this.queryFunctionId = queryFunctionId;
|
39
|
-
this.upsertFunctionId = upsertFunctionId;
|
40
|
-
}
|
41
|
-
async upsertChunk({ id = this.generateId(), chunk, }, options) {
|
42
|
-
this.upsertManyChunks({
|
43
|
-
ids: [id],
|
44
|
-
chunks: [chunk],
|
45
|
-
}, options);
|
46
|
-
}
|
47
|
-
async upsertManyChunks({ ids, chunks, }, options) {
|
48
|
-
const { embeddings } = await embedTexts(this.embeddingModel, chunks.map((chunk) => chunk.content), {
|
49
|
-
functionId: this.upsertFunctionId,
|
50
|
-
run: options?.run,
|
51
|
-
});
|
52
|
-
this._index.upsertMany(embeddings.map((embedding, i) => ({
|
53
|
-
id: ids?.[i] ?? this.generateId(),
|
54
|
-
vector: embedding,
|
55
|
-
data: chunks[i],
|
56
|
-
})));
|
57
|
-
}
|
58
|
-
async retrieveSimilarTextChunks(queryText, options) {
|
59
|
-
const { embedding } = await embedText(this.embeddingModel, queryText, {
|
60
|
-
functionId: this.queryFunctionId,
|
61
|
-
run: options?.run,
|
62
|
-
});
|
63
|
-
const queryResult = await this._index.queryByVector({
|
64
|
-
queryVector: embedding,
|
65
|
-
maxResults: 1,
|
66
|
-
similarityThreshold: undefined,
|
67
|
-
});
|
68
|
-
return queryResult.map((item) => item.data);
|
69
|
-
}
|
70
|
-
get index() {
|
71
|
-
return this._index.asIndex();
|
72
|
-
}
|
73
|
-
}
|