pageindex-core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +139 -0
- package/dist/index.d.cts +49 -0
- package/dist/index.d.ts +49 -0
- package/dist/index.js +110 -0
- package/package.json +32 -0
- package/readme.md +20 -0
- package/src/__tests__/pageindex.test.ts +20 -0
- package/src/adapters/MemoryVectorStore.ts +41 -0
- package/src/adapters/VectorAdapter.ts +1 -0
- package/src/core/PageIndex.ts +63 -0
- package/src/index.ts +4 -0
- package/src/processors/PageSplitter.ts +9 -0
- package/src/public/createPageIndex.ts +6 -0
- package/src/types/index.types.ts +31 -0
- package/src/utils/withRetry.ts +16 -0
- package/tsconfig.json +13 -0
- package/vitest.config.ts +7 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
var index_exports = {};
|
|
22
|
+
__export(index_exports, {
|
|
23
|
+
MemoryVectorStore: () => MemoryVectorStore,
|
|
24
|
+
PageSplitter: () => PageSplitter,
|
|
25
|
+
createPageIndex: () => createPageIndex
|
|
26
|
+
});
|
|
27
|
+
module.exports = __toCommonJS(index_exports);
|
|
28
|
+
|
|
29
|
+
// src/core/PageIndex.ts
|
|
30
|
+
var import_crypto = require("crypto");
|
|
31
|
+
|
|
32
|
+
// src/utils/withRetry.ts
|
|
33
|
+
async function withRetry(fn, retries = 3) {
|
|
34
|
+
let lastError;
|
|
35
|
+
for (let i = 0; i < retries; i++) {
|
|
36
|
+
try {
|
|
37
|
+
return await fn();
|
|
38
|
+
} catch (err) {
|
|
39
|
+
lastError = err;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
throw lastError;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// src/core/PageIndex.ts
|
|
46
|
+
/**
 * Page-level index: turns raw page strings into page records, embeds them
 * (optionally after summarising) and stores them in the configured vector
 * store so they can be retrieved by similarity to a query.
 */
var PageIndex = class {
  /**
   * @param config Must provide `embeddingModel` and `vectorStore`; may provide
   *   `summarizer`, `batchSize` and `logger`.
   * @throws {Error} When `embeddingModel` or `vectorStore` is missing.
   */
  constructor(config) {
    this.config = config;
    this.pages = [];
    this.isBuilt = false;
    if (!config.embeddingModel || !config.vectorStore) {
      throw new Error("embeddingModel and vectorStore are required");
    }
  }
  /**
   * Replaces the pending page list with one record per input string and marks
   * the index as not built. Each record gets a fresh UUID and a 1-based
   * `pageNumber`. Nothing is written to the vector store here, and pages
   * upserted by an earlier `build()` are not removed from it.
   *
   * @param pages Page contents, in document order.
   */
  async ingest(pages) {
    this.pages = pages.map((content, i) => ({
      id: (0, import_crypto.randomUUID)(),
      pageNumber: i + 1,
      content
    }));
    this.isBuilt = false;
  }
  /**
   * Summarises (when a summarizer is configured), embeds and upserts every
   * ingested page. Batches of `batchSize` pages (default 5) run one after
   * another; the pages inside a batch run concurrently. Only the embedding
   * call goes through `withRetry`. The index is flagged as built only after
   * every batch has completed.
   *
   * @throws {RangeError} When `batchSize` is not a positive number.
   */
  async build() {
    const batchSize = this.config.batchSize ?? 5;
    // A zero or negative step would never advance the loop below, and NaN
    // would end it after an empty batch while still flagging the index built.
    if (!(batchSize > 0)) {
      throw new RangeError("batchSize must be a positive number");
    }
    for (let i = 0; i < this.pages.length; i += batchSize) {
      const batch = this.pages.slice(i, i + batchSize);
      await Promise.all(
        batch.map(async (page) => {
          if (this.config.summarizer) {
            page.summary = await this.config.summarizer(page.content);
          }
          // An empty or missing summary falls back to the raw page content.
          page.embedding = await withRetry(
            () => this.config.embeddingModel(page.summary || page.content)
          );
          await this.config.vectorStore.upsert(page);
          this.config.logger?.("page_indexed", { id: page.id });
        })
      );
    }
    this.isBuilt = true;
  }
  /**
   * Embeds `query` and asks the vector store for the closest pages.
   *
   * @param query Free-text query.
   * @param topK Maximum number of pages requested from the store (default 3).
   * @returns Whatever the vector store's `search` resolves with.
   * @throws {Error} "Index not built" when `build()` has not completed since
   *   the last `ingest()`.
   */
  async retrieve(query, topK = 3) {
    if (!this.isBuilt) throw new Error("Index not built");
    const queryEmbedding = await this.config.embeddingModel(query);
    return this.config.vectorStore.search(queryEmbedding, {
      topK
    });
  }
};
|
|
92
|
+
|
|
93
|
+
// src/public/createPageIndex.ts
|
|
94
|
+
/**
 * Public factory for the page index.
 *
 * @param config Configuration handed unchanged to the `PageIndex` constructor.
 * @returns A new `PageIndex` instance.
 */
function createPageIndex(config) {
  const pageIndex = new PageIndex(config);
  return pageIndex;
}
|
|
97
|
+
|
|
98
|
+
// src/adapters/MemoryVectorStore.ts
|
|
99
|
+
/**
 * Vector store that keeps every page in a Map keyed by page id and ranks
 * them by cosine similarity on each search.
 */
var MemoryVectorStore = class {
  constructor() {
    this.store = /* @__PURE__ */ new Map();
  }
  /**
   * Stores `page` under its id, replacing any page already stored there.
   * The page object itself is kept, not a copy.
   */
  async upsert(page) {
    this.store.set(page.id, page);
  }
  /**
   * Scores every stored page against `queryEmbedding` and returns the
   * `options.topK` best pages, highest score first. A page without an
   * embedding is scored against an empty vector.
   */
  async search(queryEmbedding, options) {
    const ranked = [];
    for (const page of this.store.values()) {
      const score = cosineSimilarity(queryEmbedding, page.embedding || []);
      ranked.push({ page, score });
    }
    ranked.sort((left, right) => right.score - left.score);
    const best = ranked.slice(0, options.topK);
    return best.map((entry) => entry.page);
  }
};
|
|
114
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Vectors of different length are compared as if the shorter one were padded
 * with zeros, whichever argument it is. Missing or NaN components count as 0
 * so one bad entry cannot turn the whole score into NaN.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero length.
 */
function cosineSimilarity(a, b) {
  // Cover the longer vector so that neither operand is silently truncated.
  const dimensions = Math.max(a.length, b.length);
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < dimensions; i++) {
    const x = a[i] || 0;
    const y = b[i] || 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero-magnitude vector has no direction; report "no similarity".
  return denominator === 0 ? 0 : dot / denominator;
}
|
|
123
|
+
|
|
124
|
+
// src/processors/PageSplitter.ts
|
|
125
|
+
/** Helpers for cutting a long text into page-sized strings. */
var PageSplitter = class {
  /**
   * Cuts `text` into consecutive slices of at most `maxChars` UTF-16 code
   * units. The cut positions ignore word and sentence boundaries.
   *
   * @param text Text to split.
   * @param maxChars Maximum slice length (default 3000).
   * @returns The slices in order; an empty array for empty text.
   * @throws {RangeError} When `maxChars` is not a positive number.
   */
  static splitByLength(text, maxChars = 3e3) {
    // A zero or negative step would never move the cursor past the end of the
    // text, so the loop below would not terminate.
    if (!(maxChars > 0)) {
      throw new RangeError("maxChars must be a positive number");
    }
    const pages = [];
    for (let start = 0; start < text.length; start += maxChars) {
      pages.push(text.slice(start, start + maxChars));
    }
    return pages;
  }
};
|
|
134
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
135
|
+
0 && (module.exports = {
|
|
136
|
+
MemoryVectorStore,
|
|
137
|
+
PageSplitter,
|
|
138
|
+
createPageIndex
|
|
139
|
+
});
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
interface PageUnit {
|
|
2
|
+
id: string;
|
|
3
|
+
pageNumber: number;
|
|
4
|
+
content: string;
|
|
5
|
+
summary?: string;
|
|
6
|
+
metadata?: Record<string, any>;
|
|
7
|
+
embedding?: number[];
|
|
8
|
+
}
|
|
9
|
+
interface SearchOptions {
|
|
10
|
+
topK: number;
|
|
11
|
+
filter?: Record<string, any>;
|
|
12
|
+
}
|
|
13
|
+
interface VectorAdapter {
|
|
14
|
+
upsert(page: PageUnit): Promise<void>;
|
|
15
|
+
search(queryEmbedding: number[], options: SearchOptions): Promise<PageUnit[]>;
|
|
16
|
+
get?(id: string): Promise<PageUnit | null>;
|
|
17
|
+
delete?(id: string): Promise<void>;
|
|
18
|
+
}
|
|
19
|
+
interface PageIndexConfig {
|
|
20
|
+
embeddingModel: (text: string) => Promise<number[]>;
|
|
21
|
+
summarizer?: (text: string) => Promise<string>;
|
|
22
|
+
vectorStore: VectorAdapter;
|
|
23
|
+
batchSize?: number;
|
|
24
|
+
logger?: (event: string, payload?: any) => void;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
declare class PageIndex {
|
|
28
|
+
private config;
|
|
29
|
+
private pages;
|
|
30
|
+
private isBuilt;
|
|
31
|
+
constructor(config: PageIndexConfig);
|
|
32
|
+
ingest(pages: string[]): Promise<void>;
|
|
33
|
+
build(): Promise<void>;
|
|
34
|
+
retrieve(query: string, topK?: number): Promise<PageUnit[]>;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
declare function createPageIndex(config: PageIndexConfig): PageIndex;
|
|
38
|
+
|
|
39
|
+
declare class MemoryVectorStore implements VectorAdapter {
|
|
40
|
+
private store;
|
|
41
|
+
upsert(page: PageUnit): Promise<void>;
|
|
42
|
+
search(queryEmbedding: number[], options: SearchOptions): Promise<PageUnit[]>;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
declare class PageSplitter {
|
|
46
|
+
static splitByLength(text: string, maxChars?: number): string[];
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export { MemoryVectorStore, type PageIndexConfig, PageSplitter, type PageUnit, type SearchOptions, type VectorAdapter, createPageIndex };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
interface PageUnit {
|
|
2
|
+
id: string;
|
|
3
|
+
pageNumber: number;
|
|
4
|
+
content: string;
|
|
5
|
+
summary?: string;
|
|
6
|
+
metadata?: Record<string, any>;
|
|
7
|
+
embedding?: number[];
|
|
8
|
+
}
|
|
9
|
+
interface SearchOptions {
|
|
10
|
+
topK: number;
|
|
11
|
+
filter?: Record<string, any>;
|
|
12
|
+
}
|
|
13
|
+
interface VectorAdapter {
|
|
14
|
+
upsert(page: PageUnit): Promise<void>;
|
|
15
|
+
search(queryEmbedding: number[], options: SearchOptions): Promise<PageUnit[]>;
|
|
16
|
+
get?(id: string): Promise<PageUnit | null>;
|
|
17
|
+
delete?(id: string): Promise<void>;
|
|
18
|
+
}
|
|
19
|
+
interface PageIndexConfig {
|
|
20
|
+
embeddingModel: (text: string) => Promise<number[]>;
|
|
21
|
+
summarizer?: (text: string) => Promise<string>;
|
|
22
|
+
vectorStore: VectorAdapter;
|
|
23
|
+
batchSize?: number;
|
|
24
|
+
logger?: (event: string, payload?: any) => void;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
declare class PageIndex {
|
|
28
|
+
private config;
|
|
29
|
+
private pages;
|
|
30
|
+
private isBuilt;
|
|
31
|
+
constructor(config: PageIndexConfig);
|
|
32
|
+
ingest(pages: string[]): Promise<void>;
|
|
33
|
+
build(): Promise<void>;
|
|
34
|
+
retrieve(query: string, topK?: number): Promise<PageUnit[]>;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
declare function createPageIndex(config: PageIndexConfig): PageIndex;
|
|
38
|
+
|
|
39
|
+
declare class MemoryVectorStore implements VectorAdapter {
|
|
40
|
+
private store;
|
|
41
|
+
upsert(page: PageUnit): Promise<void>;
|
|
42
|
+
search(queryEmbedding: number[], options: SearchOptions): Promise<PageUnit[]>;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
declare class PageSplitter {
|
|
46
|
+
static splitByLength(text: string, maxChars?: number): string[];
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export { MemoryVectorStore, type PageIndexConfig, PageSplitter, type PageUnit, type SearchOptions, type VectorAdapter, createPageIndex };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
// src/core/PageIndex.ts
|
|
2
|
+
import { randomUUID } from "crypto";
|
|
3
|
+
|
|
4
|
+
// src/utils/withRetry.ts
|
|
5
|
+
async function withRetry(fn, retries = 3) {
|
|
6
|
+
let lastError;
|
|
7
|
+
for (let i = 0; i < retries; i++) {
|
|
8
|
+
try {
|
|
9
|
+
return await fn();
|
|
10
|
+
} catch (err) {
|
|
11
|
+
lastError = err;
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
throw lastError;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// src/core/PageIndex.ts
|
|
18
|
+
/**
 * Page-level index: turns raw page strings into page records, embeds them
 * (optionally after summarising) and stores them in the configured vector
 * store so they can be retrieved by similarity to a query.
 */
var PageIndex = class {
  /**
   * @param config Must provide `embeddingModel` and `vectorStore`; may provide
   *   `summarizer`, `batchSize` and `logger`.
   * @throws {Error} When `embeddingModel` or `vectorStore` is missing.
   */
  constructor(config) {
    this.config = config;
    this.pages = [];
    this.isBuilt = false;
    if (!config.embeddingModel || !config.vectorStore) {
      throw new Error("embeddingModel and vectorStore are required");
    }
  }
  /**
   * Replaces the pending page list with one record per input string and marks
   * the index as not built. Each record gets a fresh UUID and a 1-based
   * `pageNumber`. Nothing is written to the vector store here, and pages
   * upserted by an earlier `build()` are not removed from it.
   *
   * @param pages Page contents, in document order.
   */
  async ingest(pages) {
    this.pages = pages.map((content, i) => ({
      id: randomUUID(),
      pageNumber: i + 1,
      content
    }));
    this.isBuilt = false;
  }
  /**
   * Summarises (when a summarizer is configured), embeds and upserts every
   * ingested page. Batches of `batchSize` pages (default 5) run one after
   * another; the pages inside a batch run concurrently. Only the embedding
   * call goes through `withRetry`. The index is flagged as built only after
   * every batch has completed.
   *
   * @throws {RangeError} When `batchSize` is not a positive number.
   */
  async build() {
    const batchSize = this.config.batchSize ?? 5;
    // A zero or negative step would never advance the loop below, and NaN
    // would end it after an empty batch while still flagging the index built.
    if (!(batchSize > 0)) {
      throw new RangeError("batchSize must be a positive number");
    }
    for (let i = 0; i < this.pages.length; i += batchSize) {
      const batch = this.pages.slice(i, i + batchSize);
      await Promise.all(
        batch.map(async (page) => {
          if (this.config.summarizer) {
            page.summary = await this.config.summarizer(page.content);
          }
          // An empty or missing summary falls back to the raw page content.
          page.embedding = await withRetry(
            () => this.config.embeddingModel(page.summary || page.content)
          );
          await this.config.vectorStore.upsert(page);
          this.config.logger?.("page_indexed", { id: page.id });
        })
      );
    }
    this.isBuilt = true;
  }
  /**
   * Embeds `query` and asks the vector store for the closest pages.
   *
   * @param query Free-text query.
   * @param topK Maximum number of pages requested from the store (default 3).
   * @returns Whatever the vector store's `search` resolves with.
   * @throws {Error} "Index not built" when `build()` has not completed since
   *   the last `ingest()`.
   */
  async retrieve(query, topK = 3) {
    if (!this.isBuilt) throw new Error("Index not built");
    const queryEmbedding = await this.config.embeddingModel(query);
    return this.config.vectorStore.search(queryEmbedding, {
      topK
    });
  }
};
|
|
64
|
+
|
|
65
|
+
// src/public/createPageIndex.ts
|
|
66
|
+
/**
 * Public factory for the page index.
 *
 * @param config Configuration handed unchanged to the `PageIndex` constructor.
 * @returns A new `PageIndex` instance.
 */
function createPageIndex(config) {
  const pageIndex = new PageIndex(config);
  return pageIndex;
}
|
|
69
|
+
|
|
70
|
+
// src/adapters/MemoryVectorStore.ts
|
|
71
|
+
/**
 * Vector store that keeps every page in a Map keyed by page id and ranks
 * them by cosine similarity on each search.
 */
var MemoryVectorStore = class {
  constructor() {
    this.store = /* @__PURE__ */ new Map();
  }
  /**
   * Stores `page` under its id, replacing any page already stored there.
   * The page object itself is kept, not a copy.
   */
  async upsert(page) {
    this.store.set(page.id, page);
  }
  /**
   * Scores every stored page against `queryEmbedding` and returns the
   * `options.topK` best pages, highest score first. A page without an
   * embedding is scored against an empty vector.
   */
  async search(queryEmbedding, options) {
    const ranked = [];
    for (const page of this.store.values()) {
      const score = cosineSimilarity(queryEmbedding, page.embedding || []);
      ranked.push({ page, score });
    }
    ranked.sort((left, right) => right.score - left.score);
    const best = ranked.slice(0, options.topK);
    return best.map((entry) => entry.page);
  }
};
|
|
86
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Vectors of different length are compared as if the shorter one were padded
 * with zeros, whichever argument it is. Missing or NaN components count as 0
 * so one bad entry cannot turn the whole score into NaN.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero length.
 */
function cosineSimilarity(a, b) {
  // Cover the longer vector so that neither operand is silently truncated.
  const dimensions = Math.max(a.length, b.length);
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < dimensions; i++) {
    const x = a[i] || 0;
    const y = b[i] || 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero-magnitude vector has no direction; report "no similarity".
  return denominator === 0 ? 0 : dot / denominator;
}
|
|
95
|
+
|
|
96
|
+
// src/processors/PageSplitter.ts
|
|
97
|
+
/** Helpers for cutting a long text into page-sized strings. */
var PageSplitter = class {
  /**
   * Cuts `text` into consecutive slices of at most `maxChars` UTF-16 code
   * units. The cut positions ignore word and sentence boundaries.
   *
   * @param text Text to split.
   * @param maxChars Maximum slice length (default 3000).
   * @returns The slices in order; an empty array for empty text.
   * @throws {RangeError} When `maxChars` is not a positive number.
   */
  static splitByLength(text, maxChars = 3e3) {
    // A zero or negative step would never move the cursor past the end of the
    // text, so the loop below would not terminate.
    if (!(maxChars > 0)) {
      throw new RangeError("maxChars must be a positive number");
    }
    const pages = [];
    for (let start = 0; start < text.length; start += maxChars) {
      pages.push(text.slice(start, start + maxChars));
    }
    return pages;
  }
};
|
|
106
|
+
export {
|
|
107
|
+
MemoryVectorStore,
|
|
108
|
+
PageSplitter,
|
|
109
|
+
createPageIndex
|
|
110
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pageindex-core",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Structure-aware page-level retrieval engine (RAG alternative)",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.cjs",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": "./dist/index.js",
|
|
12
|
+
"require": "./dist/index.cjs"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"sideEffects": false,
|
|
16
|
+
"scripts": {
|
|
17
|
+
"build": "tsup src/index.ts --format esm,cjs --dts",
|
|
18
|
+
"dev": "tsup src/index.ts --watch",
|
|
19
|
+
"test": "vitest",
|
|
20
|
+
"prepublishOnly": "npm run build"
|
|
21
|
+
},
|
|
22
|
+
"keywords": [
|
|
23
|
+
"rag",
|
|
24
|
+
"retrieval",
|
|
25
|
+
"llm",
|
|
26
|
+
"vector",
|
|
27
|
+
"semantic-search",
|
|
28
|
+
"pageindex"
|
|
29
|
+
],
|
|
30
|
+
"author": "Your Name",
|
|
31
|
+
"license": "MIT"
|
|
32
|
+
}
|
package/readme.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# pageindex-core
|
|
2
|
+
|
|
3
|
+
Structure-aware page-level retrieval engine for LLM systems.
|
|
4
|
+
|
|
5
|
+
## Why?
|
|
6
|
+
|
|
7
|
+
Traditional RAG uses arbitrary chunking which:
|
|
8
|
+
- Breaks structural context
|
|
9
|
+
- Loses page-level semantics
|
|
10
|
+
- Causes retrieval noise
|
|
11
|
+
|
|
12
|
+
PageIndex retrieves full logical pages first,
|
|
13
|
+
then allows refinement — preserving structure.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
npm install pageindex-core
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { createPageIndex } from "../index";
|
|
2
|
+
import { MemoryVectorStore } from "../adapters/MemoryVectorStore";
|
|
3
|
+
import { test, expect } from "vitest";
|
|
4
|
+
|
|
5
|
+
test("indexes and retrieves", async () => {
  // Letter-frequency embedding: texts that share letters point in similar
  // directions. A constant-filled vector would make every cosine score equal,
  // so the ranking could never be checked.
  const embedding = async (text: string) => {
    const counts: number[] = Array(26).fill(0);
    for (const ch of text.toLowerCase()) {
      const slot = ch.charCodeAt(0) - 97;
      if (slot >= 0 && slot < 26) counts[slot] += 1;
    }
    return counts;
  };

  const index = createPageIndex({
    embeddingModel: embedding,
    vectorStore: new MemoryVectorStore()
  });

  await index.ingest(["hello world", "another page"]);
  await index.build();

  const results = await index.retrieve("hello");

  // Both pages fit within the default topK of 3; the closest page comes first.
  expect(results).toHaveLength(2);
  expect(results[0].content).toBe("hello world");
  expect(results[0].pageNumber).toBe(1);
});
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import {
|
|
2
|
+
VectorAdapter,
|
|
3
|
+
PageUnit,
|
|
4
|
+
SearchOptions
|
|
5
|
+
} from "../types/index.types";
|
|
6
|
+
|
|
7
|
+
/**
 * Vector store that keeps every page in a Map keyed by page id and ranks
 * them by cosine similarity on each search.
 */
export class MemoryVectorStore implements VectorAdapter {
  private store = new Map<string, PageUnit>();

  /**
   * Stores `page` under its id, replacing any page already stored there.
   * The page object itself is kept, not a copy.
   */
  async upsert(page: PageUnit): Promise<void> {
    this.store.set(page.id, page);
  }

  /**
   * Scores every stored page against `queryEmbedding` and returns the
   * `options.topK` best pages, highest score first. A page without an
   * embedding is scored against an empty vector.
   */
  async search(
    queryEmbedding: number[],
    options: SearchOptions
  ): Promise<PageUnit[]> {
    const ranked: { page: PageUnit; score: number }[] = [];
    for (const page of this.store.values()) {
      const score = cosineSimilarity(queryEmbedding, page.embedding || []);
      ranked.push({ page, score });
    }

    ranked.sort((left, right) => right.score - left.score);
    const best = ranked.slice(0, options.topK);
    return best.map(entry => entry.page);
  }
}
|
|
30
|
+
|
|
31
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Vectors of different length are compared as if the shorter one were padded
 * with zeros, whichever argument it is. Missing or NaN components count as 0
 * so one bad entry cannot turn the whole score into NaN.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero length.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  // Cover the longer vector so that neither operand is silently truncated.
  const dimensions = Math.max(a.length, b.length);
  let dot = 0, normA = 0, normB = 0;

  for (let i = 0; i < dimensions; i++) {
    const x = a[i] || 0;
    const y = b[i] || 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero-magnitude vector has no direction; report "no similarity".
  return denominator === 0 ? 0 : dot / denominator;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "../types/index.types";
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { randomUUID } from "crypto";
|
|
2
|
+
import {
|
|
3
|
+
PageIndexConfig,
|
|
4
|
+
PageUnit
|
|
5
|
+
} from "../types/index.types";
|
|
6
|
+
import { withRetry } from "../utils/withRetry";
|
|
7
|
+
|
|
8
|
+
/**
 * Page-level index: turns raw page strings into page records, embeds them
 * (optionally after summarising) and stores them in the configured vector
 * store so they can be retrieved by similarity to a query.
 */
export class PageIndex {
  private pages: PageUnit[] = [];
  private isBuilt = false;

  /**
   * @param config Must provide `embeddingModel` and `vectorStore`; may provide
   *   `summarizer`, `batchSize` and `logger`.
   * @throws {Error} When `embeddingModel` or `vectorStore` is missing.
   */
  constructor(private config: PageIndexConfig) {
    if (!config.embeddingModel || !config.vectorStore) {
      throw new Error("embeddingModel and vectorStore are required");
    }
  }

  /**
   * Replaces the pending page list with one record per input string and marks
   * the index as not built. Each record gets a fresh UUID and a 1-based
   * `pageNumber`. Nothing is written to the vector store here, and pages
   * upserted by an earlier `build()` are not removed from it.
   *
   * @param pages Page contents, in document order.
   */
  async ingest(pages: string[]): Promise<void> {
    this.pages = pages.map((content, i) => ({
      id: randomUUID(),
      pageNumber: i + 1,
      content
    }));
    this.isBuilt = false;
  }

  /**
   * Summarises (when a summarizer is configured), embeds and upserts every
   * ingested page. Batches of `batchSize` pages (default 5) run one after
   * another; the pages inside a batch run concurrently. Only the embedding
   * call goes through `withRetry`. The index is flagged as built only after
   * every batch has completed.
   *
   * @throws {RangeError} When `batchSize` is not a positive number.
   */
  async build(): Promise<void> {
    const batchSize = this.config.batchSize ?? 5;

    // A zero or negative step would never advance the loop below, and NaN
    // would end it after an empty batch while still flagging the index built.
    if (!(batchSize > 0)) {
      throw new RangeError("batchSize must be a positive number");
    }

    for (let i = 0; i < this.pages.length; i += batchSize) {
      const batch = this.pages.slice(i, i + batchSize);

      await Promise.all(
        batch.map(async page => {
          if (this.config.summarizer) {
            page.summary = await this.config.summarizer(page.content);
          }

          // An empty or missing summary falls back to the raw page content.
          page.embedding = await withRetry(() =>
            this.config.embeddingModel(page.summary || page.content)
          );

          await this.config.vectorStore.upsert(page);
          this.config.logger?.("page_indexed", { id: page.id });
        })
      );
    }

    this.isBuilt = true;
  }

  /**
   * Embeds `query` and asks the vector store for the closest pages.
   *
   * @param query Free-text query.
   * @param topK Maximum number of pages requested from the store (default 3).
   * @returns Whatever the vector store's `search` resolves with.
   * @throws {Error} "Index not built" when `build()` has not completed since
   *   the last `ingest()`.
   */
  async retrieve(query: string, topK = 3): Promise<PageUnit[]> {
    if (!this.isBuilt) throw new Error("Index not built");

    const queryEmbedding = await this.config.embeddingModel(query);

    return this.config.vectorStore.search(queryEmbedding, {
      topK
    });
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/** One indexed page together with the data derived from it. */
export interface PageUnit {
  /** Unique identifier; `PageIndex.ingest` assigns a random UUID. */
  id: string;
  /** 1-based position of the page in the ingested list. */
  pageNumber: number;
  /** Raw page text. */
  content: string;
  /** Summary written by `PageIndex.build` when a summarizer is configured. */
  summary?: string;
  /** Free-form caller data. */
  metadata?: Record<string, any>;
  /** Embedding written by `PageIndex.build` for the summary or, failing that, the content. */
  embedding?: number[];
}
|
|
9
|
+
|
|
10
|
+
/** Options passed to `VectorAdapter.search`. */
export interface SearchOptions {
  /** Maximum number of pages to return. */
  topK: number;
  /** Adapter-specific filter; `MemoryVectorStore.search` does not read it. */
  filter?: Record<string, any>;
}
|
|
14
|
+
|
|
15
|
+
/** Storage backend contract used by `PageIndex`. */
export interface VectorAdapter {
  /** Inserts or replaces a page; `PageIndex.build` calls this once per page. */
  upsert(page: PageUnit): Promise<void>;
  /** Returns the pages matching `queryEmbedding`; `PageIndex.retrieve` returns this result as-is. */
  search(queryEmbedding: number[], options: SearchOptions): Promise<PageUnit[]>;
  /** Optional lookup by id. */
  get?(id: string): Promise<PageUnit | null>;
  /** Optional removal by id. */
  delete?(id: string): Promise<void>;
}
|
|
24
|
+
|
|
25
|
+
/** Configuration accepted by `PageIndex` / `createPageIndex`. */
export interface PageIndexConfig {
  /** Required. Maps a text to its embedding vector; used for both pages and queries. */
  embeddingModel: (text: string) => Promise<number[]>;
  /** Optional. When set, each page is summarised and the summary is embedded instead of the content. */
  summarizer?: (text: string) => Promise<string>;
  /** Required. Store that receives the indexed pages and answers searches. */
  vectorStore: VectorAdapter;
  /** Pages processed concurrently per batch during `build()`; 5 when omitted. */
  batchSize?: number;
  /** Optional event sink; `build()` reports "page_indexed" with the page id. */
  logger?: (event: string, payload?: any) => void;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Runs `fn` until it resolves, making at most `retries` attempts in total.
 *
 * Attempts are made back-to-back with no delay between them. `fn` is always
 * invoked at least once, so a `retries` value of 0, a negative number or NaN
 * yields a single attempt instead of rejecting with `undefined`.
 *
 * @param fn Async operation to run; called with no arguments on every attempt.
 * @param retries Maximum number of attempts (default 3).
 * @returns The value `fn` resolved with on the first successful attempt.
 * @throws The error raised by the final failed attempt.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  retries = 3
): Promise<T> {
  let lastError: unknown;
  let attempt = 0;

  do {
    try {
      return await fn();
    } catch (err) {
      // Remember the failure; it is rethrown only if no later attempt succeeds.
      lastError = err;
    }
    attempt += 1;
  } while (attempt < retries);

  throw lastError;
}
|
package/tsconfig.json
ADDED