@contractspec/lib.knowledge 1.56.1 → 1.58.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/access/guard.d.ts +13 -17
- package/dist/access/guard.d.ts.map +1 -1
- package/dist/access/guard.js +60 -49
- package/dist/access/index.d.ts +2 -2
- package/dist/access/index.d.ts.map +1 -0
- package/dist/access/index.js +60 -2
- package/dist/index.d.ts +6 -12
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +455 -12
- package/dist/ingestion/document-processor.d.ts +18 -20
- package/dist/ingestion/document-processor.d.ts.map +1 -1
- package/dist/ingestion/document-processor.js +63 -53
- package/dist/ingestion/embedding-service.d.ts +7 -11
- package/dist/ingestion/embedding-service.d.ts.map +1 -1
- package/dist/ingestion/embedding-service.js +26 -25
- package/dist/ingestion/gmail-adapter.d.ts +13 -17
- package/dist/ingestion/gmail-adapter.d.ts.map +1 -1
- package/dist/ingestion/gmail-adapter.js +67 -46
- package/dist/ingestion/index.d.ts +6 -6
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +221 -6
- package/dist/ingestion/storage-adapter.d.ts +10 -14
- package/dist/ingestion/storage-adapter.d.ts.map +1 -1
- package/dist/ingestion/storage-adapter.js +31 -26
- package/dist/ingestion/vector-indexer.d.ts +11 -15
- package/dist/ingestion/vector-indexer.d.ts.map +1 -1
- package/dist/ingestion/vector-indexer.js +32 -32
- package/dist/node/access/guard.js +60 -0
- package/dist/node/access/index.js +60 -0
- package/dist/node/index.js +454 -0
- package/dist/node/ingestion/document-processor.js +64 -0
- package/dist/node/ingestion/embedding-service.js +26 -0
- package/dist/node/ingestion/gmail-adapter.js +72 -0
- package/dist/node/ingestion/index.js +221 -0
- package/dist/node/ingestion/storage-adapter.js +31 -0
- package/dist/node/ingestion/vector-indexer.js +32 -0
- package/dist/node/query/index.js +79 -0
- package/dist/node/query/service.js +79 -0
- package/dist/node/retriever/index.js +100 -0
- package/dist/node/retriever/interface.js +0 -0
- package/dist/node/retriever/static-retriever.js +43 -0
- package/dist/node/retriever/vector-retriever.js +58 -0
- package/dist/node/types.js +0 -0
- package/dist/query/index.d.ts +2 -2
- package/dist/query/index.d.ts.map +1 -0
- package/dist/query/index.js +79 -2
- package/dist/query/service.d.ts +20 -24
- package/dist/query/service.d.ts.map +1 -1
- package/dist/query/service.js +76 -62
- package/dist/retriever/index.d.ts +4 -4
- package/dist/retriever/index.d.ts.map +1 -0
- package/dist/retriever/index.js +100 -3
- package/dist/retriever/interface.d.ts +38 -43
- package/dist/retriever/interface.d.ts.map +1 -1
- package/dist/retriever/interface.js +1 -0
- package/dist/retriever/static-retriever.d.ts +13 -18
- package/dist/retriever/static-retriever.d.ts.map +1 -1
- package/dist/retriever/static-retriever.js +42 -46
- package/dist/retriever/vector-retriever.d.ts +23 -28
- package/dist/retriever/vector-retriever.d.ts.map +1 -1
- package/dist/retriever/vector-retriever.js +57 -59
- package/dist/types.d.ts +34 -39
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -0
- package/package.json +152 -45
- package/dist/access/guard.js.map +0 -1
- package/dist/ingestion/document-processor.js.map +0 -1
- package/dist/ingestion/embedding-service.js.map +0 -1
- package/dist/ingestion/gmail-adapter.js.map +0 -1
- package/dist/ingestion/storage-adapter.js.map +0 -1
- package/dist/ingestion/vector-indexer.js.map +0 -1
- package/dist/query/service.js.map +0 -1
- package/dist/retriever/static-retriever.js.map +0 -1
- package/dist/retriever/vector-retriever.js.map +0 -1
|
@@ -1,26 +1,27 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
1
|
+
// @bun
|
|
2
|
+
// src/ingestion/embedding-service.ts
|
|
3
|
+
class EmbeddingService {
|
|
4
|
+
provider;
|
|
5
|
+
batchSize;
|
|
6
|
+
constructor(provider, batchSize = 16) {
|
|
7
|
+
this.provider = provider;
|
|
8
|
+
this.batchSize = batchSize;
|
|
9
|
+
}
|
|
10
|
+
async embedFragments(fragments) {
|
|
11
|
+
const results = [];
|
|
12
|
+
for (let i = 0;i < fragments.length; i += this.batchSize) {
|
|
13
|
+
const slice = fragments.slice(i, i + this.batchSize);
|
|
14
|
+
const documents = slice.map((fragment) => ({
|
|
15
|
+
id: fragment.id,
|
|
16
|
+
text: fragment.text,
|
|
17
|
+
metadata: fragment.metadata
|
|
18
|
+
}));
|
|
19
|
+
const embeddings = await this.provider.embedDocuments(documents);
|
|
20
|
+
results.push(...embeddings);
|
|
21
|
+
}
|
|
22
|
+
return results;
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
export {
|
|
26
|
+
EmbeddingService
|
|
22
27
|
};
|
|
23
|
-
|
|
24
|
-
//#endregion
|
|
25
|
-
export { EmbeddingService };
|
|
26
|
-
//# sourceMappingURL=embedding-service.js.map
|
|
@@ -1,19 +1,15 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
ingestThread(thread: EmailThread): Promise<void>;
|
|
15
|
-
private toRawDocument;
|
|
1
|
+
import type { EmailInboundProvider, EmailThread } from '@contractspec/lib.contracts';
|
|
2
|
+
import type { DocumentProcessor } from './document-processor';
|
|
3
|
+
import type { EmbeddingService } from './embedding-service';
|
|
4
|
+
import type { VectorIndexer } from './vector-indexer';
|
|
5
|
+
export declare class GmailIngestionAdapter {
|
|
6
|
+
private readonly gmail;
|
|
7
|
+
private readonly processor;
|
|
8
|
+
private readonly embeddings;
|
|
9
|
+
private readonly indexer;
|
|
10
|
+
constructor(gmail: EmailInboundProvider, processor: DocumentProcessor, embeddings: EmbeddingService, indexer: VectorIndexer);
|
|
11
|
+
syncThreads(query?: Parameters<EmailInboundProvider['listThreads']>[0]): Promise<void>;
|
|
12
|
+
ingestThread(thread: EmailThread): Promise<void>;
|
|
13
|
+
private toRawDocument;
|
|
16
14
|
}
|
|
17
|
-
//#endregion
|
|
18
|
-
export { GmailIngestionAdapter };
|
|
19
15
|
//# sourceMappingURL=gmail-adapter.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"gmail-adapter.d.ts","
|
|
1
|
+
{"version":3,"file":"gmail-adapter.d.ts","sourceRoot":"","sources":["../../src/ingestion/gmail-adapter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,oBAAoB,EACpB,WAAW,EACZ,MAAM,6BAA6B,CAAC;AACrC,OAAO,KAAK,EAAE,iBAAiB,EAAe,MAAM,sBAAsB,CAAC;AAC3E,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAEtD,qBAAa,qBAAqB;IAE9B,OAAO,CAAC,QAAQ,CAAC,KAAK;IACtB,OAAO,CAAC,QAAQ,CAAC,SAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,UAAU;IAC3B,OAAO,CAAC,QAAQ,CAAC,OAAO;gBAHP,KAAK,EAAE,oBAAoB,EAC3B,SAAS,EAAE,iBAAiB,EAC5B,UAAU,EAAE,gBAAgB,EAC5B,OAAO,EAAE,aAAa;IAGnC,WAAW,CACf,KAAK,CAAC,EAAE,UAAU,CAAC,oBAAoB,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC;IAQtD,YAAY,CAAC,MAAM,EAAE,WAAW;IAOtC,OAAO,CAAC,aAAa;CAatB"}
|
|
@@ -1,52 +1,73 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
1
|
+
// @bun
|
|
2
|
+
// src/ingestion/gmail-adapter.ts
|
|
3
|
+
class GmailIngestionAdapter {
|
|
4
|
+
gmail;
|
|
5
|
+
processor;
|
|
6
|
+
embeddings;
|
|
7
|
+
indexer;
|
|
8
|
+
constructor(gmail, processor, embeddings, indexer) {
|
|
9
|
+
this.gmail = gmail;
|
|
10
|
+
this.processor = processor;
|
|
11
|
+
this.embeddings = embeddings;
|
|
12
|
+
this.indexer = indexer;
|
|
13
|
+
}
|
|
14
|
+
async syncThreads(query) {
|
|
15
|
+
const threads = await this.gmail.listThreads(query);
|
|
16
|
+
for (const thread of threads) {
|
|
17
|
+
await this.ingestThread(thread);
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
async ingestThread(thread) {
|
|
21
|
+
const document = this.toRawDocument(thread);
|
|
22
|
+
const fragments = await this.processor.process(document);
|
|
23
|
+
const embeddings = await this.embeddings.embedFragments(fragments);
|
|
24
|
+
await this.indexer.upsert(fragments, embeddings);
|
|
25
|
+
}
|
|
26
|
+
toRawDocument(thread) {
|
|
27
|
+
const content = composeThreadText(thread);
|
|
28
|
+
return {
|
|
29
|
+
id: thread.id,
|
|
30
|
+
mimeType: "text/plain",
|
|
31
|
+
data: Buffer.from(content, "utf-8"),
|
|
32
|
+
metadata: {
|
|
33
|
+
subject: thread.subject ?? "",
|
|
34
|
+
participants: thread.participants.map((p) => p.email).join(", "),
|
|
35
|
+
updatedAt: thread.updatedAt.toISOString()
|
|
36
|
+
}
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
}
|
|
33
40
|
function composeThreadText(thread) {
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
const header = [
|
|
42
|
+
`Subject: ${thread.subject ?? ""}`,
|
|
43
|
+
`Snippet: ${thread.snippet ?? ""}`
|
|
44
|
+
];
|
|
45
|
+
const messageTexts = thread.messages.map((message) => {
|
|
46
|
+
const parts = [
|
|
47
|
+
`From: ${formatAddress(message.from)}`,
|
|
48
|
+
`To: ${message.to.map(formatAddress).join(", ")}`
|
|
49
|
+
];
|
|
50
|
+
if (message.sentAt) {
|
|
51
|
+
parts.push(`Date: ${message.sentAt.toISOString()}`);
|
|
52
|
+
}
|
|
53
|
+
const body = message.textBody ?? stripHtml(message.htmlBody ?? "");
|
|
54
|
+
return `${parts.join(`
|
|
55
|
+
`)}
|
|
56
|
+
|
|
57
|
+
${body ?? ""}`;
|
|
58
|
+
});
|
|
59
|
+
return [...header, ...messageTexts].join(`
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
`);
|
|
42
64
|
}
|
|
43
65
|
function formatAddress(address) {
|
|
44
|
-
|
|
66
|
+
return address.name ? `${address.name} <${address.email}>` : address.email;
|
|
45
67
|
}
|
|
46
68
|
function stripHtml(html) {
|
|
47
|
-
|
|
69
|
+
return html.replace(/<[^>]+>/g, " ");
|
|
48
70
|
}
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
//# sourceMappingURL=gmail-adapter.js.map
|
|
71
|
+
export {
|
|
72
|
+
GmailIngestionAdapter
|
|
73
|
+
};
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
export * from './document-processor';
|
|
2
|
+
export * from './embedding-service';
|
|
3
|
+
export * from './vector-indexer';
|
|
4
|
+
export * from './gmail-adapter';
|
|
5
|
+
export * from './storage-adapter';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/ingestion/index.ts"],"names":[],"mappings":"AAAA,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,mBAAmB,CAAC"}
|
package/dist/ingestion/index.js
CHANGED
|
@@ -1,7 +1,222 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import {
|
|
4
|
-
import { GmailIngestionAdapter } from "./gmail-adapter.js";
|
|
5
|
-
import { StorageIngestionAdapter } from "./storage-adapter.js";
|
|
1
|
+
// @bun
|
|
2
|
+
// src/ingestion/document-processor.ts
|
|
3
|
+
import { Buffer as Buffer2 } from "buffer";
|
|
6
4
|
|
|
7
|
-
|
|
5
|
+
class DocumentProcessor {
|
|
6
|
+
extractors = new Map;
|
|
7
|
+
constructor() {
|
|
8
|
+
this.registerExtractor("text/plain", this.extractText.bind(this));
|
|
9
|
+
this.registerExtractor("application/json", this.extractJson.bind(this));
|
|
10
|
+
}
|
|
11
|
+
registerExtractor(mimeType, extractor) {
|
|
12
|
+
this.extractors.set(mimeType.toLowerCase(), extractor);
|
|
13
|
+
}
|
|
14
|
+
async process(document) {
|
|
15
|
+
const extractor = this.extractors.get(document.mimeType.toLowerCase()) ?? this.extractors.get("*/*");
|
|
16
|
+
if (!extractor) {
|
|
17
|
+
throw new Error(`No extractor registered for mime type ${document.mimeType}`);
|
|
18
|
+
}
|
|
19
|
+
const fragments = await extractor(document);
|
|
20
|
+
if (fragments.length === 0) {
|
|
21
|
+
return [
|
|
22
|
+
{
|
|
23
|
+
id: `${document.id}:0`,
|
|
24
|
+
documentId: document.id,
|
|
25
|
+
text: "",
|
|
26
|
+
metadata: document.metadata
|
|
27
|
+
}
|
|
28
|
+
];
|
|
29
|
+
}
|
|
30
|
+
return fragments;
|
|
31
|
+
}
|
|
32
|
+
async extractText(document) {
|
|
33
|
+
const text = Buffer2.from(document.data).toString("utf-8");
|
|
34
|
+
return [
|
|
35
|
+
{
|
|
36
|
+
id: `${document.id}:0`,
|
|
37
|
+
documentId: document.id,
|
|
38
|
+
text,
|
|
39
|
+
metadata: document.metadata
|
|
40
|
+
}
|
|
41
|
+
];
|
|
42
|
+
}
|
|
43
|
+
async extractJson(document) {
|
|
44
|
+
const text = Buffer2.from(document.data).toString("utf-8");
|
|
45
|
+
try {
|
|
46
|
+
const json = JSON.parse(text);
|
|
47
|
+
return [
|
|
48
|
+
{
|
|
49
|
+
id: `${document.id}:0`,
|
|
50
|
+
documentId: document.id,
|
|
51
|
+
text: JSON.stringify(json, null, 2),
|
|
52
|
+
metadata: {
|
|
53
|
+
...document.metadata,
|
|
54
|
+
contentType: "application/json"
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
];
|
|
58
|
+
} catch {
|
|
59
|
+
return this.extractText(document);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// src/ingestion/embedding-service.ts
|
|
65
|
+
class EmbeddingService {
|
|
66
|
+
provider;
|
|
67
|
+
batchSize;
|
|
68
|
+
constructor(provider, batchSize = 16) {
|
|
69
|
+
this.provider = provider;
|
|
70
|
+
this.batchSize = batchSize;
|
|
71
|
+
}
|
|
72
|
+
async embedFragments(fragments) {
|
|
73
|
+
const results = [];
|
|
74
|
+
for (let i = 0;i < fragments.length; i += this.batchSize) {
|
|
75
|
+
const slice = fragments.slice(i, i + this.batchSize);
|
|
76
|
+
const documents = slice.map((fragment) => ({
|
|
77
|
+
id: fragment.id,
|
|
78
|
+
text: fragment.text,
|
|
79
|
+
metadata: fragment.metadata
|
|
80
|
+
}));
|
|
81
|
+
const embeddings = await this.provider.embedDocuments(documents);
|
|
82
|
+
results.push(...embeddings);
|
|
83
|
+
}
|
|
84
|
+
return results;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// src/ingestion/vector-indexer.ts
|
|
89
|
+
class VectorIndexer {
|
|
90
|
+
provider;
|
|
91
|
+
config;
|
|
92
|
+
constructor(provider, config) {
|
|
93
|
+
this.provider = provider;
|
|
94
|
+
this.config = config;
|
|
95
|
+
}
|
|
96
|
+
async upsert(fragments, embeddings) {
|
|
97
|
+
const documents = embeddings.map((embedding) => {
|
|
98
|
+
const fragment = fragments.find((f) => f.id === embedding.id);
|
|
99
|
+
return {
|
|
100
|
+
id: embedding.id,
|
|
101
|
+
vector: embedding.vector,
|
|
102
|
+
payload: {
|
|
103
|
+
...this.config.metadata,
|
|
104
|
+
...fragment?.metadata ?? {},
|
|
105
|
+
documentId: fragment?.documentId
|
|
106
|
+
},
|
|
107
|
+
namespace: this.config.namespace
|
|
108
|
+
};
|
|
109
|
+
});
|
|
110
|
+
const request = {
|
|
111
|
+
collection: this.config.collection,
|
|
112
|
+
documents
|
|
113
|
+
};
|
|
114
|
+
await this.provider.upsert(request);
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// src/ingestion/gmail-adapter.ts
|
|
119
|
+
class GmailIngestionAdapter {
|
|
120
|
+
gmail;
|
|
121
|
+
processor;
|
|
122
|
+
embeddings;
|
|
123
|
+
indexer;
|
|
124
|
+
constructor(gmail, processor, embeddings, indexer) {
|
|
125
|
+
this.gmail = gmail;
|
|
126
|
+
this.processor = processor;
|
|
127
|
+
this.embeddings = embeddings;
|
|
128
|
+
this.indexer = indexer;
|
|
129
|
+
}
|
|
130
|
+
async syncThreads(query) {
|
|
131
|
+
const threads = await this.gmail.listThreads(query);
|
|
132
|
+
for (const thread of threads) {
|
|
133
|
+
await this.ingestThread(thread);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
async ingestThread(thread) {
|
|
137
|
+
const document = this.toRawDocument(thread);
|
|
138
|
+
const fragments = await this.processor.process(document);
|
|
139
|
+
const embeddings = await this.embeddings.embedFragments(fragments);
|
|
140
|
+
await this.indexer.upsert(fragments, embeddings);
|
|
141
|
+
}
|
|
142
|
+
toRawDocument(thread) {
|
|
143
|
+
const content = composeThreadText(thread);
|
|
144
|
+
return {
|
|
145
|
+
id: thread.id,
|
|
146
|
+
mimeType: "text/plain",
|
|
147
|
+
data: Buffer.from(content, "utf-8"),
|
|
148
|
+
metadata: {
|
|
149
|
+
subject: thread.subject ?? "",
|
|
150
|
+
participants: thread.participants.map((p) => p.email).join(", "),
|
|
151
|
+
updatedAt: thread.updatedAt.toISOString()
|
|
152
|
+
}
|
|
153
|
+
};
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
function composeThreadText(thread) {
|
|
157
|
+
const header = [
|
|
158
|
+
`Subject: ${thread.subject ?? ""}`,
|
|
159
|
+
`Snippet: ${thread.snippet ?? ""}`
|
|
160
|
+
];
|
|
161
|
+
const messageTexts = thread.messages.map((message) => {
|
|
162
|
+
const parts = [
|
|
163
|
+
`From: ${formatAddress(message.from)}`,
|
|
164
|
+
`To: ${message.to.map(formatAddress).join(", ")}`
|
|
165
|
+
];
|
|
166
|
+
if (message.sentAt) {
|
|
167
|
+
parts.push(`Date: ${message.sentAt.toISOString()}`);
|
|
168
|
+
}
|
|
169
|
+
const body = message.textBody ?? stripHtml(message.htmlBody ?? "");
|
|
170
|
+
return `${parts.join(`
|
|
171
|
+
`)}
|
|
172
|
+
|
|
173
|
+
${body ?? ""}`;
|
|
174
|
+
});
|
|
175
|
+
return [...header, ...messageTexts].join(`
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
`);
|
|
180
|
+
}
|
|
181
|
+
function formatAddress(address) {
|
|
182
|
+
return address.name ? `${address.name} <${address.email}>` : address.email;
|
|
183
|
+
}
|
|
184
|
+
function stripHtml(html) {
|
|
185
|
+
return html.replace(/<[^>]+>/g, " ");
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// src/ingestion/storage-adapter.ts
|
|
189
|
+
class StorageIngestionAdapter {
|
|
190
|
+
processor;
|
|
191
|
+
embeddings;
|
|
192
|
+
indexer;
|
|
193
|
+
constructor(processor, embeddings, indexer) {
|
|
194
|
+
this.processor = processor;
|
|
195
|
+
this.embeddings = embeddings;
|
|
196
|
+
this.indexer = indexer;
|
|
197
|
+
}
|
|
198
|
+
async ingestObject(object) {
|
|
199
|
+
if (!("data" in object) || !object.data) {
|
|
200
|
+
throw new Error("Storage ingestion requires object data");
|
|
201
|
+
}
|
|
202
|
+
const raw = {
|
|
203
|
+
id: object.key,
|
|
204
|
+
mimeType: object.contentType ?? "application/octet-stream",
|
|
205
|
+
data: object.data,
|
|
206
|
+
metadata: {
|
|
207
|
+
bucket: object.bucket,
|
|
208
|
+
checksum: object.checksum ?? ""
|
|
209
|
+
}
|
|
210
|
+
};
|
|
211
|
+
const fragments = await this.processor.process(raw);
|
|
212
|
+
const embeddings = await this.embeddings.embedFragments(fragments);
|
|
213
|
+
await this.indexer.upsert(fragments, embeddings);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
export {
|
|
217
|
+
VectorIndexer,
|
|
218
|
+
StorageIngestionAdapter,
|
|
219
|
+
GmailIngestionAdapter,
|
|
220
|
+
EmbeddingService,
|
|
221
|
+
DocumentProcessor
|
|
222
|
+
};
|
|
@@ -1,16 +1,12 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
constructor(processor: DocumentProcessor, embeddings: EmbeddingService, indexer: VectorIndexer);
|
|
12
|
-
ingestObject(object: GetObjectResult): Promise<void>;
|
|
1
|
+
import type { GetObjectResult } from '@contractspec/lib.contracts';
|
|
2
|
+
import type { DocumentProcessor } from './document-processor';
|
|
3
|
+
import type { EmbeddingService } from './embedding-service';
|
|
4
|
+
import type { VectorIndexer } from './vector-indexer';
|
|
5
|
+
export declare class StorageIngestionAdapter {
|
|
6
|
+
private readonly processor;
|
|
7
|
+
private readonly embeddings;
|
|
8
|
+
private readonly indexer;
|
|
9
|
+
constructor(processor: DocumentProcessor, embeddings: EmbeddingService, indexer: VectorIndexer);
|
|
10
|
+
ingestObject(object: GetObjectResult): Promise<void>;
|
|
13
11
|
}
|
|
14
|
-
//#endregion
|
|
15
|
-
export { StorageIngestionAdapter };
|
|
16
12
|
//# sourceMappingURL=storage-adapter.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"storage-adapter.d.ts","
|
|
1
|
+
{"version":3,"file":"storage-adapter.d.ts","sourceRoot":"","sources":["../../src/ingestion/storage-adapter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AACnE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAEtD,qBAAa,uBAAuB;IAEhC,OAAO,CAAC,QAAQ,CAAC,SAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,UAAU;IAC3B,OAAO,CAAC,QAAQ,CAAC,OAAO;gBAFP,SAAS,EAAE,iBAAiB,EAC5B,UAAU,EAAE,gBAAgB,EAC5B,OAAO,EAAE,aAAa;IAGnC,YAAY,CAAC,MAAM,EAAE,eAAe,GAAG,OAAO,CAAC,IAAI,CAAC;CAmB3D"}
|
|
@@ -1,27 +1,32 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
1
|
+
// @bun
|
|
2
|
+
// src/ingestion/storage-adapter.ts
|
|
3
|
+
class StorageIngestionAdapter {
|
|
4
|
+
processor;
|
|
5
|
+
embeddings;
|
|
6
|
+
indexer;
|
|
7
|
+
constructor(processor, embeddings, indexer) {
|
|
8
|
+
this.processor = processor;
|
|
9
|
+
this.embeddings = embeddings;
|
|
10
|
+
this.indexer = indexer;
|
|
11
|
+
}
|
|
12
|
+
async ingestObject(object) {
|
|
13
|
+
if (!("data" in object) || !object.data) {
|
|
14
|
+
throw new Error("Storage ingestion requires object data");
|
|
15
|
+
}
|
|
16
|
+
const raw = {
|
|
17
|
+
id: object.key,
|
|
18
|
+
mimeType: object.contentType ?? "application/octet-stream",
|
|
19
|
+
data: object.data,
|
|
20
|
+
metadata: {
|
|
21
|
+
bucket: object.bucket,
|
|
22
|
+
checksum: object.checksum ?? ""
|
|
23
|
+
}
|
|
24
|
+
};
|
|
25
|
+
const fragments = await this.processor.process(raw);
|
|
26
|
+
const embeddings = await this.embeddings.embedFragments(fragments);
|
|
27
|
+
await this.indexer.upsert(fragments, embeddings);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
export {
|
|
31
|
+
StorageIngestionAdapter
|
|
23
32
|
};
|
|
24
|
-
|
|
25
|
-
//#endregion
|
|
26
|
-
export { StorageIngestionAdapter };
|
|
27
|
-
//# sourceMappingURL=storage-adapter.js.map
|
|
@@ -1,18 +1,14 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
namespace?: string;
|
|
8
|
-
metadata?: Record<string, string>;
|
|
1
|
+
import type { VectorStoreProvider, EmbeddingResult } from '@contractspec/lib.contracts';
|
|
2
|
+
import type { DocumentFragment } from './document-processor';
|
|
3
|
+
export interface VectorIndexConfig {
|
|
4
|
+
collection: string;
|
|
5
|
+
namespace?: string;
|
|
6
|
+
metadata?: Record<string, string>;
|
|
9
7
|
}
|
|
10
|
-
declare class VectorIndexer {
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
8
|
+
export declare class VectorIndexer {
|
|
9
|
+
private readonly provider;
|
|
10
|
+
private readonly config;
|
|
11
|
+
constructor(provider: VectorStoreProvider, config: VectorIndexConfig);
|
|
12
|
+
upsert(fragments: DocumentFragment[], embeddings: EmbeddingResult[]): Promise<void>;
|
|
15
13
|
}
|
|
16
|
-
//#endregion
|
|
17
|
-
export { VectorIndexConfig, VectorIndexer };
|
|
18
14
|
//# sourceMappingURL=vector-indexer.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vector-indexer.d.ts","
|
|
1
|
+
{"version":3,"file":"vector-indexer.d.ts","sourceRoot":"","sources":["../../src/ingestion/vector-indexer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,mBAAmB,EAEnB,eAAe,EAChB,MAAM,6BAA6B,CAAC;AACrC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAE7D,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACnC;AAED,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAsB;IAC/C,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAoB;gBAE/B,QAAQ,EAAE,mBAAmB,EAAE,MAAM,EAAE,iBAAiB;IAK9D,MAAM,CACV,SAAS,EAAE,gBAAgB,EAAE,EAC7B,UAAU,EAAE,eAAe,EAAE,GAC5B,OAAO,CAAC,IAAI,CAAC;CAsBjB"}
|