@contractspec/lib.knowledge 1.57.0 → 1.59.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/access/guard.d.ts +13 -17
- package/dist/access/guard.d.ts.map +1 -1
- package/dist/access/guard.js +60 -49
- package/dist/access/index.d.ts +2 -2
- package/dist/access/index.d.ts.map +1 -0
- package/dist/access/index.js +60 -2
- package/dist/index.d.ts +6 -12
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +455 -12
- package/dist/ingestion/document-processor.d.ts +18 -20
- package/dist/ingestion/document-processor.d.ts.map +1 -1
- package/dist/ingestion/document-processor.js +63 -53
- package/dist/ingestion/embedding-service.d.ts +7 -11
- package/dist/ingestion/embedding-service.d.ts.map +1 -1
- package/dist/ingestion/embedding-service.js +26 -25
- package/dist/ingestion/gmail-adapter.d.ts +13 -17
- package/dist/ingestion/gmail-adapter.d.ts.map +1 -1
- package/dist/ingestion/gmail-adapter.js +67 -46
- package/dist/ingestion/index.d.ts +6 -6
- package/dist/ingestion/index.d.ts.map +1 -0
- package/dist/ingestion/index.js +221 -6
- package/dist/ingestion/storage-adapter.d.ts +10 -14
- package/dist/ingestion/storage-adapter.d.ts.map +1 -1
- package/dist/ingestion/storage-adapter.js +31 -26
- package/dist/ingestion/vector-indexer.d.ts +11 -15
- package/dist/ingestion/vector-indexer.d.ts.map +1 -1
- package/dist/ingestion/vector-indexer.js +32 -32
- package/dist/node/access/guard.js +60 -0
- package/dist/node/access/index.js +60 -0
- package/dist/node/index.js +454 -0
- package/dist/node/ingestion/document-processor.js +64 -0
- package/dist/node/ingestion/embedding-service.js +26 -0
- package/dist/node/ingestion/gmail-adapter.js +72 -0
- package/dist/node/ingestion/index.js +221 -0
- package/dist/node/ingestion/storage-adapter.js +31 -0
- package/dist/node/ingestion/vector-indexer.js +32 -0
- package/dist/node/query/index.js +79 -0
- package/dist/node/query/service.js +79 -0
- package/dist/node/retriever/index.js +100 -0
- package/dist/node/retriever/interface.js +0 -0
- package/dist/node/retriever/static-retriever.js +43 -0
- package/dist/node/retriever/vector-retriever.js +58 -0
- package/dist/node/types.js +0 -0
- package/dist/query/index.d.ts +2 -2
- package/dist/query/index.d.ts.map +1 -0
- package/dist/query/index.js +79 -2
- package/dist/query/service.d.ts +20 -24
- package/dist/query/service.d.ts.map +1 -1
- package/dist/query/service.js +76 -62
- package/dist/retriever/index.d.ts +4 -4
- package/dist/retriever/index.d.ts.map +1 -0
- package/dist/retriever/index.js +100 -3
- package/dist/retriever/interface.d.ts +38 -42
- package/dist/retriever/interface.d.ts.map +1 -1
- package/dist/retriever/interface.js +1 -0
- package/dist/retriever/static-retriever.d.ts +13 -17
- package/dist/retriever/static-retriever.d.ts.map +1 -1
- package/dist/retriever/static-retriever.js +42 -46
- package/dist/retriever/vector-retriever.d.ts +23 -27
- package/dist/retriever/vector-retriever.d.ts.map +1 -1
- package/dist/retriever/vector-retriever.js +57 -59
- package/dist/types.d.ts +34 -38
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -0
- package/package.json +152 -45
- package/dist/access/guard.js.map +0 -1
- package/dist/ingestion/document-processor.js.map +0 -1
- package/dist/ingestion/embedding-service.js.map +0 -1
- package/dist/ingestion/gmail-adapter.js.map +0 -1
- package/dist/ingestion/storage-adapter.js.map +0 -1
- package/dist/ingestion/vector-indexer.js.map +0 -1
- package/dist/query/service.js.map +0 -1
- package/dist/retriever/static-retriever.js.map +0 -1
- package/dist/retriever/vector-retriever.js.map +0 -1
|
@@ -1,33 +1,33 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
1
|
+
// @bun
|
|
2
|
+
// src/ingestion/vector-indexer.ts
|
|
3
|
+
/**
 * Pairs text fragments with their embeddings and writes them to a vector
 * store collection through `provider.upsert(request)`.
 */
class VectorIndexer {
  provider;
  config;

  /**
   * @param {object} provider - Object exposing an async `upsert(request)` method.
   * @param {object} config - Reads `collection`, `namespace` and `metadata`.
   */
  constructor(provider, config) {
    this.provider = provider;
    this.config = config;
  }

  /**
   * Builds one vector document per embedding and upserts them in a single request.
   *
   * Payload precedence (later wins): `config.metadata`, then the matching
   * fragment's `metadata`, then `documentId`. An embedding without a matching
   * fragment is still written, with `documentId` left undefined.
   *
   * @param {Array<object>} fragments - Items carrying `id`, `documentId`, `metadata`.
   * @param {Array<object>} embeddings - Items carrying `id` and `vector`.
   * @returns {Promise<void>}
   */
  async upsert(fragments, embeddings) {
    // Index fragments once so pairing is linear instead of rescanning the
    // fragment list for every embedding. The first fragment with a given id
    // wins, which is what a linear `find` would have returned.
    const fragmentsById = new Map();
    for (const fragment of fragments) {
      if (!fragmentsById.has(fragment.id)) {
        fragmentsById.set(fragment.id, fragment);
      }
    }

    const documents = embeddings.map((embedding) => {
      const fragment = fragmentsById.get(embedding.id);
      return {
        id: embedding.id,
        vector: embedding.vector,
        payload: {
          ...this.config.metadata,
          ...(fragment?.metadata ?? {}),
          documentId: fragment?.documentId,
        },
        namespace: this.config.namespace,
      };
    });

    await this.provider.upsert({
      collection: this.config.collection,
      documents,
    });
  }
}
|
|
31
|
+
export {
|
|
32
|
+
VectorIndexer
|
|
29
33
|
};
|
|
30
|
-
|
|
31
|
-
//#endregion
|
|
32
|
-
export { VectorIndexer };
|
|
33
|
-
//# sourceMappingURL=vector-indexer.js.map
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
// src/access/guard.ts
|
|
2
|
+
var DEFAULT_DISALLOWED_WRITE = ["external", "ephemeral"];
|
|
3
|
+
|
|
4
|
+
/**
 * Decides whether an operation may touch a resolved knowledge space.
 *
 * Options read by the constructor:
 * - `disallowWriteCategories`: space categories that reject writes
 *   (falls back to DEFAULT_DISALLOWED_WRITE).
 * - `requireWorkflowBinding`: enforce `binding.scope.workflows` (default true).
 * - `requireAgentBinding`: enforce `binding.scope.agents` (default false).
 */
class KnowledgeAccessGuard {
  disallowedWrite;
  requireWorkflowBinding;
  requireAgentBinding;

  constructor(options = {}) {
    this.disallowedWrite = new Set(options.disallowWriteCategories ?? DEFAULT_DISALLOWED_WRITE);
    this.requireWorkflowBinding = options.requireWorkflowBinding ?? true;
    this.requireAgentBinding = options.requireAgentBinding ?? false;
  }

  /**
   * Runs the checks in order and returns the first denial, otherwise an allow.
   *
   * @param {object} spaceBinding - Carries `binding` and `space` (with `space.meta`).
   * @param {object} context - Reads `operation`, `workflowName`, `agentName`.
   * @param {object} appConfig - Resolved app config; its `knowledge` entries are searched.
   * @returns {{allowed: boolean, reason?: string, severity?: string}}
   */
  checkAccess(spaceBinding, context, appConfig) {
    const { binding, space } = spaceBinding;

    // A binding is treated as required unless it explicitly says `required: false`.
    if (binding.required !== false && !this.isSpaceBound(spaceBinding, appConfig)) {
      return {
        allowed: false,
        reason: `Knowledge space "${space.meta.key}" is not bound in the resolved app config.`,
      };
    }

    if (context.operation === "write" && this.disallowedWrite.has(space.meta.category)) {
      return {
        allowed: false,
        reason: `Knowledge space "${space.meta.key}" is category "${space.meta.category}" and is read-only.`,
      };
    }

    // Scope lists only restrict when both the caller name and the list are present.
    if (this.requireWorkflowBinding && context.workflowName) {
      const allowedWorkflows = binding.scope?.workflows;
      if (allowedWorkflows && !allowedWorkflows.includes(context.workflowName)) {
        return {
          allowed: false,
          reason: `Workflow "${context.workflowName}" is not authorized to access knowledge space "${space.meta.key}".`,
        };
      }
    }

    if (this.requireAgentBinding && context.agentName) {
      const allowedAgents = binding.scope?.agents;
      if (allowedAgents && !allowedAgents.includes(context.agentName)) {
        return {
          allowed: false,
          reason: `Agent "${context.agentName}" is not authorized to access knowledge space "${space.meta.key}".`,
        };
      }
    }

    if (space.meta.category === "ephemeral") {
      return {
        allowed: true,
        severity: "warning",
        reason: `Knowledge space "${space.meta.key}" is ephemeral; results may be transient.`,
      };
    }

    return { allowed: true };
  }

  /**
   * True when the app config lists a space with the same key and, if the
   * resolved space declares a version, the same version.
   *
   * A missing app config or missing `knowledge` list counts as "nothing
   * bound", so the guard denies access instead of throwing a TypeError.
   */
  isSpaceBound(resolved, appConfig) {
    const entries = appConfig?.knowledge ?? [];
    const { key, version } = resolved.space.meta;
    return entries.some(
      (entry) =>
        entry.space.meta.key === key &&
        (version == null || entry.space.meta.version === version)
    );
  }
}
|
|
58
|
+
export {
|
|
59
|
+
KnowledgeAccessGuard
|
|
60
|
+
};
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
// src/access/guard.ts
|
|
2
|
+
var DEFAULT_DISALLOWED_WRITE = ["external", "ephemeral"];
|
|
3
|
+
|
|
4
|
+
/**
 * Decides whether an operation may touch a resolved knowledge space.
 *
 * Options read by the constructor:
 * - `disallowWriteCategories`: space categories that reject writes
 *   (falls back to DEFAULT_DISALLOWED_WRITE).
 * - `requireWorkflowBinding`: enforce `binding.scope.workflows` (default true).
 * - `requireAgentBinding`: enforce `binding.scope.agents` (default false).
 */
class KnowledgeAccessGuard {
  disallowedWrite;
  requireWorkflowBinding;
  requireAgentBinding;

  constructor(options = {}) {
    this.disallowedWrite = new Set(options.disallowWriteCategories ?? DEFAULT_DISALLOWED_WRITE);
    this.requireWorkflowBinding = options.requireWorkflowBinding ?? true;
    this.requireAgentBinding = options.requireAgentBinding ?? false;
  }

  /**
   * Runs the checks in order and returns the first denial, otherwise an allow.
   *
   * @param {object} spaceBinding - Carries `binding` and `space` (with `space.meta`).
   * @param {object} context - Reads `operation`, `workflowName`, `agentName`.
   * @param {object} appConfig - Resolved app config; its `knowledge` entries are searched.
   * @returns {{allowed: boolean, reason?: string, severity?: string}}
   */
  checkAccess(spaceBinding, context, appConfig) {
    const { binding, space } = spaceBinding;

    // A binding is treated as required unless it explicitly says `required: false`.
    if (binding.required !== false && !this.isSpaceBound(spaceBinding, appConfig)) {
      return {
        allowed: false,
        reason: `Knowledge space "${space.meta.key}" is not bound in the resolved app config.`,
      };
    }

    if (context.operation === "write" && this.disallowedWrite.has(space.meta.category)) {
      return {
        allowed: false,
        reason: `Knowledge space "${space.meta.key}" is category "${space.meta.category}" and is read-only.`,
      };
    }

    // Scope lists only restrict when both the caller name and the list are present.
    if (this.requireWorkflowBinding && context.workflowName) {
      const allowedWorkflows = binding.scope?.workflows;
      if (allowedWorkflows && !allowedWorkflows.includes(context.workflowName)) {
        return {
          allowed: false,
          reason: `Workflow "${context.workflowName}" is not authorized to access knowledge space "${space.meta.key}".`,
        };
      }
    }

    if (this.requireAgentBinding && context.agentName) {
      const allowedAgents = binding.scope?.agents;
      if (allowedAgents && !allowedAgents.includes(context.agentName)) {
        return {
          allowed: false,
          reason: `Agent "${context.agentName}" is not authorized to access knowledge space "${space.meta.key}".`,
        };
      }
    }

    if (space.meta.category === "ephemeral") {
      return {
        allowed: true,
        severity: "warning",
        reason: `Knowledge space "${space.meta.key}" is ephemeral; results may be transient.`,
      };
    }

    return { allowed: true };
  }

  /**
   * True when the app config lists a space with the same key and, if the
   * resolved space declares a version, the same version.
   *
   * A missing app config or missing `knowledge` list counts as "nothing
   * bound", so the guard denies access instead of throwing a TypeError.
   */
  isSpaceBound(resolved, appConfig) {
    const entries = appConfig?.knowledge ?? [];
    const { key, version } = resolved.space.meta;
    return entries.some(
      (entry) =>
        entry.space.meta.key === key &&
        (version == null || entry.space.meta.version === version)
    );
  }
}
|
|
58
|
+
export {
|
|
59
|
+
KnowledgeAccessGuard
|
|
60
|
+
};
|
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
// src/access/guard.ts
|
|
2
|
+
var DEFAULT_DISALLOWED_WRITE = ["external", "ephemeral"];
|
|
3
|
+
|
|
4
|
+
/**
 * Decides whether an operation may touch a resolved knowledge space.
 *
 * Options read by the constructor:
 * - `disallowWriteCategories`: space categories that reject writes
 *   (falls back to DEFAULT_DISALLOWED_WRITE).
 * - `requireWorkflowBinding`: enforce `binding.scope.workflows` (default true).
 * - `requireAgentBinding`: enforce `binding.scope.agents` (default false).
 */
class KnowledgeAccessGuard {
  disallowedWrite;
  requireWorkflowBinding;
  requireAgentBinding;

  constructor(options = {}) {
    this.disallowedWrite = new Set(options.disallowWriteCategories ?? DEFAULT_DISALLOWED_WRITE);
    this.requireWorkflowBinding = options.requireWorkflowBinding ?? true;
    this.requireAgentBinding = options.requireAgentBinding ?? false;
  }

  /**
   * Runs the checks in order and returns the first denial, otherwise an allow.
   *
   * @param {object} spaceBinding - Carries `binding` and `space` (with `space.meta`).
   * @param {object} context - Reads `operation`, `workflowName`, `agentName`.
   * @param {object} appConfig - Resolved app config; its `knowledge` entries are searched.
   * @returns {{allowed: boolean, reason?: string, severity?: string}}
   */
  checkAccess(spaceBinding, context, appConfig) {
    const { binding, space } = spaceBinding;

    // A binding is treated as required unless it explicitly says `required: false`.
    if (binding.required !== false && !this.isSpaceBound(spaceBinding, appConfig)) {
      return {
        allowed: false,
        reason: `Knowledge space "${space.meta.key}" is not bound in the resolved app config.`,
      };
    }

    if (context.operation === "write" && this.disallowedWrite.has(space.meta.category)) {
      return {
        allowed: false,
        reason: `Knowledge space "${space.meta.key}" is category "${space.meta.category}" and is read-only.`,
      };
    }

    // Scope lists only restrict when both the caller name and the list are present.
    if (this.requireWorkflowBinding && context.workflowName) {
      const allowedWorkflows = binding.scope?.workflows;
      if (allowedWorkflows && !allowedWorkflows.includes(context.workflowName)) {
        return {
          allowed: false,
          reason: `Workflow "${context.workflowName}" is not authorized to access knowledge space "${space.meta.key}".`,
        };
      }
    }

    if (this.requireAgentBinding && context.agentName) {
      const allowedAgents = binding.scope?.agents;
      if (allowedAgents && !allowedAgents.includes(context.agentName)) {
        return {
          allowed: false,
          reason: `Agent "${context.agentName}" is not authorized to access knowledge space "${space.meta.key}".`,
        };
      }
    }

    if (space.meta.category === "ephemeral") {
      return {
        allowed: true,
        severity: "warning",
        reason: `Knowledge space "${space.meta.key}" is ephemeral; results may be transient.`,
      };
    }

    return { allowed: true };
  }

  /**
   * True when the app config lists a space with the same key and, if the
   * resolved space declares a version, the same version.
   *
   * A missing app config or missing `knowledge` list counts as "nothing
   * bound", so the guard denies access instead of throwing a TypeError.
   */
  isSpaceBound(resolved, appConfig) {
    const entries = appConfig?.knowledge ?? [];
    const { key, version } = resolved.space.meta;
    return entries.some(
      (entry) =>
        entry.space.meta.key === key &&
        (version == null || entry.space.meta.version === version)
    );
  }
}
|
|
58
|
+
// src/retriever/static-retriever.ts
|
|
59
|
+
/**
 * Retriever backed by in-memory text, keyed by knowledge space.
 *
 * `config.content` may be a Map or a plain object of spaceKey -> text;
 * a plain object is converted to a Map on construction.
 */
class StaticRetriever {
  content;

  constructor(config) {
    this.content =
      config.content instanceof Map
        ? config.content
        : new Map(Object.entries(config.content));
  }

  /**
   * Case-insensitive substring search over the non-blank lines of one space.
   *
   * @param {string} query - Text to look for inside each line.
   * @param {object} options - Reads `spaceKey` and optional `topK` (default 5).
   * @returns {Promise<Array<object>>} Matching lines in source order, each with
   *   `score: 1`; empty when the space has no content.
   */
  async retrieve(query, options) {
    const content = this.content.get(options.spaceKey);
    if (!content) {
      return [];
    }

    const needle = query.toLowerCase();
    // Accept both LF and CRLF line endings; splitting on "\n" alone would
    // leave a stray "\r" at the end of every returned line.
    return content
      .split(/\r?\n/)
      .filter((line) => line.trim() && line.toLowerCase().includes(needle))
      .slice(0, options.topK ?? 5)
      .map((line) => ({
        content: line,
        source: options.spaceKey,
        score: 1,
        metadata: { type: "static" },
      }));
  }

  /** Returns the full text stored for `spaceKey`, or null when absent. */
  async getStatic(spaceKey) {
    return this.content.get(spaceKey) ?? null;
  }

  /** True when content is registered under `spaceKey`. */
  supportsSpace(spaceKey) {
    return this.content.has(spaceKey);
  }

  /** Lists every registered space key. */
  listSpaces() {
    return [...this.content.keys()];
  }
}
|
|
94
|
+
/**
 * Convenience factory: wraps `content` in a config object and returns a
 * new StaticRetriever built from it.
 */
function createStaticRetriever(content) {
  const config = { content };
  return new StaticRetriever(config);
}
|
|
97
|
+
|
|
98
|
+
// src/retriever/vector-retriever.ts
|
|
99
|
+
/**
 * Retriever that embeds the query and searches a vector store collection
 * mapped from the requested knowledge space.
 *
 * `config` supplies `embeddings`, `vectorStore`, `spaceCollections`, and the
 * optional `staticContent`, `defaultTopK` and `defaultMinScore`.
 */
class VectorRetriever {
  config;
  spaceCollections;
  staticContent;

  constructor(config) {
    this.config = config;

    // Both lookups accept either a Map or a plain object.
    const { spaceCollections, staticContent } = config;
    this.spaceCollections =
      spaceCollections instanceof Map
        ? spaceCollections
        : new Map(Object.entries(spaceCollections));

    if (!staticContent) {
      this.staticContent = new Map();
    } else if (staticContent instanceof Map) {
      this.staticContent = staticContent;
    } else {
      this.staticContent = new Map(Object.entries(staticContent));
    }
  }

  /**
   * Searches the collection bound to `options.spaceKey`.
   *
   * @param {string} query - Text passed to `embeddings.embedQuery`.
   * @param {object} options - Reads `spaceKey`, `topK`, `minScore`, `tenantId`
   *   (sent as the search namespace) and `filter`.
   * @returns {Promise<Array<object>>} Hits scoring at least the minimum score;
   *   empty when the space has no collection.
   */
  async retrieve(query, options) {
    const collection = this.spaceCollections.get(options.spaceKey);
    if (!collection) {
      return [];
    }

    const embedding = await this.config.embeddings.embedQuery(query);
    const hits = await this.config.vectorStore.search({
      collection,
      vector: embedding.vector,
      topK: options.topK ?? this.config.defaultTopK ?? 5,
      namespace: options.tenantId,
      filter: options.filter,
    });

    const threshold = options.minScore ?? this.config.defaultMinScore ?? 0;
    return hits
      .filter(({ score }) => score >= threshold)
      .map(({ id, score, payload }) => ({
        content: this.extractContent(payload),
        source: id,
        score,
        metadata: payload,
      }));
  }

  /** Returns the static text stored for `spaceKey`, or null when absent. */
  async getStatic(spaceKey) {
    return this.staticContent.get(spaceKey) ?? null;
  }

  /** True when a collection is mapped for `spaceKey`. */
  supportsSpace(spaceKey) {
    return this.spaceCollections.has(spaceKey);
  }

  /** Lists every space key that has a collection. */
  listSpaces() {
    return [...this.spaceCollections.keys()];
  }

  /**
   * Picks display text from a payload: string `text`, else string `content`,
   * else the payload serialized as JSON. A missing payload yields "".
   */
  extractContent(payload) {
    if (!payload) {
      return "";
    }
    for (const field of ["text", "content"]) {
      const value = payload[field];
      if (typeof value === "string") {
        return value;
      }
    }
    return JSON.stringify(payload);
  }
}
|
|
149
|
+
/** Convenience factory: returns a new VectorRetriever built from `config`. */
function createVectorRetriever(config) {
  const retriever = new VectorRetriever(config);
  return retriever;
}
|
|
152
|
+
// src/query/service.ts
|
|
153
|
+
/**
 * Answers a question by embedding it, searching a vector store collection,
 * and sending the question plus retrieved context to a chat model.
 *
 * `config` supplies `collection` and the optional `namespace`, `topK`
 * (default 5) and `systemPrompt`.
 */
class KnowledgeQueryService {
  embeddings;
  vectorStore;
  llm;
  config;

  constructor(embeddings, vectorStore, llm, config) {
    this.embeddings = embeddings;
    this.vectorStore = vectorStore;
    this.llm = llm;
    this.config = config;
  }

  /**
   * @param {string} question - Question to embed and ask.
   * @returns {Promise<object>} `answer` (concatenated text parts of the reply),
   *   `references` (each search hit plus its extracted `text`) and `usage`.
   */
  async query(question) {
    const embedding = await this.embeddings.embedQuery(question);
    const searchRequest = {
      collection: this.config.collection,
      vector: embedding.vector,
      topK: this.config.topK ?? 5,
      namespace: this.config.namespace,
      filter: undefined,
    };
    const results = await this.vectorStore.search(searchRequest);

    const messages = this.buildMessages(question, buildContext(results));
    const response = await this.llm.chat(messages);

    // Parts without a `text` property contribute nothing to the answer.
    const answer = response.message.content
      .map((part) => ("text" in part ? part.text : ""))
      .join("");
    const references = results.map((result) => ({
      ...result,
      text: extractText(result),
    }));

    return { answer, references, usage: response.usage };
  }

  /**
   * Builds the two-message chat prompt: a system message (configured prompt
   * or the built-in default) and a user message holding question and context.
   */
  buildMessages(question, context) {
    const systemPrompt =
      this.config.systemPrompt ??
      "You are a knowledge assistant that answers questions using the provided context. Cite relevant sources if possible.";

    const systemMessage = {
      role: "system",
      content: [{ type: "text", text: systemPrompt }],
    };
    const userMessage = {
      role: "user",
      content: [
        {
          type: "text",
          text: `Question:\n${question}\n\nContext:\n${context}`,
        },
      ],
    };
    return [systemMessage, userMessage];
  }
}
|
|
208
|
+
/**
 * Renders search results as numbered "Source N (score: x.xxx):" sections
 * separated by blank lines, or a fixed sentence when there are no results.
 */
function buildContext(results) {
  if (results.length === 0) {
    return "No relevant documents found.";
  }

  const sections = results.map((result, index) => {
    const body = extractText(result);
    const heading = `Source ${index + 1} (score: ${result.score.toFixed(3)}):`;
    return `${heading}\n${body}`;
  });
  return sections.join("\n\n");
}
|
|
220
|
+
/**
 * Picks display text from a search result's payload: string `text`, else
 * string `content`, else the payload serialized as JSON. A missing payload
 * is treated as an empty object.
 */
function extractText(result) {
  const payload = result.payload ?? {};
  for (const field of ["text", "content"]) {
    const value = payload[field];
    if (typeof value === "string") {
      return value;
    }
  }
  return JSON.stringify(payload);
}
|
|
228
|
+
// src/ingestion/document-processor.ts
|
|
229
|
+
import { Buffer as Buffer2 } from "node:buffer";
|
|
230
|
+
|
|
231
|
+
/**
 * Turns raw documents into text fragments using extractors registered per
 * MIME type. Extractors for "text/plain" and "application/json" are
 * registered on construction.
 */
class DocumentProcessor {
  extractors = new Map();

  constructor() {
    this.registerExtractor("text/plain", this.extractText.bind(this));
    this.registerExtractor("application/json", this.extractJson.bind(this));
  }

  /**
   * Registers (or replaces) the extractor for a MIME type. Keys are stored
   * lower-cased; "*&#47;*" acts as the catch-all.
   */
  registerExtractor(mimeType, extractor) {
    this.extractors.set(mimeType.toLowerCase(), extractor);
  }

  /**
   * Runs the matching extractor over `document`.
   *
   * @param {object} document - Reads `id`, `mimeType`, `data`, `metadata`.
   * @returns {Promise<Array<object>>} The extractor's fragments, or a single
   *   empty-text fragment when the extractor produced none.
   * @throws {Error} When no extractor matches the MIME type.
   */
  async process(document) {
    const extractor = this.#resolveExtractor(document.mimeType);
    if (!extractor) {
      throw new Error(`No extractor registered for mime type ${document.mimeType}`);
    }

    const fragments = await extractor(document);
    if (fragments.length === 0) {
      return [
        {
          id: `${document.id}:0`,
          documentId: document.id,
          text: "",
          metadata: document.metadata,
        },
      ];
    }
    return fragments;
  }

  /**
   * Looks up an extractor: exact (lower-cased) type first, then the type
   * with any ";parameter" suffix removed, then the "*&#47;*" catch-all.
   * The middle step lets "text/plain; charset=utf-8" reach "text/plain".
   */
  #resolveExtractor(mimeType) {
    const exact = mimeType.toLowerCase();
    const base = exact.split(";", 1)[0].trim();
    return (
      this.extractors.get(exact) ??
      this.extractors.get(base) ??
      this.extractors.get("*/*")
    );
  }

  /** Decodes the document bytes as UTF-8 into a single fragment. */
  async extractText(document) {
    const text = Buffer2.from(document.data).toString("utf-8");
    return [
      {
        id: `${document.id}:0`,
        documentId: document.id,
        text,
        metadata: document.metadata,
      },
    ];
  }

  /**
   * Parses the document as JSON and emits it pretty-printed, tagging the
   * fragment with `contentType: "application/json"`.
   */
  async extractJson(document) {
    const text = Buffer2.from(document.data).toString("utf-8");
    try {
      const json = JSON.parse(text);
      return [
        {
          id: `${document.id}:0`,
          documentId: document.id,
          text: JSON.stringify(json, null, 2),
          metadata: {
            ...document.metadata,
            contentType: "application/json",
          },
        },
      ];
    } catch {
      // Deliberate best effort: content that is not valid JSON is still
      // ingested, as plain text.
      return this.extractText(document);
    }
  }
}
|
|
289
|
+
|
|
290
|
+
// src/ingestion/embedding-service.ts
|
|
291
|
+
/**
 * Embeds fragments in fixed-size batches through
 * `provider.embedDocuments(documents)`.
 */
class EmbeddingService {
  provider;
  batchSize;

  /**
   * @param {object} provider - Object exposing an async `embedDocuments(documents)`.
   * @param {number} [batchSize=16] - Fragments sent per provider call.
   * @throws {RangeError} When `batchSize` is not a positive integer. A size
   *   of 0 would never advance the batching loop, and fractional or NaN
   *   sizes would duplicate or drop fragments.
   */
  constructor(provider, batchSize = 16) {
    if (!Number.isInteger(batchSize) || batchSize < 1) {
      throw new RangeError(
        `batchSize must be a positive integer, received ${batchSize}`
      );
    }
    this.provider = provider;
    this.batchSize = batchSize;
  }

  /**
   * Embeds every fragment, one batch at a time and in order.
   *
   * @param {Array<object>} fragments - Items carrying `id`, `text`, `metadata`.
   * @returns {Promise<Array<object>>} Provider results from all batches, concatenated.
   */
  async embedFragments(fragments) {
    const results = [];
    for (let start = 0; start < fragments.length; start += this.batchSize) {
      const documents = fragments
        .slice(start, start + this.batchSize)
        .map(({ id, text, metadata }) => ({ id, text, metadata }));
      const embeddings = await this.provider.embedDocuments(documents);
      results.push(...embeddings);
    }
    return results;
  }
}
|
|
313
|
+
|
|
314
|
+
// src/ingestion/vector-indexer.ts
|
|
315
|
+
/**
 * Pairs text fragments with their embeddings and writes them to a vector
 * store collection through `provider.upsert(request)`.
 */
class VectorIndexer {
  provider;
  config;

  /**
   * @param {object} provider - Object exposing an async `upsert(request)` method.
   * @param {object} config - Reads `collection`, `namespace` and `metadata`.
   */
  constructor(provider, config) {
    this.provider = provider;
    this.config = config;
  }

  /**
   * Builds one vector document per embedding and upserts them in a single request.
   *
   * Payload precedence (later wins): `config.metadata`, then the matching
   * fragment's `metadata`, then `documentId`. An embedding without a matching
   * fragment is still written, with `documentId` left undefined.
   *
   * @param {Array<object>} fragments - Items carrying `id`, `documentId`, `metadata`.
   * @param {Array<object>} embeddings - Items carrying `id` and `vector`.
   * @returns {Promise<void>}
   */
  async upsert(fragments, embeddings) {
    // Index fragments once so pairing is linear instead of rescanning the
    // fragment list for every embedding. The first fragment with a given id
    // wins, which is what a linear `find` would have returned.
    const fragmentsById = new Map();
    for (const fragment of fragments) {
      if (!fragmentsById.has(fragment.id)) {
        fragmentsById.set(fragment.id, fragment);
      }
    }

    const documents = embeddings.map((embedding) => {
      const fragment = fragmentsById.get(embedding.id);
      return {
        id: embedding.id,
        vector: embedding.vector,
        payload: {
          ...this.config.metadata,
          ...(fragment?.metadata ?? {}),
          documentId: fragment?.documentId,
        },
        namespace: this.config.namespace,
      };
    });

    await this.provider.upsert({
      collection: this.config.collection,
      documents,
    });
  }
}
|
|
343
|
+
|
|
344
|
+
// src/ingestion/gmail-adapter.ts
|
|
345
|
+
/**
 * Ingests Gmail threads: each thread is flattened to plain text, processed
 * into fragments, embedded, and upserted through the indexer.
 */
class GmailIngestionAdapter {
  gmail;
  processor;
  embeddings;
  indexer;

  constructor(gmail, processor, embeddings, indexer) {
    this.gmail = gmail;
    this.processor = processor;
    this.embeddings = embeddings;
    this.indexer = indexer;
  }

  /** Lists threads matching `query` and ingests them one after another. */
  async syncThreads(query) {
    const matchingThreads = await this.gmail.listThreads(query);
    for (const thread of matchingThreads) {
      await this.ingestThread(thread);
    }
  }

  /** Runs a single thread through process -> embed -> upsert. */
  async ingestThread(thread) {
    const rawDocument = this.toRawDocument(thread);
    const fragments = await this.processor.process(rawDocument);
    const vectors = await this.embeddings.embedFragments(fragments);
    await this.indexer.upsert(fragments, vectors);
  }

  /**
   * Converts a thread into a "text/plain" raw document whose metadata holds
   * the subject, a comma-separated participant e-mail list, and the ISO
   * timestamp of `thread.updatedAt`.
   */
  toRawDocument(thread) {
    const data = Buffer.from(composeThreadText(thread), "utf-8");
    const participantEmails = thread.participants.map((p) => p.email);
    const metadata = {
      subject: thread.subject ?? "",
      participants: participantEmails.join(", "),
      updatedAt: thread.updatedAt.toISOString(),
    };
    return { id: thread.id, mimeType: "text/plain", data, metadata };
  }
}
|
|
382
|
+
/**
 * Flattens a thread into text: a "Subject:" section, a "Snippet:" section,
 * then one section per message (From/To/optional Date header lines, a blank
 * line, and the body). Sections are separated by a "---" rule.
 */
function composeThreadText(thread) {
  const sections = [
    `Subject: ${thread.subject ?? ""}`,
    `Snippet: ${thread.snippet ?? ""}`,
  ];

  for (const message of thread.messages) {
    const headerLines = [
      `From: ${formatAddress(message.from)}`,
      `To: ${message.to.map(formatAddress).join(", ")}`,
    ];
    if (message.sentAt) {
      headerLines.push(`Date: ${message.sentAt.toISOString()}`);
    }

    // Prefer the plain-text body; otherwise fall back to the HTML body with tags stripped.
    const body = message.textBody ?? stripHtml(message.htmlBody ?? "");
    sections.push(`${headerLines.join("\n")}\n\n${body ?? ""}`);
  }

  return sections.join("\n\n---\n\n");
}
|
|
407
|
+
/**
 * Formats an address as "Name <email>" when a name is present, otherwise
 * as the bare e-mail.
 */
function formatAddress(address) {
  if (address.name) {
    return `${address.name} <${address.email}>`;
  }
  return address.email;
}
|
|
410
|
+
/**
 * Replaces every "<...>" tag-like run with a single space. Entities and
 * surrounding whitespace are left untouched.
 */
function stripHtml(html) {
  const tagPattern = /<[^>]+>/g;
  return html.replace(tagPattern, " ");
}
|
|
413
|
+
|
|
414
|
+
// src/ingestion/storage-adapter.ts
|
|
415
|
+
/**
 * Ingests a storage object: its bytes are processed into fragments,
 * embedded, and upserted through the indexer.
 */
class StorageIngestionAdapter {
  processor;
  embeddings;
  indexer;

  constructor(processor, embeddings, indexer) {
    this.processor = processor;
    this.embeddings = embeddings;
    this.indexer = indexer;
  }

  /**
   * @param {object} object - Reads `key`, `bucket`, `data`, and the optional
   *   `contentType` (default "application/octet-stream") and `checksum` (default "").
   * @returns {Promise<void>}
   * @throws {Error} When the object has no `data` property or it is falsy.
   */
  async ingestObject(object) {
    const hasData = "data" in object && Boolean(object.data);
    if (!hasData) {
      throw new Error("Storage ingestion requires object data");
    }

    const metadata = {
      bucket: object.bucket,
      checksum: object.checksum ?? "",
    };
    const rawDocument = {
      id: object.key,
      mimeType: object.contentType ?? "application/octet-stream",
      data: object.data,
      metadata,
    };

    const fragments = await this.processor.process(rawDocument);
    const vectors = await this.embeddings.embedFragments(fragments);
    await this.indexer.upsert(fragments, vectors);
  }
}
|
|
442
|
+
export {
|
|
443
|
+
createVectorRetriever,
|
|
444
|
+
createStaticRetriever,
|
|
445
|
+
VectorRetriever,
|
|
446
|
+
VectorIndexer,
|
|
447
|
+
StorageIngestionAdapter,
|
|
448
|
+
StaticRetriever,
|
|
449
|
+
KnowledgeQueryService,
|
|
450
|
+
KnowledgeAccessGuard,
|
|
451
|
+
GmailIngestionAdapter,
|
|
452
|
+
EmbeddingService,
|
|
453
|
+
DocumentProcessor
|
|
454
|
+
};
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
// src/ingestion/document-processor.ts
|
|
2
|
+
import { Buffer } from "node:buffer";
|
|
3
|
+
|
|
4
|
+
/**
 * Turns raw documents into text fragments using extractors registered per
 * MIME type. Extractors for "text/plain" and "application/json" are
 * registered on construction.
 */
class DocumentProcessor {
  extractors = new Map();

  constructor() {
    this.registerExtractor("text/plain", this.extractText.bind(this));
    this.registerExtractor("application/json", this.extractJson.bind(this));
  }

  /**
   * Registers (or replaces) the extractor for a MIME type. Keys are stored
   * lower-cased; "*&#47;*" acts as the catch-all.
   */
  registerExtractor(mimeType, extractor) {
    this.extractors.set(mimeType.toLowerCase(), extractor);
  }

  /**
   * Runs the matching extractor over `document`.
   *
   * @param {object} document - Reads `id`, `mimeType`, `data`, `metadata`.
   * @returns {Promise<Array<object>>} The extractor's fragments, or a single
   *   empty-text fragment when the extractor produced none.
   * @throws {Error} When no extractor matches the MIME type.
   */
  async process(document) {
    const extractor = this.#resolveExtractor(document.mimeType);
    if (!extractor) {
      throw new Error(`No extractor registered for mime type ${document.mimeType}`);
    }

    const fragments = await extractor(document);
    if (fragments.length === 0) {
      return [
        {
          id: `${document.id}:0`,
          documentId: document.id,
          text: "",
          metadata: document.metadata,
        },
      ];
    }
    return fragments;
  }

  /**
   * Looks up an extractor: exact (lower-cased) type first, then the type
   * with any ";parameter" suffix removed, then the "*&#47;*" catch-all.
   * The middle step lets "text/plain; charset=utf-8" reach "text/plain".
   */
  #resolveExtractor(mimeType) {
    const exact = mimeType.toLowerCase();
    const base = exact.split(";", 1)[0].trim();
    return (
      this.extractors.get(exact) ??
      this.extractors.get(base) ??
      this.extractors.get("*/*")
    );
  }

  /** Decodes the document bytes as UTF-8 into a single fragment. */
  async extractText(document) {
    const text = Buffer.from(document.data).toString("utf-8");
    return [
      {
        id: `${document.id}:0`,
        documentId: document.id,
        text,
        metadata: document.metadata,
      },
    ];
  }

  /**
   * Parses the document as JSON and emits it pretty-printed, tagging the
   * fragment with `contentType: "application/json"`.
   */
  async extractJson(document) {
    const text = Buffer.from(document.data).toString("utf-8");
    try {
      const json = JSON.parse(text);
      return [
        {
          id: `${document.id}:0`,
          documentId: document.id,
          text: JSON.stringify(json, null, 2),
          metadata: {
            ...document.metadata,
            contentType: "application/json",
          },
        },
      ];
    } catch {
      // Deliberate best effort: content that is not valid JSON is still
      // ingested, as plain text.
      return this.extractText(document);
    }
  }
}
|
|
62
|
+
export {
|
|
63
|
+
DocumentProcessor
|
|
64
|
+
};
|