@hasna/knowledge 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -4
- package/bin/open-knowledge-mcp.js +1067 -1569
- package/bin/open-knowledge.js +257 -4
- package/docs/architecture/ai-native-knowledge-base.md +191 -0
- package/docs/architecture/hybrid-semantic-search.md +135 -0
- package/package.json +12 -7
- package/src/artifact-store.ts +184 -0
- package/src/cli.ts +662 -0
- package/src/knowledge-db.ts +247 -0
- package/src/manifest-ingest.ts +423 -0
- package/src/mcp.js +533 -0
- package/src/schema.js +25 -0
- package/src/source-ref.ts +92 -0
- package/src/store.ts +16 -6
- package/src/wiki-layout.ts +104 -0
- package/src/workspace.ts +123 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Hybrid Semantic Search Architecture
|
|
2
|
+
|
|
3
|
+
`open-knowledge` search is hybrid by design. Keyword search, semantic vector
|
|
4
|
+
search, wiki graph traversal, and citation/provenance signals each solve a
|
|
5
|
+
different part of the retrieval problem.
|
|
6
|
+
|
|
7
|
+
## Ownership Boundary
|
|
8
|
+
|
|
9
|
+
`open-files` supplies:
|
|
10
|
+
|
|
11
|
+
- Stable source refs such as `open-files://file/<id>` and
|
|
12
|
+
`open-files://file/<id>/revision/<revision_id>`.
|
|
13
|
+
- Source ids, file ids, revisions, hashes, MIME metadata, and storage location.
|
|
14
|
+
- Extracted text and extraction status.
|
|
15
|
+
- Read-only content resolution for deeper reads.
|
|
16
|
+
- Change events that trigger incremental reindexing.
|
|
17
|
+
|
|
18
|
+
`open-knowledge` owns:
|
|
19
|
+
|
|
20
|
+
- Chunk boundaries and chunk metadata.
|
|
21
|
+
- Embeddings for source chunks, wiki pages, decisions, and durable answers.
|
|
22
|
+
- FTS indexes over chunks and generated wiki artifacts.
|
|
23
|
+
- Wiki backlinks and citation graph signals.
|
|
24
|
+
- Merge, dedupe, rerank, freshness scoring, and context packing.
|
|
25
|
+
- Permission-aware filtering before any content reaches a model.
|
|
26
|
+
|
|
27
|
+
## Local Indexes
|
|
28
|
+
|
|
29
|
+
Local mode starts with SQLite:
|
|
30
|
+
|
|
31
|
+
- `chunks` stores source/wiki text segments and metadata.
|
|
32
|
+
- `chunks_fts` provides keyword search.
|
|
33
|
+
- `chunk_embeddings` stores embedding vectors as JSON until a local vector
|
|
34
|
+
extension is chosen.
|
|
35
|
+
- `wiki_pages`, `wiki_backlinks`, and `citations` provide graph and provenance
|
|
36
|
+
signals.
|
|
37
|
+
- `knowledge_indexes` tracks generated machine-readable shards.
|
|
38
|
+
|
|
39
|
+
The JSON vector representation is intentionally simple for the first local
|
|
40
|
+
implementation. The retrieval interface should hide it so a later vector
|
|
41
|
+
extension or pgvector backend can replace storage without changing CLI/MCP
|
|
42
|
+
contracts.
|
|
43
|
+
|
|
44
|
+
## Hosted Indexes
|
|
45
|
+
|
|
46
|
+
Hosted mode may use:
|
|
47
|
+
|
|
48
|
+
- Postgres plus pgvector for sources/chunks/wiki/runs.
|
|
49
|
+
- Managed vector stores for large corpora.
|
|
50
|
+
- Object storage for generated artifact shards.
|
|
51
|
+
- Worker queues for extraction, embedding, compile, lint, and refresh jobs.
|
|
52
|
+
|
|
53
|
+
Permission filtering must happen before reranked context is sent to the model.
|
|
54
|
+
Filtering only after retrieval is not enough if the retriever or reranker can see
|
|
55
|
+
unauthorized content.
|
|
56
|
+
|
|
57
|
+
## Query Pipeline
|
|
58
|
+
|
|
59
|
+
1. Normalize the query.
|
|
60
|
+
2. Embed the query if a semantic-capable provider is configured.
|
|
61
|
+
3. Run keyword FTS over source chunks and wiki chunks.
|
|
62
|
+
4. Run vector search over source chunks and wiki pages.
|
|
63
|
+
5. Expand candidate pages through backlinks and citations.
|
|
64
|
+
6. Drop stale candidates whose source revision/hash no longer matches
|
|
65
|
+
`open-files`.
|
|
66
|
+
7. Apply permission filters.
|
|
67
|
+
8. Merge and dedupe by source revision, wiki page, citation, and text hash.
|
|
68
|
+
9. Rerank by relevance, exact-match score, semantic score, freshness, citation
|
|
69
|
+
quality, and wiki authority.
|
|
70
|
+
10. Return structured results with source refs, citation spans, page refs,
|
|
71
|
+
scores, and reason codes.
|
|
72
|
+
|
|
73
|
+
## Result Shape
|
|
74
|
+
|
|
75
|
+
Search results should be structured enough for CLI, MCP, and agents:
|
|
76
|
+
|
|
77
|
+
```json
|
|
78
|
+
{
|
|
79
|
+
"kind": "source_chunk",
|
|
80
|
+
"id": "chunk_123",
|
|
81
|
+
"text": "...",
|
|
82
|
+
"score": 0.87,
|
|
83
|
+
"scores": {
|
|
84
|
+
"keyword": 0.72,
|
|
85
|
+
"semantic": 0.91,
|
|
86
|
+
"freshness": 1
|
|
87
|
+
},
|
|
88
|
+
"source": {
|
|
89
|
+
"uri": "open-files://file/file_123",
|
|
90
|
+
"revision": "rev_456",
|
|
91
|
+
"hash": "..."
|
|
92
|
+
},
|
|
93
|
+
"citation": {
|
|
94
|
+
"start_offset": 120,
|
|
95
|
+
"end_offset": 260
|
|
96
|
+
},
|
|
97
|
+
"reason": ["semantic_match", "exact_term", "fresh_source"]
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Context Packs
|
|
102
|
+
|
|
103
|
+
`knowledge <prompt>` and MCP `knowledge_ask` should not receive raw search rows.
|
|
104
|
+
They should receive context packs:
|
|
105
|
+
|
|
106
|
+
- Query and normalized intent.
|
|
107
|
+
- Selected source/wiki excerpts.
|
|
108
|
+
- Citation metadata.
|
|
109
|
+
- Freshness and permission notes.
|
|
110
|
+
- Known uncertainty or conflicting sources.
|
|
111
|
+
- Suggested durable wiki updates.
|
|
112
|
+
|
|
113
|
+
This keeps agent prompts stable while the retrieval internals evolve.
|
|
114
|
+
|
|
115
|
+
## Reindexing
|
|
116
|
+
|
|
117
|
+
Reindexing is driven by source revisions:
|
|
118
|
+
|
|
119
|
+
- If an `open-files` hash/revision changes, affected chunks and embeddings become
|
|
120
|
+
stale.
|
|
121
|
+
- If a source is deleted or access changes, affected chunks must be hidden or
|
|
122
|
+
removed before future retrieval.
|
|
123
|
+
- Wiki pages should track the source revisions they cite so lint can flag stale
|
|
124
|
+
pages.
|
|
125
|
+
- Embedding refresh jobs should be idempotent and checkpointed in `runs` and
|
|
126
|
+
`run_events`.
|
|
127
|
+
|
|
128
|
+
## Acceptance Criteria
|
|
129
|
+
|
|
130
|
+
- Local search works without network access for keyword-only retrieval.
|
|
131
|
+
- Semantic search is optional and provider-gated.
|
|
132
|
+
- Every returned excerpt can resolve to a source ref or wiki page.
|
|
133
|
+
- Permission filters run before model context assembly.
|
|
134
|
+
- Retrieval internals can swap from JSON vectors to pgvector or managed vector
|
|
135
|
+
stores without changing CLI/MCP result contracts.
|
package/package.json
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hasna/knowledge",
|
|
3
|
-
"version": "0.2.3",
|
|
3
|
+
"version": "0.2.5",
|
|
4
4
|
"description": "Agent-friendly local knowledge CLI with JSON output, pagination, and safe destructive actions",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
|
-
"open-knowledge": "
|
|
8
|
-
"open-knowledge-mcp": "
|
|
7
|
+
"open-knowledge": "bin/open-knowledge.js",
|
|
8
|
+
"open-knowledge-mcp": "bin/open-knowledge-mcp.js"
|
|
9
9
|
},
|
|
10
10
|
"files": [
|
|
11
11
|
"bin",
|
|
12
|
-
"src
|
|
13
|
-
"
|
|
12
|
+
"src",
|
|
13
|
+
"docs",
|
|
14
14
|
"LICENSE",
|
|
15
15
|
"README.md"
|
|
16
16
|
],
|
|
17
17
|
"scripts": {
|
|
18
18
|
"test": "bun test",
|
|
19
19
|
"test:cli": "bun test tests/cli.test.ts",
|
|
20
|
-
"build": "bun build --target=bun --outfile=bin/open-knowledge.js --minify src/cli.ts && bun build --target=bun --outfile=bin/open-knowledge-mcp.js --external @modelcontextprotocol/sdk src/mcp.js",
|
|
20
|
+
"build": "bun build --target=bun --outfile=bin/open-knowledge.js --minify --external @aws-sdk/client-s3 --external @aws-sdk/credential-providers src/cli.ts && bun build --target=bun --outfile=bin/open-knowledge-mcp.js --external @modelcontextprotocol/sdk src/mcp.js",
|
|
21
21
|
"prepublishOnly": "bun run build",
|
|
22
22
|
"postinstall": "bun run build"
|
|
23
23
|
},
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
},
|
|
38
38
|
"repository": {
|
|
39
39
|
"type": "git",
|
|
40
|
-
"url": "https://github.com/hasna/knowledge"
|
|
40
|
+
"url": "git+https://github.com/hasna/knowledge.git"
|
|
41
41
|
},
|
|
42
42
|
"bugs": {
|
|
43
43
|
"url": "https://github.com/hasna/knowledge/issues"
|
|
@@ -48,7 +48,12 @@
|
|
|
48
48
|
"node": ">=18"
|
|
49
49
|
},
|
|
50
50
|
"dependencies": {
|
|
51
|
+
"@aws-sdk/client-s3": "^3.1063.0",
|
|
52
|
+
"@aws-sdk/credential-providers": "^3.1063.0",
|
|
51
53
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
52
54
|
"zod": "^4.3.6"
|
|
55
|
+
},
|
|
56
|
+
"devDependencies": {
|
|
57
|
+
"@types/bun": "^1.3.14"
|
|
53
58
|
}
|
|
54
59
|
}
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { dirname, join, relative, sep } from 'node:path';
|
|
3
|
+
import type { KnowledgeConfig, KnowledgeWorkspace } from './workspace';
|
|
4
|
+
|
|
5
|
+
/**
 * Smallest client surface this module needs: an object with an async
 * `send(command)` method. The resolved value is left untyped because its
 * shape depends on which command was sent.
 */
interface S3ClientLike {
  send(command: unknown): Promise<any>;
}
|
|
8
|
+
|
|
9
|
+
/** A single artifact to be written through an {@link ArtifactStore}. */
export interface ArtifactWrite {
  /** Store-relative key; both stores in this file normalise it before use. */
  key: string;
  /** Payload, written as given. */
  body: string | Uint8Array;
  /** Sent as `ContentType` by the S3 store; the local store does not read it. */
  content_type?: string;
  /** Sent as `Metadata` by the S3 store; the local store does not read it. */
  metadata?: Record<string, string>;
}
|
|
15
|
+
|
|
16
|
+
/** Common contract implemented by the local and S3 artifact backends. */
export interface ArtifactStore {
  /** Discriminator identifying the backend. */
  readonly type: 'local' | 's3';
  /** Capability flag for reads. */
  readonly canRead: boolean;
  /** Capability flag for writes. */
  readonly canWrite: boolean;
  /** Writes `entry` and resolves with the key it was stored under plus a URI for it. */
  put(entry: ArtifactWrite): Promise<{ key: string; uri: string }>;
  /** Resolves with the artifact's content as text. */
  getText(key: string): Promise<string>;
  /** Resolves with whether an artifact is present under `key`. */
  exists(key: string): Promise<boolean>;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Canonicalise an artifact key into a relative, `/`-separated path.
 *
 * Backslashes become `/`, surrounding whitespace is trimmed, and empty
 * segments (from `a//b` or a trailing `/`) are discarded.
 *
 * @param key - Caller-supplied artifact key.
 * @returns The remaining segments joined with `/`.
 * @throws Error when the key is empty, begins with `/`, or contains a `.` or
 *   `..` segment. The message always embeds the key exactly as it was passed in.
 */
export function normalizeArtifactKey(key: string): string {
  const invalid = (): Error => new Error(`Invalid artifact key: ${key}`);

  const cleaned = key.replace(/\\/g, '/').trim();
  if (cleaned === '' || cleaned.startsWith('/')) throw invalid();

  const parts: string[] = [];
  for (const part of cleaned.split('/')) {
    if (part === '') continue; // collapse repeated or trailing separators
    if (part === '.' || part === '..') throw invalid();
    parts.push(part);
  }
  if (parts.length === 0) throw invalid();

  return parts.join('/');
}
|
|
36
|
+
|
|
37
|
+
/**
 * Guard that `target` does not climb out of `root`.
 *
 * The check is made on the path from `root` to `target` as computed by
 * `path.relative`; nothing is read from disk.
 *
 * @param root - Directory that must contain `target`.
 * @param target - Path to validate.
 * @throws Error when the relative path is `..` or begins with a `..` segment.
 */
function assertInside(root: string, target: string): void {
  const rel = relative(root, target);
  // Compare whole segments: a bare `startsWith('..')` would also reject
  // in-root names that merely begin with two dots, such as `..notes/a.md`.
  if (rel === '..' || rel.startsWith(`..${sep}`)) {
    throw new Error(`Artifact path escapes root: ${target}`);
  }
}
|
|
43
|
+
|
|
44
|
+
/**
 * Artifact store backed by a directory on the local filesystem.
 *
 * Every key is normalised and resolved beneath `root`; a path that would fall
 * outside `root` is rejected. The filesystem calls are synchronous even though
 * the methods return promises.
 */
export class LocalArtifactStore implements ArtifactStore {
  readonly type = 'local' as const;
  readonly canRead = true;
  readonly canWrite = true;

  /** Creates `root`, including missing parent directories, when absent. */
  constructor(private readonly root: string) {
    mkdirSync(root, { recursive: true });
  }

  /** Turns a caller key into its normalised form and its on-disk path under `root`. */
  private locate(rawKey: string): { key: string; path: string } {
    const key = normalizeArtifactKey(rawKey);
    const path = join(this.root, key);
    assertInside(this.root, path);
    return { key, path };
  }

  /** Writes `entry.body`, creating parent directories first, and reports a `file://` URI. */
  async put(entry: ArtifactWrite): Promise<{ key: string; uri: string }> {
    const { key, path } = this.locate(entry.key);
    mkdirSync(dirname(path), { recursive: true });
    writeFileSync(path, entry.body);
    return { key, uri: `file://${path}` };
  }

  /** Reads the artifact as UTF-8 text. */
  async getText(key: string): Promise<string> {
    return readFileSync(this.locate(key).path, 'utf8');
  }

  /** Reports whether a file is present at the key's path. */
  async exists(key: string): Promise<boolean> {
    return existsSync(this.locate(key).path);
  }
}
|
|
76
|
+
|
|
77
|
+
/** Settings accepted by {@link S3ArtifactStore}. */
export interface S3ArtifactStoreOptions {
  /** Bucket used for every request and in returned `s3://` URIs. */
  bucket: string;
  /** Optional key prefix; it is normalised and joined to each key with `/`. */
  prefix?: string;
  /** Passed to the S3 client as `region`. */
  region?: string;
  /** When set, credentials are loaded with `fromIni({ profile })`. */
  profile?: string;
  /** Passed to the S3 client as `maxAttempts`. */
  max_attempts?: number;
  /** Sent as `ServerSideEncryption` on uploads. */
  server_side_encryption?: 'AES256' | 'aws:kms';
  /** Sent as `SSEKMSKeyId` on uploads. */
  kms_key_id?: string;
  /** Ready-made client; when supplied, no client is constructed. */
  client?: S3ClientLike;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Artifact store backed by an S3 bucket.
 *
 * The AWS SDK modules are loaded with dynamic `import()` the first time they
 * are needed. A client is built on first use unless one was supplied through
 * `options.client`.
 */
export class S3ArtifactStore implements ArtifactStore {
  readonly type = 's3' as const;
  readonly canRead = true;
  readonly canWrite = true;
  private client?: S3ClientLike;

  constructor(private readonly options: S3ArtifactStoreOptions) {
    this.client = options.client;
  }

  /** Returns the cached client, building and caching one from the options when none exists yet. */
  private async getClient(): Promise<S3ClientLike> {
    const cached = this.client;
    if (cached) return cached;

    const [s3, credentialProviders] = await Promise.all([
      import('@aws-sdk/client-s3'),
      import('@aws-sdk/credential-providers'),
    ]);
    const { region, profile, max_attempts: maxAttempts } = this.options;
    const created: S3ClientLike = new s3.S3Client({
      region,
      // Leaving credentials undefined lets the client fall back to its own defaults.
      credentials: profile ? credentialProviders.fromIni({ profile }) : undefined,
      maxAttempts,
    });
    this.client = created;
    return created;
  }

  /** Builds the full object key: the normalised key, preceded by the normalised prefix when one is set. */
  private objectKey(key: string): string {
    const normalizedKey = normalizeArtifactKey(key);
    if (!this.options.prefix) return normalizedKey;
    return `${normalizeArtifactKey(this.options.prefix)}/${normalizedKey}`;
  }

  /** Uploads `entry` with a `PutObjectCommand` and reports the object key and its `s3://` URI. */
  async put(entry: ArtifactWrite): Promise<{ key: string; uri: string }> {
    const [sdk, client] = await Promise.all([
      import('@aws-sdk/client-s3'),
      this.getClient(),
    ]);
    const key = this.objectKey(entry.key);
    const command = new sdk.PutObjectCommand({
      Bucket: this.options.bucket,
      Key: key,
      Body: entry.body,
      ContentType: entry.content_type,
      Metadata: entry.metadata,
      ServerSideEncryption: this.options.server_side_encryption,
      SSEKMSKeyId: this.options.kms_key_id,
    });
    await client.send(command);
    return { key, uri: `s3://${this.options.bucket}/${key}` };
  }

  /** Fetches the object and returns its body as a string, or `''` when the response has no body. */
  async getText(key: string): Promise<string> {
    const [sdk, client] = await Promise.all([
      import('@aws-sdk/client-s3'),
      this.getClient(),
    ]);
    const objectKey = this.objectKey(key);
    const command = new sdk.GetObjectCommand({
      Bucket: this.options.bucket,
      Key: objectKey,
    });
    const response = await client.send(command);
    return response.Body ? await response.Body.transformToString() : '';
  }

  /**
   * Issues a `HeadObjectCommand` for the key.
   *
   * @returns `true` when the request succeeds, `false` when it fails with an
   *   error named `NotFound`, `NoSuchKey` or `NotFoundError`.
   * @throws Any other failure, unchanged.
   */
  async exists(key: string): Promise<boolean> {
    const [sdk, client] = await Promise.all([
      import('@aws-sdk/client-s3'),
      this.getClient(),
    ]);
    const objectKey = this.objectKey(key);
    try {
      const command = new sdk.HeadObjectCommand({
        Bucket: this.options.bucket,
        Key: objectKey,
      });
      await client.send(command);
      return true;
    } catch (error) {
      const missingNames = ['NotFound', 'NoSuchKey', 'NotFoundError'];
      if (error instanceof Error && missingNames.includes(error.name)) return false;
      throw error;
    }
  }
}
|
|
169
|
+
|
|
170
|
+
/**
 * Build the artifact store selected by `config.storage.type`.
 *
 * @param config - Knowledge configuration; its `storage` section is read.
 * @param workspace - Supplies `artifactsDir` for the local store.
 * @returns An `S3ArtifactStore` when the storage type is `'s3'`, otherwise a
 *   `LocalArtifactStore` rooted at `workspace.artifactsDir`.
 * @throws Error when S3 storage is selected without `storage.s3.bucket`.
 */
export function createArtifactStore(config: KnowledgeConfig, workspace: KnowledgeWorkspace): ArtifactStore {
  if (config.storage.type !== 's3') {
    return new LocalArtifactStore(workspace.artifactsDir);
  }

  const s3 = config.storage.s3;
  if (!s3?.bucket) throw new Error('S3 artifact storage requires storage.s3.bucket');

  return new S3ArtifactStore({
    bucket: s3.bucket,
    prefix: s3.prefix,
    region: s3.region,
    profile: s3.profile,
    max_attempts: s3.max_attempts,
    server_side_encryption: s3.server_side_encryption,
    kms_key_id: s3.kms_key_id,
  });
}
|