@nusoft/nuos-build-catalogue 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +13 -0
- package/dist/cli.js +491 -0
- package/dist/commands/create.d.ts +70 -0
- package/dist/commands/create.js +341 -0
- package/dist/commands/format.d.ts +19 -0
- package/dist/commands/format.js +89 -0
- package/dist/commands/handlers.d.ts +35 -0
- package/dist/commands/handlers.js +132 -0
- package/dist/commands/init.d.ts +41 -0
- package/dist/commands/init.js +289 -0
- package/dist/commands/prompt.d.ts +44 -0
- package/dist/commands/prompt.js +100 -0
- package/dist/commands/write.d.ts +39 -0
- package/dist/commands/write.js +247 -0
- package/dist/embedder/ollama.d.ts +54 -0
- package/dist/embedder/ollama.js +164 -0
- package/dist/embedder/openai.d.ts +21 -0
- package/dist/embedder/openai.js +56 -0
- package/dist/embedder/select.d.ts +9 -0
- package/dist/embedder/select.js +27 -0
- package/dist/embedder/stub.d.ts +15 -0
- package/dist/embedder/stub.js +40 -0
- package/dist/embedder/types.d.ts +21 -0
- package/dist/embedder/types.js +6 -0
- package/dist/embedder/vertex.d.ts +41 -0
- package/dist/embedder/vertex.js +94 -0
- package/dist/indexer/chunk.d.ts +20 -0
- package/dist/indexer/chunk.js +196 -0
- package/dist/indexer/crawl.d.ts +20 -0
- package/dist/indexer/crawl.js +66 -0
- package/dist/indexer/metadata.d.ts +21 -0
- package/dist/indexer/metadata.js +126 -0
- package/dist/indexer/upsert.d.ts +26 -0
- package/dist/indexer/upsert.js +152 -0
- package/dist/migrate/parsers.d.ts +17 -0
- package/dist/migrate/parsers.js +123 -0
- package/dist/migrate/run.d.ts +22 -0
- package/dist/migrate/run.js +142 -0
- package/dist/migrate/store.d.ts +20 -0
- package/dist/migrate/store.js +52 -0
- package/dist/migrate/types.d.ts +57 -0
- package/dist/migrate/types.js +13 -0
- package/dist/regenerate/check.d.ts +11 -0
- package/dist/regenerate/check.js +97 -0
- package/dist/regenerate/diff.d.ts +18 -0
- package/dist/regenerate/diff.js +38 -0
- package/dist/regenerate/types.d.ts +52 -0
- package/dist/regenerate/types.js +14 -0
- package/dist/runtime/ac-parse.d.ts +63 -0
- package/dist/runtime/ac-parse.js +196 -0
- package/dist/runtime/markdown-edit.d.ts +53 -0
- package/dist/runtime/markdown-edit.js +101 -0
- package/dist/runtime/markdown-render.d.ts +27 -0
- package/dist/runtime/markdown-render.js +209 -0
- package/dist/runtime/mis-adapter.d.ts +35 -0
- package/dist/runtime/mis-adapter.js +364 -0
- package/dist/runtime/runtime.d.ts +20 -0
- package/dist/runtime/runtime.js +39 -0
- package/dist/search/format.d.ts +6 -0
- package/dist/search/format.js +23 -0
- package/dist/search/query.d.ts +29 -0
- package/dist/search/query.js +71 -0
- package/dist/store/open.d.ts +14 -0
- package/dist/store/open.js +16 -0
- package/package.json +3 -2
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vertex AI embedder — text-embedding-005 (768 dims).
|
|
3
|
+
*
|
|
4
|
+
* Auth: GOOGLE_APPLICATION_CREDENTIALS env var pointing at a service
|
|
5
|
+
* account JSON, or any other ADC mechanism Google accepts.
|
|
6
|
+
*
|
|
7
|
+
* Chosen as the default because: it matches Sensight's production
|
|
8
|
+
* embedder; UK data residency is available; the README example
|
|
9
|
+
* documents 768 dimensions (matches text-embedding-005); per D010 the
|
|
10
|
+
* choice is the consumer's.
|
|
11
|
+
*
|
|
12
|
+
* Implementation note — Vertex's REST API does not bundle nicely without
|
|
13
|
+
* the Google auth library. Rather than vendor a heavyweight SDK in
|
|
14
|
+
* this small CLI, this implementation expects either:
|
|
15
|
+
* - GOOGLE_VERTEX_ACCESS_TOKEN env var (a short-lived OAuth token,
|
|
16
|
+
* refreshable via `gcloud auth print-access-token`), or
|
|
17
|
+
* - GOOGLE_APPLICATION_CREDENTIALS pointing at a service account JSON,
|
|
18
|
+
* in which case it shells out to `gcloud` to mint a token.
|
|
19
|
+
*
|
|
20
|
+
* The shell-out path is used by Sensight's local dev environment too.
|
|
21
|
+
* For production use of this CLI a future revision can adopt
|
|
22
|
+
* @google-cloud/aiplatform; not needed for Phase 0.
|
|
23
|
+
*/
|
|
24
|
+
import type { Embedder } from './types.js';
|
|
25
|
+
interface VertexConfig {
|
|
26
|
+
project: string;
|
|
27
|
+
location: string;
|
|
28
|
+
accessToken: string;
|
|
29
|
+
}
|
|
30
|
+
export declare class VertexEmbedder implements Embedder {
|
|
31
|
+
private readonly config;
|
|
32
|
+
readonly dimensions = 768;
|
|
33
|
+
readonly modelId = "text-embedding-005";
|
|
34
|
+
constructor(config: VertexConfig);
|
|
35
|
+
static fromEnv(): VertexEmbedder;
|
|
36
|
+
private get endpoint();
|
|
37
|
+
embed(texts: string[]): Promise<Float32Array[]>;
|
|
38
|
+
private embedBatch;
|
|
39
|
+
dispose(): Promise<void>;
|
|
40
|
+
}
|
|
41
|
+
export {};
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vertex AI embedder — text-embedding-005 (768 dims).
|
|
3
|
+
*
|
|
4
|
+
* Auth: GOOGLE_APPLICATION_CREDENTIALS env var pointing at a service
|
|
5
|
+
* account JSON, or any other ADC mechanism Google accepts.
|
|
6
|
+
*
|
|
7
|
+
* Chosen as the default because: it matches Sensight's production
|
|
8
|
+
* embedder; UK data residency is available; the README example
|
|
9
|
+
* documents 768 dimensions (matches text-embedding-005); per D010 the
|
|
10
|
+
* choice is the consumer's.
|
|
11
|
+
*
|
|
12
|
+
* Implementation note — Vertex's REST API does not bundle nicely without
|
|
13
|
+
* the Google auth library. Rather than vendor a heavyweight SDK in
|
|
14
|
+
* this small CLI, this implementation expects either:
|
|
15
|
+
* - GOOGLE_VERTEX_ACCESS_TOKEN env var (a short-lived OAuth token,
|
|
16
|
+
* refreshable via `gcloud auth print-access-token`), or
|
|
17
|
+
* - GOOGLE_APPLICATION_CREDENTIALS pointing at a service account JSON,
|
|
18
|
+
* in which case it shells out to `gcloud` to mint a token.
|
|
19
|
+
*
|
|
20
|
+
* The shell-out path is used by Sensight's local dev environment too.
|
|
21
|
+
* For production use of this CLI a future revision can adopt
|
|
22
|
+
* @google-cloud/aiplatform; not needed for Phase 0.
|
|
23
|
+
*/
|
|
24
|
+
import { execSync } from 'node:child_process';
|
|
25
|
+
const MODEL_ID = 'text-embedding-005';
const DIMENSIONS = 768;
const DEFAULT_LOCATION = 'us-central1';
/**
 * Embedder that calls the Vertex AI `:predict` REST endpoint for
 * `text-embedding-005` using a bearer token supplied at construction.
 *
 * The token is captured once (see `fromEnv`) and is not refreshed by this
 * class.
 */
export class VertexEmbedder {
    config;
    dimensions = DIMENSIONS;
    modelId = MODEL_ID;
    /**
     * @param config `{ project, location, accessToken }` used to build the
     *   endpoint URL and the `authorization` header.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Build an embedder from environment variables.
     *
     * - project: `GOOGLE_CLOUD_PROJECT`, falling back to `GCP_PROJECT`
     * - location: `GOOGLE_CLOUD_LOCATION`, falling back to `us-central1`
     * - token: `GOOGLE_VERTEX_ACCESS_TOKEN`, otherwise the output of
     *   `gcloud auth print-access-token`
     *
     * @returns a configured VertexEmbedder
     * @throws Error when no project is configured or no token can be obtained
     */
    static fromEnv() {
        // `||` rather than `??`: a variable that is set but empty must not
        // block the fallback (with `??` an empty GOOGLE_CLOUD_PROJECT hid a
        // valid GCP_PROJECT, and an empty location produced a broken host).
        const project = process.env.GOOGLE_CLOUD_PROJECT || process.env.GCP_PROJECT;
        if (!project) {
            throw new Error('GOOGLE_CLOUD_PROJECT (or GCP_PROJECT) is required for the vertex embedder.');
        }
        const location = process.env.GOOGLE_CLOUD_LOCATION || DEFAULT_LOCATION;
        let accessToken = process.env.GOOGLE_VERTEX_ACCESS_TOKEN;
        if (!accessToken) {
            try {
                accessToken = execSync('gcloud auth print-access-token', {
                    encoding: 'utf8',
                }).trim();
                // An empty token would otherwise be sent as "Bearer " and only
                // fail later with an unhelpful 401.
                if (!accessToken) {
                    throw new Error('`gcloud auth print-access-token` printed an empty token.');
                }
            }
            catch (err) {
                // `gcloud auth print-access-token` reads gcloud's own account
                // credentials, so the hint points at `gcloud auth login`.
                throw new Error('Could not obtain a Vertex access token. Set GOOGLE_VERTEX_ACCESS_TOKEN, ' +
                    'or run `gcloud auth login` and ensure `gcloud` is on PATH. ' +
                    'Original error: ' +
                    (err instanceof Error ? err.message : String(err)));
            }
        }
        return new VertexEmbedder({ project, location, accessToken });
    }
    /** Regional `:predict` URL for the configured project, location and model. */
    get endpoint() {
        const { project, location } = this.config;
        return `https://${location}-aiplatform.googleapis.com/v1/projects/${project}/locations/${location}/publishers/google/models/${MODEL_ID}:predict`;
    }
    /**
     * Embed `texts` in order, issuing one request per group of five.
     *
     * @param texts strings to embed
     * @returns one Float32Array per input, in input order; `[]` for no input
     * @throws Error when a request fails or a response is malformed
     */
    async embed(texts) {
        if (texts.length === 0)
            return [];
        // Vertex enforces a per-request batch limit — chunk to be safe
        const BATCH = 5;
        const out = [];
        for (let i = 0; i < texts.length; i += BATCH) {
            const slice = texts.slice(i, i + BATCH);
            const embeddings = await this.embedBatch(slice);
            out.push(...embeddings);
        }
        return out;
    }
    /**
     * Send one batch to the endpoint and decode the vectors.
     *
     * @param texts the inputs for a single request
     * @returns one Float32Array per input
     * @throws Error on a non-2xx status, or when the response does not carry
     *   exactly one `embeddings.values` array per input
     */
    async embedBatch(texts) {
        const res = await fetch(this.endpoint, {
            method: 'POST',
            headers: {
                'content-type': 'application/json',
                authorization: `Bearer ${this.config.accessToken}`,
            },
            body: JSON.stringify({
                instances: texts.map((content) => ({ content, task_type: 'RETRIEVAL_DOCUMENT' })),
            }),
        });
        if (!res.ok) {
            const body = await res.text().catch(() => '<unreadable body>');
            throw new Error(`Vertex embed call failed (${res.status}): ${body}`);
        }
        const json = await res.json();
        const predictions = json?.predictions;
        // Validate before mapping: a missing or short `predictions` array would
        // otherwise surface as a TypeError or silently misalign vectors.
        if (!Array.isArray(predictions) || predictions.length !== texts.length) {
            const got = Array.isArray(predictions) ? String(predictions.length) : 'no';
            throw new Error(`Vertex embed call returned ${got} predictions for ${texts.length} inputs.`);
        }
        return predictions.map((p, i) => {
            const values = p?.embeddings?.values;
            if (!Array.isArray(values)) {
                throw new Error(`Vertex embed response is missing embeddings.values for input ${i}.`);
            }
            return new Float32Array(values);
        });
    }
    // Cloud embedder — nothing to release on the local machine.
    async dispose() { }
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown-aware chunker.
|
|
3
|
+
*
|
|
4
|
+
* Splits a file on H1/H2/H3 boundaries. Code fences (``` ... ```) are
|
|
5
|
+
* preserved intact — we never break a chunk inside one. Each chunk gets
|
|
6
|
+
* a deterministic id of the form `<relPath>#<heading-slug-path>` so
|
|
7
|
+
* re-indexing the same file produces stable IDs.
|
|
8
|
+
*
|
|
9
|
+
* Token budget: estimated as ~4 chars per token (rough but adequate for
|
|
10
|
+
* routing decisions; the actual cost is at the embedder, which has its
|
|
11
|
+
* own per-call limits we respect there).
|
|
12
|
+
*/
|
|
13
|
+
export interface Chunk {
|
|
14
|
+
id: string;
|
|
15
|
+
text: string;
|
|
16
|
+
headings: string[];
|
|
17
|
+
startLine: number;
|
|
18
|
+
endLine: number;
|
|
19
|
+
}
|
|
20
|
+
export declare function chunkMarkdown(relativePath: string, content: string): Chunk[];
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown-aware chunker.
|
|
3
|
+
*
|
|
4
|
+
* Splits a file on H1/H2/H3 boundaries. Code fences (``` ... ```) are
|
|
5
|
+
* preserved intact — we never break a chunk inside one. Each chunk gets
|
|
6
|
+
* a deterministic id of the form `<relPath>#<heading-slug-path>` so
|
|
7
|
+
* re-indexing the same file produces stable IDs.
|
|
8
|
+
*
|
|
9
|
+
* Token budget: estimated as ~4 chars per token (rough but adequate for
|
|
10
|
+
* routing decisions; the actual cost is at the embedder, which has its
|
|
11
|
+
* own per-call limits we respect there).
|
|
12
|
+
*/
|
|
13
|
+
const MAX_CHUNK_CHARS = 600 * 4; // ~600 tokens
const OVERLAP_CHARS = 50 * 4; // ~50 tokens overlap when splitting
/**
 * Minimum body length (the section text MINUS its heading line) for a
 * chunk to be embedded as its own unit. Sections shorter than this are
 * merged with a neighbouring section so the embedding has something
 * substantive to anchor on. Without this, headings like `## Scope` with
 * no body until the next sub-heading produce single-line chunks that all
 * match the same generic queries at the same similarity, crowding real
 * content out of search results.
 */
const MIN_BODY_CHARS = 80;
/**
 * Split a markdown file into chunks.
 *
 * The content is split into heading sections, tiny sections are merged,
 * and any section longer than MAX_CHUNK_CHARS is cut into overlapping
 * character slices. Sections that are empty after trimming are dropped.
 *
 * Chunk ids are unique within the returned array: when two chunks would
 * receive the same id (for example two sections with the same heading
 * path), the later ones get an `@2`, `@3`, ... suffix in document order.
 * The first occurrence keeps the plain id.
 *
 * @param relativePath path used as the id prefix
 * @param content full markdown text of the file
 * @returns chunks in document order; every slice of a split section
 *   carries that section's headings and line range
 */
export function chunkMarkdown(relativePath, content) {
    const sections = mergeTinySections(splitOnHeadings(content.split('\n')));
    const chunks = [];
    const usedIds = new Set();
    // Return baseId, or the first free `baseId@n`, and reserve it.
    const claimId = (baseId) => {
        let id = baseId;
        for (let n = 2; usedIds.has(id); n += 1) {
            id = `${baseId}@${n}`;
        }
        usedIds.add(id);
        return id;
    };
    for (const section of sections) {
        const sectionText = section.lines.join('\n').trim();
        if (sectionText.length === 0)
            continue;
        // Sections within budget become a single chunk (slice index 0).
        const pieces = sectionText.length <= MAX_CHUNK_CHARS ? [sectionText] : sliceLong(sectionText);
        pieces.forEach((text, i) => {
            chunks.push({
                id: claimId(makeChunkId(relativePath, section.headings, i)),
                text,
                headings: section.headings,
                startLine: section.startLine,
                endLine: section.endLine,
            });
        });
    }
    return chunks;
}
|
|
58
|
+
/**
 * Merge sections whose body is under MIN_BODY_CHARS into their
 * neighbours so that no chunk consists of little more than a heading.
 *
 * A tiny section is merged forward into the section that follows it, and
 * the merge keeps absorbing following sections until the accumulated
 * body reaches MIN_BODY_CHARS. (Previously only the immediate neighbour
 * was absorbed and the result was never re-checked, so a run such as
 * `# Title` / `## Scope` / `### Item ...` still produced a chunk made of
 * two bare headings.)
 *
 * The merged section takes the headings of whichever side has the longer
 * heading path, preferring the later section on a tie, so search results
 * point at the more specific section.
 *
 * If the file ends on a tiny run, that run is merged backward into the
 * previous section and the previous section's headings are kept. A file
 * made only of tiny sections collapses into one section.
 *
 * @param sections raw sections in document order
 * @returns sections in document order; the input array itself when it
 *   has zero or one element
 */
function mergeTinySections(sections) {
    if (sections.length <= 1)
        return sections;
    // Join two adjacent sections, choosing the heading path as described above.
    const mergeForward = (earlier, later) => ({
        headings: later.headings.length >= earlier.headings.length ? later.headings : earlier.headings,
        lines: [...earlier.lines, ...later.lines],
        startLine: earlier.startLine,
        endLine: later.endLine,
    });
    const merged = [];
    // True while the last entry of `merged` is still below the threshold
    // and should absorb the next section.
    let tailIsTiny = false;
    for (const section of sections) {
        const candidate = tailIsTiny ? mergeForward(merged.pop(), section) : section;
        merged.push(candidate);
        tailIsTiny = bodyOfSection(candidate).length < MIN_BODY_CHARS;
    }
    // The file ended on a tiny run: fold it into the section before it.
    if (tailIsTiny && merged.length > 1) {
        const tiny = merged.pop();
        const prev = merged.pop();
        merged.push({
            headings: prev.headings,
            lines: [...prev.lines, ...tiny.lines],
            startLine: prev.startLine,
            endLine: tiny.endLine,
        });
    }
    return merged;
}
|
|
112
|
+
/**
 * Return the text of a section without its heading line, trimmed.
 *
 * The first line is dropped only when it actually is an H1-H3 heading.
 * A section with no heading (text before the first heading of a file)
 * keeps all of its lines; previously its first content line was dropped
 * too, so a one-paragraph preamble was measured as empty.
 *
 * @param section object with a `lines` array
 * @returns the body text, or '' when there is none
 */
function bodyOfSection(section) {
    const [firstLine, ...rest] = section.lines;
    // Same heading pattern the section splitter uses.
    const startsWithHeading = firstLine !== undefined && /^(#{1,3})\s+(.+?)\s*$/u.test(firstLine);
    const bodyLines = startsWithHeading ? rest : section.lines;
    return bodyLines.join('\n').trim();
}
|
|
117
|
+
/**
 * Split lines into sections at H1/H2/H3 headings found outside fenced
 * code blocks.
 *
 * Fence tracking: a fence opens on a line that (after trimming) starts
 * with three or more backticks or tildes, and closes only on a line made
 * of the same character, at least as long as the opener, with nothing
 * after it. A backtick run followed by text that contains another
 * backtick is treated as ordinary text, not as a fence opener.
 * Previously any line starting with three backticks toggled the state,
 * so `~~~` fences and longer outer fences leaked `#` lines as headings.
 *
 * Each section records the heading path in effect, its lines (heading
 * line first), and its 1-based start/end line numbers. Text before the
 * first heading forms a section with an empty heading path.
 *
 * @param lines the file content split on '\n'
 * @returns sections in document order
 */
function splitOnHeadings(lines) {
    const sections = [];
    const stack = []; // heading text per depth; '' marks a skipped level
    // fenceChar is '' while outside a fenced block.
    let fenceChar = '';
    let fenceLength = 0;
    let current = {
        headings: [...stack],
        lines: [],
        startLine: 1,
    };
    // Emit the section being built, unless nothing was collected at all.
    const flush = (endLine) => {
        if (current.lines.length > 0 || current.headings.length > 0) {
            sections.push({
                headings: current.headings,
                lines: current.lines,
                startLine: current.startLine,
                endLine,
            });
        }
    };
    lines.forEach((line, i) => {
        const lineNum = i + 1;
        const fence = /^(`{3,}|~{3,})(.*)$/u.exec(line.trim());
        if (fence) {
            const marker = fence[1];
            const rest = fence[2];
            if (fenceChar === '') {
                // An opening backtick fence cannot have backticks in its info
                // string; such a line is inline code, not a fence.
                if (marker[0] !== '`' || !rest.includes('`')) {
                    fenceChar = marker[0];
                    fenceLength = marker.length;
                }
            }
            else if (marker[0] === fenceChar && marker.length >= fenceLength && rest.trim() === '') {
                fenceChar = '';
                fenceLength = 0;
            }
            current.lines.push(line);
            return;
        }
        if (fenceChar === '') {
            const m = /^(#{1,3})\s+(.+?)\s*$/u.exec(line);
            if (m) {
                // close the current section before the heading line
                flush(lineNum - 1);
                const depth = m[1].length;
                const text = m[2];
                // Truncate to depth-1, padding any holes left by missing parent
                // levels (e.g. a file that starts at H3 with no preceding H1/H2).
                stack.length = Math.max(0, depth - 1);
                for (let s = 0; s < stack.length; s++) {
                    if (stack[s] === undefined)
                        stack[s] = '';
                }
                stack.push(text);
                current = {
                    headings: stack.filter((h) => h && h.length > 0),
                    lines: [line],
                    startLine: lineNum,
                };
                return;
            }
        }
        current.lines.push(line);
    });
    flush(lines.length);
    return sections;
}
|
|
171
|
+
/**
 * Cut `text` into consecutive slices of at most `maxChars` UTF-16 code
 * units, each slice starting `overlapChars` before the end of the
 * previous one.
 *
 * Slices are purely character based; the cut does not look at markdown
 * structure. A cut is moved back by one code unit when it would fall
 * between the two halves of a surrogate pair, so no slice starts or ends
 * on a lone surrogate (previously an astral character at the boundary
 * was split in two).
 *
 * @param text the text to slice
 * @param maxChars maximum slice length; defaults to MAX_CHUNK_CHARS
 * @param overlapChars overlap between neighbours; defaults to
 *   OVERLAP_CHARS. When the overlap would prevent forward progress the
 *   next slice starts where the previous one ended.
 * @returns the slices in order; `[]` for empty text
 * @throws RangeError when `maxChars` is not at least 1
 */
function sliceLong(text, maxChars = MAX_CHUNK_CHARS, overlapChars = OVERLAP_CHARS) {
    if (!(maxChars >= 1)) {
        throw new RangeError(`sliceLong: maxChars must be >= 1 (got ${maxChars}).`);
    }
    const isHighSurrogate = (i) => {
        const code = text.charCodeAt(i);
        return code >= 0xd800 && code <= 0xdbff;
    };
    const isLowSurrogate = (i) => {
        const code = text.charCodeAt(i);
        return code >= 0xdc00 && code <= 0xdfff;
    };
    // True when cutting at index i would separate a surrogate pair.
    const splitsPair = (i) => i > 0 && i < text.length && isHighSurrogate(i - 1) && isLowSurrogate(i);
    const out = [];
    let pos = 0;
    while (pos < text.length) {
        let end = Math.min(pos + maxChars, text.length);
        if (splitsPair(end) && end - 1 > pos)
            end -= 1;
        out.push(text.slice(pos, end));
        if (end >= text.length)
            break;
        let next = end - overlapChars;
        if (splitsPair(next))
            next -= 1;
        // Never move backwards or stand still; drop the overlap instead.
        if (next <= pos)
            next = end;
        pos = next;
    }
    return out;
}
|
|
185
|
+
/**
 * Build the id for one chunk: `<relPath>#<slug-path>` plus `~<n>` for
 * the n-th slice (n > 0) of a split section.
 *
 * Each heading is lower-cased, runs of characters other than a-z and 0-9
 * become a single '-', leading and trailing '-' are removed, and the
 * result is cut to 60 characters. Headings that reduce to nothing are
 * skipped; when none remain the slug path is `root`.
 *
 * @param relPath id prefix
 * @param headings heading path of the section
 * @param sliceIdx index of the slice within its section
 * @returns the chunk id
 */
function makeChunkId(relPath, headings, sliceIdx) {
    const parts = [];
    for (const heading of headings) {
        const dashed = heading.toLowerCase().replace(/[^a-z0-9]+/gu, '-');
        const part = dashed.replace(/^-+|-+$/gu, '').slice(0, 60);
        if (part) {
            parts.push(part);
        }
    }
    const anchor = parts.length > 0 ? parts.join('/') : 'root';
    const suffix = sliceIdx > 0 ? `~${sliceIdx}` : '';
    return `${relPath}#${anchor}${suffix}`;
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawler — walks the NuOS catalogue tree picking up indexable .md files.
|
|
3
|
+
*
|
|
4
|
+
* Per WU 110 spec:
|
|
5
|
+
* - includes: docs/build/**, docs/contracts/**, docs/philosophy/**,
|
|
6
|
+
* docs/guides/**, plus top-level docs/build/STATE.md, BUILD-ORDER.md,
|
|
7
|
+
* README.md, reference-index.md
|
|
8
|
+
* - skips: _index.md (derived; adds noise), done/, archive/, superseded/
|
|
9
|
+
* subdirs (opt-in via includeArchived)
|
|
10
|
+
* - skips: .excalidraw, binary
|
|
11
|
+
*/
|
|
12
|
+
export interface CrawlOptions {
|
|
13
|
+
catalogueRoot: string;
|
|
14
|
+
includeArchived?: boolean;
|
|
15
|
+
}
|
|
16
|
+
export interface CrawledFile {
|
|
17
|
+
absolutePath: string;
|
|
18
|
+
relativePath: string;
|
|
19
|
+
}
|
|
20
|
+
export declare function crawl(options: CrawlOptions): Promise<CrawledFile[]>;
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawler — walks the NuOS catalogue tree picking up indexable .md files.
|
|
3
|
+
*
|
|
4
|
+
* Per WU 110 spec:
|
|
5
|
+
* - includes: docs/build/**, docs/contracts/**, docs/philosophy/**,
|
|
6
|
+
* docs/guides/**, plus top-level docs/build/STATE.md, BUILD-ORDER.md,
|
|
7
|
+
* README.md, reference-index.md
|
|
8
|
+
* - skips: _index.md (derived; adds noise), done/, archive/, superseded/
|
|
9
|
+
* subdirs (opt-in via includeArchived)
|
|
10
|
+
* - skips: .excalidraw, binary
|
|
11
|
+
*/
|
|
12
|
+
import { readdir, stat } from 'node:fs/promises';
|
|
13
|
+
import path from 'node:path';
|
|
14
|
+
const TOP_LEVEL_INCLUDES = ['build', 'contracts', 'philosophy', 'guides'];
const SKIPPED_DIR_NAMES = new Set(['node_modules', '.git', '.nuos-catalogue']);
const ARCHIVED_DIR_NAMES = new Set(['done', 'archive', 'superseded']);
const INDEX_FILENAMES = new Set(['_index.md']);
/**
 * Collect the indexable files under each top-level include directory
 * that exists beneath `options.catalogueRoot`.
 *
 * @param options `{ catalogueRoot, includeArchived? }`; passed through
 *   unchanged to the directory walker
 * @returns the collected `{ absolutePath, relativePath }` records, sorted
 *   by `relativePath` using `localeCompare`
 */
export async function crawl(options) {
    const found = [];
    for (const name of TOP_LEVEL_INCLUDES) {
        const startDir = path.join(options.catalogueRoot, name);
        const present = await exists(startDir);
        if (!present) {
            continue;
        }
        await walkDir(startDir, options, found);
    }
    found.sort((left, right) => left.relativePath.localeCompare(right.relativePath));
    return found;
}
|
|
28
|
+
/**
 * Recursively append the markdown files found under `dir` to `out`.
 *
 * Directories named in SKIPPED_DIR_NAMES are never entered; directories
 * named in ARCHIVED_DIR_NAMES are entered only when
 * `options.includeArchived` is truthy. Only regular files whose name
 * ends in `.md` and is not in INDEX_FILENAMES are recorded. A directory
 * that cannot be read is skipped without an error.
 *
 * @param dir directory to scan
 * @param options `{ catalogueRoot, includeArchived? }`
 * @param out array that receives `{ absolutePath, relativePath }`, with
 *   `relativePath` taken relative to `options.catalogueRoot`
 */
async function walkDir(dir, options, out) {
    let listing;
    try {
        listing = await readdir(dir, { withFileTypes: true });
    }
    catch {
        // Unreadable directory: best-effort crawl, nothing to add.
        return;
    }
    for (const item of listing) {
        const itemPath = path.join(dir, item.name);
        if (item.isDirectory()) {
            const alwaysSkipped = SKIPPED_DIR_NAMES.has(item.name);
            const archivedAndExcluded = !options.includeArchived && ARCHIVED_DIR_NAMES.has(item.name);
            if (!alwaysSkipped && !archivedAndExcluded) {
                await walkDir(itemPath, options, out);
            }
        }
        else if (item.isFile() && item.name.endsWith('.md') && !INDEX_FILENAMES.has(item.name)) {
            out.push({
                absolutePath: itemPath,
                relativePath: path.relative(options.catalogueRoot, itemPath),
            });
        }
    }
}
|
|
58
|
+
/**
 * Report whether `p` can be stat'ed.
 *
 * @param p path to probe
 * @returns true when `stat` succeeds, false when it rejects for any reason
 */
async function exists(p) {
    return stat(p).then(() => true, () => false);
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-file metadata extraction.
|
|
3
|
+
*
|
|
4
|
+
* Returns a structured FileMeta record from the file path and content.
|
|
5
|
+
* Per-kind resolvers handle the variation between work-units, decisions,
|
|
6
|
+
* sessions, etc.
|
|
7
|
+
*
|
|
8
|
+
* Cross-references parse markdown links of the form `(D040)`, `(WU 110)`,
|
|
9
|
+
* `(Q015)`, `[D040](D040-...)`, etc. — so a future query like "what
|
|
10
|
+
* references D040" can be answered by metadata filter alone.
|
|
11
|
+
*/
|
|
12
|
+
export type FileKind = 'work_unit' | 'decision' | 'session' | 'open_question' | 'risk' | 'contract' | 'philosophy' | 'guide' | 'map' | 'state' | 'build_order' | 'reference' | 'readme' | 'unknown';
|
|
13
|
+
export interface FileMeta {
|
|
14
|
+
path: string;
|
|
15
|
+
kind: FileKind;
|
|
16
|
+
idInKind: string | null;
|
|
17
|
+
status: string | null;
|
|
18
|
+
date: string | null;
|
|
19
|
+
crossRefs: string[];
|
|
20
|
+
}
|
|
21
|
+
export declare function extractMetadata(absolutePath: string, relativePath: string, content: string): Promise<FileMeta>;
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-file metadata extraction.
|
|
3
|
+
*
|
|
4
|
+
* Returns a structured FileMeta record from the file path and content.
|
|
5
|
+
* Per-kind resolvers handle the variation between work-units, decisions,
|
|
6
|
+
* sessions, etc.
|
|
7
|
+
*
|
|
8
|
+
* Cross-references parse markdown links of the form `(D040)`, `(WU 110)`,
|
|
9
|
+
* `(Q015)`, `[D040](D040-...)`, etc. — so a future query like "what
|
|
10
|
+
* references D040" can be answered by metadata filter alone.
|
|
11
|
+
*/
|
|
12
|
+
import { stat } from 'node:fs/promises';
|
|
13
|
+
/**
 * Assemble the metadata record for one file.
 *
 * The kind is derived from the relative path; the id, status and
 * cross-references from the path and content; the date from the content,
 * with the file at `absolutePath` consulted by the date extractor.
 *
 * @param absolutePath location of the file on disk
 * @param relativePath path stored in the record and used for classification
 * @param content full text of the file
 * @returns `{ path, kind, idInKind, status, date, crossRefs }`
 */
export async function extractMetadata(absolutePath, relativePath, content) {
    const kind = classifyKind(relativePath);
    return {
        path: relativePath,
        kind,
        idInKind: extractIdInKind(kind, relativePath, content),
        status: extractStatus(content),
        date: await extractDate(absolutePath, content),
        crossRefs: extractCrossRefs(content),
    };
}
|
|
28
|
+
/**
 * Map a relative path to its file kind.
 *
 * Backslashes are treated as path separators. Four exact paths under
 * `build/` are checked first, then directory prefixes in a fixed order;
 * anything else is 'unknown'.
 *
 * @param relPath path relative to the catalogue root
 * @returns the kind name
 */
function classifyKind(relPath) {
    const normalised = relPath.replace(/\\/g, '/');
    const exactPaths = new Map([
        ['build/STATE.md', 'state'],
        ['build/BUILD-ORDER.md', 'build_order'],
        ['build/README.md', 'readme'],
        ['build/reference-index.md', 'reference'],
    ]);
    const exactKind = exactPaths.get(normalised);
    if (exactKind !== undefined) {
        return exactKind;
    }
    const prefixRules = [
        ['build/work-units/', 'work_unit'],
        ['build/decisions/', 'decision'],
        ['build/sessions/', 'session'],
        ['build/open-questions/', 'open_question'],
        ['build/risks/', 'risk'],
        ['build/maps/', 'map'],
        ['contracts/', 'contract'],
        ['philosophy/', 'philosophy'],
        ['guides/', 'guide'],
    ];
    const rule = prefixRules.find(([prefix]) => normalised.startsWith(prefix));
    return rule ? rule[1] : 'unknown';
}
|
|
58
|
+
/**
 * Derive the per-kind identifier of a file.
 *
 * - work_unit: `WU <nnn>` from three leading digits of the file name
 * - decision / open_question / risk: leading `D<digits>` / `Q<digits>` /
 *   `R<digits>` of the file name
 * - session: text of the first H1 in the content, otherwise the file
 *   name without `.md`
 * - every other kind: null
 *
 * Backslashes in `relPath` are treated as separators, matching the kind
 * classifier; previously a Windows-style path was never split, so the
 * file-name patterns did not match and the id was lost.
 *
 * @param kind file kind
 * @param relPath path relative to the catalogue root
 * @param content full text of the file (used for sessions only)
 * @returns the identifier, or null when none can be derived
 */
function extractIdInKind(kind, relPath, content) {
    const file = relPath.replace(/\\/g, '/').split('/').pop() ?? '';
    if (kind === 'work_unit') {
        const m = /^(\d{3})/.exec(file);
        return m ? `WU ${m[1]}` : null;
    }
    if (kind === 'decision') {
        const m = /^(D\d+)/.exec(file);
        return m ? m[1] : null;
    }
    if (kind === 'open_question') {
        const m = /^(Q\d+)/.exec(file);
        return m ? m[1] : null;
    }
    if (kind === 'risk') {
        const m = /^(R\d+)/.exec(file);
        return m ? m[1] : null;
    }
    if (kind === 'session') {
        // Session files are dated; pull the leading H1 if present
        const h = /^#\s+(.+?)\s*$/m.exec(content);
        return h ? h[1] : file.replace(/\.md$/, '');
    }
    return null;
}
|
|
83
|
+
/**
 * Read the value of the first `**Status:**` line in the content.
 *
 * The value must be on the same line as the label. Previously the
 * whitespace after the label was allowed to cross line breaks, so a
 * label with no value picked up the start of the next paragraph.
 *
 * @param content full text of the file
 * @returns the status with whitespace runs collapsed to single spaces,
 *   or null when there is no `**Status:**` line with a value
 */
function extractStatus(content) {
    // [^\S\r\n] = whitespace other than line breaks.
    const m = /^\*\*Status:\*\*[^\S\r\n]*(.+?)[^\S\r\n]*$/m.exec(content);
    if (m)
        return m[1].replace(/\s+/g, ' ').trim();
    return null;
}
|
|
89
|
+
/**
 * Find the date of a file.
 *
 * A `**Date:** YYYY-MM-DD` line is preferred, then a `Date: YYYY-MM-DD`
 * line. When the content has neither, the file's modification time is
 * used, formatted as the first ten characters of its ISO string.
 *
 * @param absolutePath file whose mtime is the fallback
 * @param content full text of the file
 * @returns `YYYY-MM-DD`, or null when the content has no date and the
 *   file cannot be stat'ed
 */
async function extractDate(absolutePath, content) {
    const datePatterns = [
        /^\*\*Date:\*\*\s*(\d{4}-\d{2}-\d{2})/m,
        /^Date:\s*(\d{4}-\d{2}-\d{2})/m,
    ];
    for (const pattern of datePatterns) {
        const hit = pattern.exec(content);
        if (hit) {
            return hit[1];
        }
    }
    try {
        const info = await stat(absolutePath);
        return info.mtime.toISOString().slice(0, 10);
    }
    catch {
        return null;
    }
}
|
|
103
|
+
/**
 * Reference patterns and how each match is canonicalised: `D`, `Q` and
 * `R` followed by exactly three digits are kept as written; work-unit
 * references are normalised by `normaliseWu`.
 */
const REF_PATTERNS = [
    { regex: /\bD\d{3}\b/gu, canonical: (m) => m[0] },
    { regex: /\bQ\d{3}\b/gu, canonical: (m) => m[0] },
    { regex: /\bR\d{3}\b/gu, canonical: (m) => m[0] },
    { regex: /\bWU\s*0?\d{2,3}[a-z]?\b/giu, canonical: (m) => normaliseWu(m[0]) },
];
/**
 * Normalise a work-unit reference to `WU <nnn><suffix>`: the number is
 * left-padded with zeros to three digits and an optional single-letter
 * suffix is lower-cased.
 *
 * @param raw matched text such as `wu110a` or `WU 12`
 * @returns the canonical form, or `raw` upper-cased when it does not
 *   contain a two- or three-digit number after `WU`
 */
function normaliseWu(raw) {
    const m = /WU\s*0?(\d{2,3})([a-z]?)/i.exec(raw);
    if (!m)
        return raw.toUpperCase();
    const num = m[1].padStart(3, '0');
    const tail = m[2] ? m[2].toLowerCase() : '';
    return `WU ${num}${tail}`;
}
/**
 * Collect every distinct reference mentioned in the content.
 *
 * `matchAll` iterates over a private copy of each pattern, so the shared
 * module-level regexes never carry `lastIndex` state between calls.
 *
 * @param content full text of the file
 * @returns canonical references, de-duplicated and sorted with the
 *   default string ordering
 */
function extractCrossRefs(content) {
    const found = new Set();
    for (const { regex, canonical } of REF_PATTERNS) {
        for (const match of content.matchAll(regex)) {
            found.add(canonical(match));
        }
    }
    return [...found].sort();
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Index orchestrator — crawl + chunk + extract metadata + embed + upsert.
|
|
3
|
+
*
|
|
4
|
+
* Hash-based incremental: a separate `.nuos-catalogue/hashes.json` tracks
|
|
5
|
+
* the last-indexed content hash per file. Unchanged files are skipped.
|
|
6
|
+
* Deleted files are removed from the index.
|
|
7
|
+
*/
|
|
8
|
+
import type { NuVector } from '@nusoft/nuvector';
|
|
9
|
+
import type { Embedder } from '../embedder/types.js';
|
|
10
|
+
export interface IndexConfig {
|
|
11
|
+
catalogueRoot: string;
|
|
12
|
+
hashFilePath: string;
|
|
13
|
+
store: NuVector;
|
|
14
|
+
embedder: Embedder;
|
|
15
|
+
force?: boolean;
|
|
16
|
+
dryRun?: boolean;
|
|
17
|
+
}
|
|
18
|
+
export interface IndexReport {
|
|
19
|
+
indexed: number;
|
|
20
|
+
updated: number;
|
|
21
|
+
deleted: number;
|
|
22
|
+
unchanged: number;
|
|
23
|
+
chunks: number;
|
|
24
|
+
durationMs: number;
|
|
25
|
+
}
|
|
26
|
+
export declare function runIndex(config: IndexConfig): Promise<IndexReport>;
|