kushi-agents 5.8.4 → 5.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.mjs +124 -11
- package/package.json +2 -1
- package/plugin/instructions/global-wiki.instructions.md +7 -2
- package/plugin/runners/discover.mjs +4 -2
- package/plugin/runners/lib/references.mjs +164 -0
- package/plugin/runners/pull-references.mjs +209 -0
- package/plugin/runners/pull-state.mjs +297 -0
- package/plugin/runners/refresh.mjs +20 -0
- package/plugin/skills/global-wiki/SKILL.md +13 -4
- package/plugin/templates/init/kushi.template.yml +27 -0
- package/plugin/templates/user-config.json +20 -0
- package/src/constants.mjs +1 -0
- package/src/global-wiki-cli.mjs +230 -3
- package/src/global-wiki.mjs +133 -1
- package/src/main.mjs +4 -1
- package/src/seed-config.mjs +28 -0
- package/src/setup-wizard.mjs +13 -1
package/bin/cli.mjs
CHANGED
|
@@ -103,22 +103,47 @@ if (args.length > 0 && args[0] === 'lint') {
|
|
|
103
103
|
process.exit(0);
|
|
104
104
|
}
|
|
105
105
|
|
|
106
|
+
// ── one-shot wiki verb (v5.9.1+) ─────────────────────────────────────────────
// `kushi wiki` hands the whole job to runWiki() in the global-wiki CLI module.
// Per the release notes that call resolves the root, scaffolds it when it is
// missing, and prints path + status + a clickable file:// URL. It is meant to
// be idempotent, so "create wiki" / "do wiki" / "refresh wiki" / "update wiki"
// can all land here.
if (args[0] === 'wiki') {
  // Loaded lazily so other verbs do not pay for this import.
  const wikiCli = await import('../src/global-wiki-cli.mjs');
  await wikiCli.runWiki();
  process.exit(0);
}
|
|
115
|
+
|
|
106
116
|
// ── global verb (v5.3.0+) ────────────────────────────────────────────────────
|
|
107
117
|
if (args.length > 0 && args[0] === 'global') {
|
|
108
118
|
const sub = args[1] || '';
|
|
109
|
-
const validSubs = ['init', 'status', 'ask', 'lint'];
|
|
119
|
+
const validSubs = ['init', 'status', 'ask', 'lint', 'migrate', 'set-root', 'show-root'];
|
|
110
120
|
if (!validSubs.includes(sub)) {
|
|
111
|
-
console.error('\n Usage: kushi global init
|
|
112
|
-
console.error(' kushi global status
|
|
113
|
-
console.error(' kushi global ask <question>
|
|
114
|
-
console.error(' kushi global lint
|
|
121
|
+
console.error('\n Usage: kushi global init Scaffold State/ at the resolved root');
|
|
122
|
+
console.error(' kushi global status Show counts + freshness');
|
|
123
|
+
console.error(' kushi global ask <question> Ask the global wiki');
|
|
124
|
+
console.error(' kushi global lint Lint the global wiki');
|
|
125
|
+
console.error(' kushi global show-root Show how the root path is resolved');
|
|
126
|
+
console.error(' kushi global set-root <path> [--scope workspace|home]');
|
|
127
|
+
console.error(' Persist root (workspace shared by default,');
|
|
128
|
+
console.error(' falls back to ~/.kushi/config.json)');
|
|
129
|
+
console.error(' kushi global migrate <new-path> Copy wiki to a new root + persist setting\n');
|
|
130
|
+
console.error(' Tip: Set the root to a OneDrive-synced SharePoint folder to share');
|
|
131
|
+
console.error(' the wiki across a team. See docs/how-to/wiki-on-sharepoint.md.\n');
|
|
115
132
|
process.exit(1);
|
|
116
133
|
}
|
|
117
|
-
const
|
|
118
|
-
if (sub === 'init') await runGlobalInit();
|
|
119
|
-
else if (sub === 'status') await runGlobalStatus();
|
|
120
|
-
else if (sub === 'ask') await runGlobalAsk(args.slice(2).join(' '));
|
|
121
|
-
else if (sub === 'lint') await runGlobalLint();
|
|
134
|
+
const cliMod = await import('../src/global-wiki-cli.mjs');
|
|
135
|
+
if (sub === 'init') await cliMod.runGlobalInit();
|
|
136
|
+
else if (sub === 'status') await cliMod.runGlobalStatus();
|
|
137
|
+
else if (sub === 'ask') await cliMod.runGlobalAsk(args.slice(2).join(' '));
|
|
138
|
+
else if (sub === 'lint') await cliMod.runGlobalLint();
|
|
139
|
+
else if (sub === 'migrate') await cliMod.runGlobalMigrate(args[2]);
|
|
140
|
+
else if (sub === 'set-root') {
|
|
141
|
+
const scopeIdx = args.indexOf('--scope');
|
|
142
|
+
const scope = scopeIdx > 0 && args[scopeIdx + 1] ? args[scopeIdx + 1] : null;
|
|
143
|
+
const target = args.find((a, i) => i >= 2 && !a.startsWith('--') && args[i - 1] !== '--scope');
|
|
144
|
+
await cliMod.runGlobalSetRoot(target, { scope });
|
|
145
|
+
}
|
|
146
|
+
else if (sub === 'show-root') await cliMod.runGlobalShowRoot();
|
|
122
147
|
process.exit(0);
|
|
123
148
|
}
|
|
124
149
|
|
|
@@ -252,17 +277,105 @@ if (args.includes('--help') || args.includes('-h')) {
|
|
|
252
277
|
After install, talk to Kushi:
|
|
253
278
|
bootstrap <project> First-time setup
|
|
254
279
|
refresh <project> Incremental refresh + rebuild State/
|
|
255
|
-
state <project> Re-render State/ from existing Evidence
|
|
280
|
+
state <project> Re-render State/ from existing Evidence (deterministic
|
|
281
|
+
inventory; LLM build-state skill does narrative synthesis)
|
|
282
|
+
references <project> Scan Evidence for URLs and refresh the shared
|
|
283
|
+
references pool (Evidence/_shared/references/)
|
|
256
284
|
consolidate <project> Merge per-user evidence
|
|
257
285
|
status <project> Show run-log
|
|
258
286
|
ask <project> <q> Cited Q&A over Evidence/ (auto-routes, --file-back to save)
|
|
259
287
|
lint <project> Run wiki-lint checks on State/
|
|
260
288
|
|
|
289
|
+
Workspace lifecycle (v5.9.0+):
|
|
290
|
+
wiki One-shot: resolve global wiki root, scaffold
|
|
291
|
+
if missing, print status + clickable file://
|
|
292
|
+
link to the wiki index. Idempotent —
|
|
293
|
+
"do wiki" / "refresh wiki" / "create wiki"
|
|
294
|
+
all map here.
|
|
295
|
+
uninstall [--keep-config] Remove <cwd>/.kushi/ (preserves Evidence/, State/).
|
|
296
|
+
--keep-config preserves config/user/ identity files.
|
|
297
|
+
upgrade npm i -g kushi-agents@latest then re-seed assets
|
|
298
|
+
in cwd (config preserved).
|
|
299
|
+
|
|
261
300
|
In VS Code Chat the prefix is "@Kushi". In Clawpilot just say "kushi <verb>".
|
|
262
301
|
`);
|
|
263
302
|
process.exit(0);
|
|
264
303
|
}
|
|
265
304
|
|
|
305
|
+
// ── state / refresh / bootstrap verbs (v5.9.0+) ─────────────────────────────
// Thin shells that exec the deterministic runners. Keeps `kushi state HCA` etc.
// runnable from the global bin without users having to know the runner paths.
if (args.length > 0 && ['state', 'refresh-runner', 'bootstrap-runner', 'discover', 'references'].includes(args[0])) {
  const verb = args[0];
  const project = args[1];
  // A flag in the project slot (e.g. `kushi state --help`) would otherwise be
  // forwarded as `--project --help`; treat it as a missing project instead.
  if (!project || project.startsWith('-')) {
    console.error(`\n Usage: kushi ${verb} <project> [options]\n`);
    process.exit(1);
  }
  const { spawnSync } = await import('node:child_process');
  const pathMod = await import('node:path');
  const urlMod = await import('node:url');
  // Runners are resolved relative to this file, not the caller's cwd.
  const here = pathMod.dirname(urlMod.fileURLToPath(import.meta.url));
  const runnerMap = {
    state: 'pull-state.mjs',
    references: 'pull-references.mjs',
    discover: 'discover.mjs',
    'refresh-runner': 'refresh.mjs',
    'bootstrap-runner': 'bootstrap.mjs',
  };
  const runner = pathMod.resolve(here, '..', 'plugin', 'runners', runnerMap[verb]);
  // Everything after <project> is forwarded to the runner untouched.
  const passthrough = args.slice(2);
  const r = spawnSync(process.execPath, [runner, '--project', project, ...passthrough], { stdio: 'inherit' });
  // spawnSync reports launch failures via `error` with a null status; surface
  // the reason instead of exiting 1 silently.
  if (r.error) {
    console.error(`\n Failed to run ${runnerMap[verb]}: ${r.error.message}\n`);
    process.exit(1);
  }
  // status is null when the child was killed by a signal; report that as failure.
  process.exit(r.status ?? 1);
}
|
|
331
|
+
|
|
332
|
+
// ── workspace uninstall / upgrade verbs (v5.9.0+) ───────────────────────────
// Workspace uninstall only runs when no host flag is present. It deletes
// <cwd>/.kushi and nothing else, so Evidence/ and State/ are not touched.
if (args[0] === 'uninstall' && !['--clawpilot', '--vscode', '--all-hosts'].some((flag) => args.includes(flag))) {
  const { existsSync, rmSync } = await import('node:fs');
  const { join, resolve } = await import('node:path');
  const kushiDir = resolve(process.cwd(), '.kushi');

  if (!existsSync(kushiDir)) {
    console.error(`\n No .kushi/ directory found at ${kushiDir}\n`);
    process.exit(1);
  }

  if (!args.includes('--keep-config')) {
    // Plain uninstall: drop the whole directory.
    rmSync(kushiDir, { recursive: true, force: true });
    console.log(`\n Removed ${kushiDir}\n Evidence/ and State/ left untouched.\n`);
    process.exit(0);
  }

  // --keep-config: remove only the shipped asset directories that exist and
  // leave every other entry under .kushi/ in place.
  const shipped = ['agents', 'instructions', 'prompts', 'skills', 'templates', 'reference-packs', 'lib', 'runners'];
  const present = shipped.map((name) => join(kushiDir, name)).filter((candidate) => existsSync(candidate));
  for (const assetDir of present) {
    rmSync(assetDir, { recursive: true, force: true });
  }
  console.log(`\n Removed ${present.length} asset dir(s) from ${kushiDir} (config/user/ preserved).\n`);
  process.exit(0);
}
|
|
357
|
+
|
|
358
|
+
if (args.length > 0 && args[0] === 'upgrade') {
  // Upgrade: npm i -g @latest, then re-seed assets in cwd preserving config.
  const { spawnSync } = await import('node:child_process');
  const fsMod = await import('node:fs');
  const onWindows = process.platform === 'win32';
  const npm = onWindows ? 'npm.cmd' : 'npm';
  // Current Node releases refuse to spawn .cmd/.bat files without a shell on
  // Windows (the call fails with EINVAL), so opt into a shell there only.
  // Every argument below is a fixed literal, so nothing user-supplied reaches
  // the shell.
  const spawnOpts = { stdio: 'inherit', shell: onWindows };

  console.log('\n Upgrading kushi-agents globally via npm...\n');
  const r1 = spawnSync(npm, ['install', '-g', 'kushi-agents@latest'], spawnOpts);
  if (r1.status !== 0) {
    // A launch failure (npm not on PATH, EINVAL, ...) is reported via `error`.
    if (r1.error) console.error(`\n ${r1.error.message}`);
    console.error('\n npm install failed.\n');
    process.exit(r1.status ?? 1);
  }

  console.log('\n Refreshing assets in cwd (config preserved)...\n');
  if (fsMod.existsSync('.kushi')) {
    const r2 = spawnSync(npm, ['exec', '--', 'kushi-agents', '--no-prompt', '--force'], spawnOpts);
    if (r2.error) console.error(`\n ${r2.error.message}\n`);
    // A null status means the re-seed never ran or was killed — that is a
    // failure, not a success.
    process.exit(r2.status ?? 1);
  } else {
    console.log('\n No .kushi/ in cwd — global upgrade complete; cd into a project and run `kushi` to install.\n');
    process.exit(0);
  }
}
|
|
378
|
+
|
|
266
379
|
// ── multi-host mode (v5.0.2+) ───────────────────────────────────────────────
|
|
267
380
|
// Trigger when the user passes any of: --vscode, --all-hosts, --uninstall.
|
|
268
381
|
// --clawpilot ALONE continues to route through the legacy main.mjs path so
|
package/package.json
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "kushi-agents",
|
|
3
|
-
"version": "5.
|
|
3
|
+
"version": "5.9.1",
|
|
4
4
|
"description": "Install Kushi — multi-source project evidence agent with Comprehensive Structured Capture (CSC) into weekly-only files across Email, Teams, OneNote, Loop, SharePoint, Meetings, CRM, ADO. Meetings retain a sibling verbatim/ audit folder. WorkIQ-only for M365 sources (Graph / m365_* FORBIDDEN as fallbacks; user-paste is first-class). Host-agnostic.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
|
+
"kushi": "./bin/cli.mjs",
|
|
7
8
|
"kushi-agents": "./bin/cli.mjs"
|
|
8
9
|
},
|
|
9
10
|
"files": [
|
|
@@ -17,8 +17,13 @@ The global wiki is a personal, cross-engagement knowledge base that lives **outs
|
|
|
17
17
|
### 1. Location
|
|
18
18
|
|
|
19
19
|
- **Default:** `~/.kushi-global/State/` (`$HOME` on POSIX, `$env:USERPROFILE` on Windows).
|
|
20
|
-
- **
|
|
21
|
-
|
|
20
|
+
- **Resolution chain (v5.9.1+, first non-empty wins):**
|
|
21
|
+
1. `$KUSHI_GLOBAL_ROOT` env var (absolute path; tilde-expanded). Tests MUST set this to `.testtmp/.kushi-global/` — never touch the real path.
|
|
22
|
+
2. **`<workspace>/.kushi/config/shared/kushi.yml` → `globalRoot`** (workspace-shared, team-shareable, committed)
|
|
23
|
+
3. `~/.kushi/config.json` → `globalRoot` (per-user, per-machine)
|
|
24
|
+
4. Default `~/.kushi-global/`
|
|
25
|
+
- The global root is **per-user OR per-team** (workspace-shared scope), never per-project, never per-host.
|
|
26
|
+
- Inspect with `kushi global show-root`. Set with `kushi global set-root <path> [--scope workspace|home]` (default scope = workspace if a `.kushi/` is found above cwd, else home).
|
|
22
27
|
|
|
23
28
|
### 2. Shape
|
|
24
29
|
|
|
@@ -201,12 +201,14 @@ function applyRows(source, rows, currentBounds, currentInteg) {
|
|
|
201
201
|
return { boundariesPatch: added.length ? { onenote: { section_file_ids: merged } } : null, accepted };
|
|
202
202
|
}
|
|
203
203
|
if (source === 'sharepoint') {
|
|
204
|
-
|
|
204
|
+
// v5.9.0: SP sites are project-wide, not per-alias. Write into integrations.yml.
|
|
205
|
+
const existing = currentInteg.sharepoint?.sites || [];
|
|
205
206
|
const incoming = rows.map(r => r.site_url).filter(Boolean);
|
|
206
207
|
const merged = dedup([...existing, ...incoming]);
|
|
207
208
|
const added = merged.filter(v => !existing.includes(v));
|
|
208
209
|
if (added.length) accepted.push(...added);
|
|
209
|
-
|
|
210
|
+
const cur = currentInteg.sharepoint || {};
|
|
211
|
+
return { integrationsPatch: added.length ? { sharepoint: { ...cur, sites: merged } } : null, accepted };
|
|
210
212
|
}
|
|
211
213
|
if (source === 'crm') {
|
|
212
214
|
const cur = currentInteg.crm || {};
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
// plugin/runners/lib/references.mjs
|
|
2
|
+
// Deterministic URL extraction, classification, and lightweight HTTP snapshot
|
|
3
|
+
// for the unified references pool. No LLM. v5.9.0.
|
|
4
|
+
|
|
5
|
+
import { promises as fs } from 'node:fs';
|
|
6
|
+
import path from 'node:path';
|
|
7
|
+
import crypto from 'node:crypto';
|
|
8
|
+
|
|
9
|
+
/** Loose matcher for http(s) URLs embedded in markdown, YAML or plain text. */
const URL_RE = /\bhttps?:\/\/[^\s<>"'`)\]}|\\]+/gi;

/** Punctuation that tends to trail a URL in prose and is stripped from the end. */
const TRAILING_TRIM = /[)\].,;:!?>'"]+$/;

/**
 * Collect the distinct http(s) URLs found in a piece of text.
 *
 * @param {string} text - Text to scan; anything that is not a non-empty string yields [].
 * @returns {string[]} URLs in first-seen order, trailing punctuation removed.
 */
export function extractUrls(text) {
  if (typeof text !== 'string' || text === '') return [];
  const unique = new Set();
  for (const raw of text.match(URL_RE) ?? []) {
    const trimmed = raw.replace(TRAILING_TRIM, '');
    // Anything of 8 characters or fewer cannot be more than a bare scheme.
    if (trimmed.length > 8) unique.add(trimmed);
  }
  return Array.from(unique);
}
|
|
25
|
+
|
|
26
|
+
/**
 * Short, stable identifier for a URL, used for filenames and index keys.
 *
 * @param {string} url - Any URL; it is passed through normalizeUrl() first.
 * @returns {string} First 16 hex characters of the SHA-1 of the normalized URL.
 */
export function urlHash(url) {
  const hasher = crypto.createHash('sha1');
  hasher.update(normalizeUrl(url));
  const hex = hasher.digest('hex');
  return hex.slice(0, 16);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Canonicalise a URL for de-duplication: drop the fragment and common
 * tracking query parameters, keep path and the remaining query as-is.
 *
 * Tracking parameter names are matched case-insensitively, because
 * Microsoft links spell the campaign id `WT.mc_id` while the drop list is
 * lowercase and URLSearchParams.delete() is case-sensitive.
 *
 * NOTE: URLs carrying `WT.mc_id` now normalise (and therefore hash) to the
 * same value as the same link without it.
 *
 * @param {string} url - URL to normalise.
 * @returns {string} The normalised URL, or the input unchanged when it cannot be parsed.
 */
export function normalizeUrl(url) {
  try {
    const u = new URL(url);
    u.hash = '';
    const drop = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'wt.mc_id'];
    // Actual spellings present in this URL that match the drop list ignoring case.
    const present = [...u.searchParams.keys()].filter((k) => drop.includes(k.toLowerCase()));
    // Still delete every canonical name so the query is re-serialised exactly
    // as it was before this change.
    for (const k of new Set([...drop, ...present])) u.searchParams.delete(k);
    return u.toString();
  } catch {
    // Not a parseable absolute URL — hand it back untouched.
    return url;
  }
}
|
|
43
|
+
|
|
44
|
+
// Ordered hostname rules; the first match wins. Every pattern is anchored
// with `(^|\.)` so it matches the domain itself or a subdomain, never a
// lookalike such as `notloop.cloud.microsoft`.
const HOST_RULES = [
  { match: /(^|\.)sharepoint\.com$/i, host: 'sharepoint.com', kind: 'sharepoint', authRequired: true },
  { match: /(^|\.)loop\.microsoft\.com$/i, host: 'loop.microsoft.com', kind: 'loop', authRequired: true },
  { match: /(^|\.)loop\.cloud\.microsoft$/i, host: 'loop.cloud.microsoft', kind: 'loop', authRequired: true },
  { match: /(^|\.)loop-api\.cloud\.microsoft$/i, host: 'loop-api.cloud.microsoft', kind: 'loop', authRequired: true },
  { match: /(^|\.)fluidpreview\.office\.net$/i, host: 'fluidpreview.office.net', kind: 'loop', authRequired: true },
  { match: /(^|\.)teams\.microsoft\.com$/i, host: 'teams.microsoft.com', kind: 'teams', authRequired: true },
  { match: /(^|\.)office\.com$/i, host: 'office.com', kind: 'office', authRequired: true },
  { match: /(^|\.)dev\.azure\.com$/i, host: 'dev.azure.com', kind: 'ado', authRequired: true },
  { match: /(^|\.)visualstudio\.com$/i, host: 'visualstudio.com', kind: 'ado', authRequired: true },
  { match: /(^|\.)dynamics\.com$/i, host: 'dynamics.com', kind: 'crm', authRequired: true },
  { match: /(^|\.)learn\.microsoft\.com$/i, host: 'learn.microsoft.com', kind: 'docs', authRequired: false },
  { match: /(^|\.)docs\.microsoft\.com$/i, host: 'docs.microsoft.com', kind: 'docs', authRequired: false },
  { match: /(^|\.)github\.com$/i, host: 'github.com', kind: 'repo', authRequired: false },
];
|
|
59
|
+
|
|
60
|
+
/**
 * Map a URL onto the references taxonomy using HOST_RULES.
 *
 * @param {string} url - URL to classify.
 * @returns {{host: string, kind: string, authRequired: boolean}}
 *   The matching rule's canonical host/kind/auth flag; `kind: 'external'`
 *   with the literal hostname when no rule matches; `kind: 'invalid'` when
 *   the URL cannot be parsed.
 */
export function classify(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname.toLowerCase();
  } catch {
    return { host: 'unknown', kind: 'invalid', authRequired: false };
  }
  const rule = HOST_RULES.find((candidate) => candidate.match.test(hostname));
  if (rule === undefined) {
    return { host: hostname, kind: 'external', authRequired: false };
  }
  const { host, kind, authRequired } = rule;
  return { host, kind, authRequired };
}
|
|
69
|
+
|
|
70
|
+
/**
 * Turn a hostname into a single safe path segment.
 *
 * Lowercases, replaces every character outside `[a-z0-9.-]` with `_` and
 * caps the length at 80. A result made only of dots (`.` or `..`) would act
 * as a directory reference when joined into a path, so its dots are replaced
 * with `_` as well.
 *
 * @param {string} [host] - Hostname; falsy values become 'unknown'.
 * @returns {string} Filesystem-safe segment.
 */
export function safeHost(host) {
  const segment = (host || 'unknown').toLowerCase().replace(/[^a-z0-9.-]/g, '_').slice(0, 80);
  return /^\.+$/.test(segment) ? segment.replaceAll('.', '_') : segment;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Fetch a URL and extract a small content snapshot.
 *
 * On success returns { ok, status, title, description, h1, contentType,
 * bytes, snippet }; on any failure returns { ok: false, status: 0, error },
 * where `error` is 'timeout' when the request was aborted by the timer.
 * At most `maxBytes` bytes of the body are kept — the final chunk is
 * truncated rather than allowed to overshoot the cap.
 *
 * @param {string} url - URL to fetch (redirects are followed).
 * @param {object} [options]
 * @param {number} [options.timeoutMs=15000] - Abort the request and body read after this long.
 * @param {number} [options.maxBytes=65536] - Upper bound on body bytes retained.
 * @returns {Promise<object>} Snapshot or failure descriptor; never rejects.
 */
export async function fetchSnapshot(url, { timeoutMs = 15000, maxBytes = 64 * 1024 } = {}) {
  const ctrl = new AbortController();
  // The timer stays armed while the body is read; it is cleared in `finally`.
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    const res = await fetch(url, {
      redirect: 'follow',
      signal: ctrl.signal,
      headers: { 'user-agent': 'kushi-references/1.0 (+https://github.com/ushakrishnan/kushi)' },
    });
    const contentType = res.headers.get('content-type') || '';
    const reader = res.body?.getReader();
    let received = 0;
    const chunks = [];
    if (reader) {
      try {
        while (received < maxBytes) {
          const { done, value } = await reader.read();
          if (done) break;
          // Keep only what still fits under the cap.
          const room = maxBytes - received;
          const piece = value.byteLength > room ? value.subarray(0, room) : value;
          chunks.push(piece);
          received += piece.byteLength;
        }
      } finally {
        // cancel() returns a promise; await it and ignore its failure so a
        // rejected cancel cannot surface as an unhandled rejection.
        await reader.cancel().catch(() => {});
      }
    }
    const buf = Buffer.concat(chunks.map(c => Buffer.from(c)));
    const text = buf.toString('utf8');
    // HTML/XML by declared type, or by sniffing a doctype at the start / an <html tag anywhere.
    const html = /html|xml/i.test(contentType) || /^\s*<!doctype html|<html/i.test(text);
    const title = html ? extractTag(text, 'title') : '';
    const description = html ? extractMeta(text, 'description') : '';
    const h1 = html ? extractTag(text, 'h1') : '';
    const snippet = html ? stripHtml(text).slice(0, 600) : text.slice(0, 600);
    return {
      ok: res.ok,
      status: res.status,
      title: clean(title),
      description: clean(description),
      h1: clean(h1),
      contentType,
      bytes: received,
      snippet: clean(snippet),
    };
  } catch (e) {
    return { ok: false, status: 0, error: e.name === 'AbortError' ? 'timeout' : (e.code || e.message) };
  } finally {
    clearTimeout(timer);
  }
}
|
|
125
|
+
|
|
126
|
+
/**
 * Return the inner text of the first `<tag>…</tag>` pair in `html`
 * (case-insensitive, may span lines), or '' when there is none.
 */
function extractTag(html, tag) {
  const pattern = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'i');
  const found = html.match(pattern);
  if (found === null) return '';
  return found[1];
}
|
|
130
|
+
/**
 * Read the `content` of `<meta name="…">`, falling back to
 * `<meta property="og:…">`, or '' when neither carries a non-empty value.
 *
 * Attributes are parsed individually, so `content` may come before or after
 * `name`/`property`, and a value may contain the other kind of quote
 * (e.g. content="It's here").
 *
 * @param {string} html - Markup to search.
 * @param {string} name - Meta name, e.g. 'description'; matched case-insensitively.
 * @returns {string} The content value, or ''.
 */
function extractMeta(html, name) {
  const wanted = String(name).toLowerCase();
  const metas = (html.match(/<meta\b[^>]*>/gi) || []).map(readMetaAttributes);
  const pick = (attr, value) => metas.find((m) => m[attr]?.toLowerCase() === value && m.content);
  // A plain name= match anywhere wins over the Open Graph fallback.
  const hit = pick('name', wanted) || pick('property', `og:${wanted}`);
  return hit ? hit.content : '';
}

/** Parse the quoted attributes of one tag into a lowercase-keyed map (first occurrence wins). */
function readMetaAttributes(tag) {
  const attrs = Object.create(null);
  for (const [, key, doubleQuoted, singleQuoted] of tag.matchAll(/([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
    const k = key.toLowerCase();
    if (!(k in attrs)) attrs[k] = doubleQuoted ?? singleQuoted;
  }
  return attrs;
}
|
|
135
|
+
/**
 * Reduce markup to plain text: script and style elements are removed along
 * with their content, remaining tags become spaces, and whitespace is
 * collapsed and trimmed.
 */
function stripHtml(s) {
  const passes = [
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<[^>]+>/g,
    /\s+/g,
  ];
  let text = s;
  for (const pattern of passes) {
    text = text.replace(pattern, ' ');
  }
  return text.trim();
}
|
|
142
|
+
/** Collapse whitespace runs to single spaces, trim, and cap at 800 characters; falsy input gives ''. */
function clean(s) {
  const text = s || '';
  const collapsed = text.replace(/\s+/g, ' ').trim();
  return collapsed.slice(0, 800);
}
|
|
145
|
+
|
|
146
|
+
/**
 * Walk a directory recursively, returning files matching `extensions`.
 *
 * `skipDirs` entries are matched as whole path segments (or segment
 * sequences such as `_shared/references`) relative to `dir`. A substring
 * match on the absolute path would skip `.github` because it contains
 * `.git`, and would skip everything when the root itself sits under a path
 * containing one of the names.
 *
 * @param {string} dir - Root directory to walk.
 * @param {object} [options]
 * @param {string[]} [options.extensions] - Lowercase extensions (with dot) to include.
 * @param {string[]} [options.skipDirs] - Directory names / relative sub-paths to prune.
 * @returns {Promise<string[]>} Full paths of matching files; unreadable directories are ignored.
 */
export async function walkFiles(dir, { extensions = ['.yml','.yaml','.md','.txt','.json'], skipDirs = ['_shared/references','node_modules','.git'] } = {}) {
  const out = [];
  // Pre-wrap each skip entry in slashes so it can only match on segment boundaries.
  const needles = skipDirs
    .map(s => s.replaceAll('\\', '/').replace(/^\/+|\/+$/g, ''))
    .filter(Boolean)
    .map(s => `/${s}/`);
  const isSkipped = (full) => {
    const rel = path.relative(dir, full).replaceAll('\\', '/');
    const haystack = `/${rel}/`;
    return needles.some(n => haystack.includes(n));
  };
  async function walk(d) {
    let entries;
    // Best-effort: a directory that cannot be listed is treated as empty.
    try { entries = await fs.readdir(d, { withFileTypes: true }); } catch { return; }
    for (const e of entries) {
      const full = path.join(d, e.name);
      if (e.isDirectory()) {
        if (!isSkipped(full)) await walk(full);
      } else if (e.isFile()) {
        if (extensions.includes(path.extname(e.name).toLowerCase())) out.push(full);
      }
    }
  }
  await walk(dir);
  return out;
}
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// plugin/runners/pull-references.mjs
|
|
3
|
+
// Unified references pool. Scans Evidence/ for URLs, dedupes against an index,
|
|
4
|
+
// classifies by host, snapshots external URLs via HTTP. Auth-protected URLs
|
|
5
|
+
// (SP/Loop/Teams/ADO/CRM) are recorded with metadata only and marked
|
|
6
|
+
// `pending-auth-fetch` for follow-up by source-specific pulls.
|
|
7
|
+
//
|
|
8
|
+
// Project-shared. Not dated. One snapshot per URL. Re-crawl with --refresh.
|
|
9
|
+
//
|
|
10
|
+
// Usage:
|
|
11
|
+
// node plugin/runners/pull-references.mjs --project <P> [--refresh] [--dry-run]
|
|
12
|
+
// [--timeout-ms N] [--max-fetch N] [--only-host <h>]
|
|
13
|
+
|
|
14
|
+
import path from 'node:path';
|
|
15
|
+
import { promises as fs } from 'node:fs';
|
|
16
|
+
import YAML from 'yaml';
|
|
17
|
+
import { evidenceRoot, sharedRoot, projectRoot } from './lib/layout.mjs';
|
|
18
|
+
import { writeAtomic, pathExists } from './lib/evidence.mjs';
|
|
19
|
+
import { extractUrls, urlHash, normalizeUrl, classify, safeHost, fetchSnapshot, walkFiles } from './lib/references.mjs';
|
|
20
|
+
|
|
21
|
+
/**
 * Parse the runner's command-line flags.
 *
 * Numeric flags fall back to their defaults when the value is missing,
 * non-numeric or out of range. `--max-fetch 0` is valid and means "fetch
 * nothing this run"; `--timeout-ms` must be at least 1.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{dryRun: boolean, refresh: boolean, timeoutMs: number, maxFetch: number,
 *            project?: string, onlyHost?: string, help?: boolean}}
 */
function parseArgs(argv) {
  const args = { dryRun: false, refresh: false, timeoutMs: 15000, maxFetch: 50 };
  // Number('') is 0, so blank values are rejected explicitly before converting.
  const numberOr = (raw, fallback, min) => {
    if (typeof raw !== 'string' || raw.trim() === '') return fallback;
    const n = Number(raw);
    return Number.isFinite(n) && n >= min ? n : fallback;
  };
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--project') args.project = argv[++i];
    else if (a === '--refresh') args.refresh = true;
    else if (a === '--dry-run') args.dryRun = true;
    else if (a === '--timeout-ms') args.timeoutMs = numberOr(argv[++i], 15000, 1);
    else if (a === '--max-fetch') args.maxFetch = numberOr(argv[++i], 50, 0);
    else if (a === '--only-host') args.onlyHost = argv[++i];
    else if (a === '--help' || a === '-h') args.help = true;
  }
  return args;
}
|
|
35
|
+
|
|
36
|
+
/** One-line usage summary, printed for --help and when --project is missing. */
function help() {
  const usage = 'Usage: node pull-references.mjs --project <P> [--refresh] [--dry-run] [--timeout-ms N] [--max-fetch N] [--only-host <h>]';
  return usage;
}
|
|
39
|
+
|
|
40
|
+
/** Write one JSON document per line to stdout (the machine-readable result channel). */
function emit(obj) {
  const line = `${JSON.stringify(obj)}\n`;
  process.stdout.write(line);
}
|
|
41
|
+
/** Human-readable progress line on stderr, prefixed with the runner tag. */
function log(msg) {
  const line = `[references] ${msg}\n`;
  process.stderr.write(line);
}
|
|
42
|
+
|
|
43
|
+
/** Directory holding the references pool: `references/` under the project's shared root. */
function refsRoot(project) {
  const shared = sharedRoot(project);
  return path.join(shared, 'references');
}
|
|
44
|
+
/** Location of the pool's index file: `index.yml` inside refsRoot(project). */
function indexPath(project) {
  const base = refsRoot(project);
  return path.join(base, 'index.yml');
}
|
|
45
|
+
/** Path of one URL's record: `by-host/<safe host>/<hash>.md` inside refsRoot(project). */
function recordPath(project, host, hash) {
  const hostDir = safeHost(host);
  const fileName = `${hash}.md`;
  return path.join(refsRoot(project), 'by-host', hostDir, fileName);
}
|
|
48
|
+
|
|
49
|
+
/**
 * Load the references index, falling back to an empty index.
 *
 * Best-effort by design: a missing, unreadable or unparseable file yields a
 * fresh `{ version: 1, entries: {} }` rather than an error. A document that
 * parses to something other than a mapping (a list or scalar) is treated the
 * same way, and a non-mapping `entries` value is reset, so later writes
 * keyed by hash are not silently dropped when the index is serialised.
 *
 * @param {string} p - Path to index.yml.
 * @returns {Promise<{version: number, entries: object}>}
 */
async function loadIndex(p) {
  const empty = () => ({ version: 1, entries: {} });
  const isMapping = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);
  if (!await pathExists(p)) return empty();
  try {
    const parsed = YAML.parse(await fs.readFile(p, 'utf8'));
    if (!isMapping(parsed)) return empty();
    // Leave a missing `entries` alone; only repair a present but unusable one.
    if (parsed.entries != null && !isMapping(parsed.entries)) parsed.entries = {};
    return parsed;
  } catch {
    return empty();
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Render one reference record: YAML front matter followed by a short body.
 *
 * The body is the snapshot snippet when there is one; otherwise a note
 * saying the fetch is deferred (auth-required URLs) or that nothing was
 * captured. At most 20 source files are listed.
 *
 * @param {object} rec
 * @param {string} rec.url
 * @param {string} rec.host
 * @param {string} rec.kind
 * @param {boolean} rec.authRequired
 * @param {string} rec.firstSeen
 * @param {string[]} rec.sourceFiles
 * @param {object|null} rec.snapshot - Result of a fetch, or null when not fetched.
 * @returns {string} Markdown document text.
 */
function recordTemplate({ url, host, kind, authRequired, firstSeen, sourceFiles, snapshot }) {
  const unfetchedStatus = authRequired ? 'pending-auth-fetch' : 'unfetched';
  const frontMatter = YAML.stringify({
    url,
    normalized_url: normalizeUrl(url),
    host,
    kind,
    auth_required: authRequired,
    first_seen: firstSeen,
    last_crawled: snapshot?.crawledAt || null,
    fetch_status: snapshot?.fetch_status || unfetchedStatus,
    http_status: snapshot?.status ?? null,
    title: snapshot?.title || '',
    description: snapshot?.description || '',
    source_files: sourceFiles.slice(0, 20),
  }).trimEnd();

  let body;
  if (snapshot?.snippet) {
    body = `\n## Snippet\n\n${snapshot.snippet}\n`;
  } else if (authRequired) {
    body = `\n_Content fetch deferred — this URL requires Microsoft 365 authentication. The matching source-specific pull (sharepoint/loop/teams/ado/crm) will populate richer evidence._\n`;
  } else {
    body = `\n_No snapshot captured._\n`;
  }
  return ['---', frontMatter, '---', body].join('\n');
}
|
|
78
|
+
|
|
79
|
+
/**
 * Runner entry point: scan the project's Evidence tree for URLs, merge them
 * into the references index, snapshot the fetchable ones over HTTP and write
 * one record file per new or refreshed URL.
 *
 * A previously crawled record is never overwritten unless this run actually
 * re-fetched the URL. With --refresh, URLs beyond the --max-fetch cap would
 * otherwise have their captured snippet replaced by the empty template.
 *
 * NOTE(review): --dry-run suppresses file writes only; HTTP fetches still run.
 * NOTE(review): indexPath()/recordPath() are given the resolved project root,
 * not the project name — confirm that is what sharedRoot() expects.
 *
 * @returns {Promise<number>} 0 on success or --help; 2 on usage or layout errors.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) { console.log(help()); return 0; }
  if (!args.project) { console.error(help()); emit({ status: 'failed', error: 'required: --project' }); return 2; }

  const root = projectRoot(args.project);
  if (!await pathExists(root)) { emit({ status: 'failed', error: `project-not-bootstrapped: ${root}` }); return 2; }

  const evRoot = evidenceRoot(root);
  if (!await pathExists(evRoot)) { emit({ status: 'failed', error: `evidence-missing: ${evRoot}` }); return 2; }

  log(`scanning ${evRoot} for URLs...`);
  const files = await walkFiles(evRoot);
  log(`scanning ${files.length} file(s)...`);

  /** url(normalized) → { url, sourceFiles:Set } */
  const found = new Map();
  for (const f of files) {
    let txt = '';
    // Unreadable files are skipped rather than failing the whole scan.
    try { txt = await fs.readFile(f, 'utf8'); } catch { continue; }
    for (const u of extractUrls(txt)) {
      const k = normalizeUrl(u);
      const rel = path.relative(root, f);
      if (!found.has(k)) found.set(k, { url: u, sourceFiles: new Set() });
      found.get(k).sourceFiles.add(rel);
    }
  }
  log(`found ${found.size} unique URL(s)`);

  const idxFile = indexPath(root);
  const index = await loadIndex(idxFile);
  index.entries ??= {};

  const today = new Date().toISOString().slice(0, 10);
  const tasks = [];
  let newCount = 0, refreshCount = 0, skippedCount = 0;

  for (const [normalized, { url, sourceFiles }] of found.entries()) {
    if (args.onlyHost) {
      const c = classify(url);
      if (c.host !== args.onlyHost) { skippedCount++; continue; }
    }
    const hash = urlHash(url);
    const existing = index.entries[hash];
    const isNew = !existing;
    const needsRefresh = !isNew && args.refresh;
    if (!isNew && !needsRefresh) {
      // Update source_files only.
      const merged = new Set([...(existing.source_files || []), ...sourceFiles]);
      existing.source_files = [...merged].slice(0, 20);
      existing.last_seen = today;
      skippedCount++;
      continue;
    }
    const c = classify(url);
    tasks.push({ url, normalized, hash, sourceFiles: [...sourceFiles], cls: c, isNew });
    if (isNew) newCount++; else refreshCount++;
  }

  // Cap fetches per run.
  const fetchable = tasks.filter(t => !t.cls.authRequired && t.cls.kind !== 'invalid').slice(0, args.maxFetch);
  const fetchKeys = new Set(fetchable.map(t => t.hash));
  log(`new: ${newCount}, refresh: ${refreshCount}, skip-existing: ${skippedCount}, will fetch: ${fetchable.length}`);

  let written = 0, fetched = 0, fetchOk = 0;
  for (const t of tasks) {
    let snap = null;
    if (fetchKeys.has(t.hash)) {
      log(` fetch ${t.cls.host}: ${t.url.slice(0, 100)}`);
      snap = await fetchSnapshot(t.url, { timeoutMs: args.timeoutMs });
      snap.crawledAt = today;
      snap.fetch_status = snap.ok ? 'fetched' : `fetch-failed:${snap.error || snap.status}`;
      fetched++;
      if (snap.ok) fetchOk++;
    }
    const prior = index.entries[t.hash];
    const firstSeen = prior?.first_seen || today;
    const recPath = recordPath(root, t.cls.host, t.hash);
    // Not fetched this run, but an earlier crawl left a record on disk: keep
    // it rather than replacing it with the empty template.
    const keepExistingRecord = !snap && Boolean(prior?.last_crawled) && await pathExists(recPath);
    if (!args.dryRun && !keepExistingRecord) {
      const record = recordTemplate({
        url: t.url,
        host: t.cls.host,
        kind: t.cls.kind,
        authRequired: t.cls.authRequired,
        firstSeen,
        sourceFiles: t.sourceFiles,
        snapshot: snap,
      });
      const r = await writeAtomic(recPath, record, { skipIfUnchanged: true });
      if (r.written) written++;
    }
    // The index entry carries forward prior crawl metadata when nothing new was fetched.
    index.entries[t.hash] = {
      url: t.url,
      normalized_url: t.normalized,
      host: t.cls.host,
      kind: t.cls.kind,
      auth_required: t.cls.authRequired,
      first_seen: firstSeen,
      last_seen: today,
      last_crawled: snap?.crawledAt || prior?.last_crawled || null,
      fetch_status: snap?.fetch_status || prior?.fetch_status || (t.cls.authRequired ? 'pending-auth-fetch' : 'unfetched'),
      http_status: snap?.status ?? prior?.http_status ?? null,
      title: snap?.title || prior?.title || '',
      record_path: path.relative(root, recPath).replaceAll('\\', '/'),
      source_files: t.sourceFiles.slice(0, 20),
    };
  }

  if (!args.dryRun) {
    await writeAtomic(idxFile, YAML.stringify(index), { skipIfUnchanged: true });
  }

  emit({
    status: 'ok',
    project: root,
    dry_run: args.dryRun,
    scanned_files: files.length,
    urls_total: found.size,
    new: newCount,
    refresh: refreshCount,
    fetched,
    fetch_ok: fetchOk,
    written,
    index: path.relative(root, idxFile).replaceAll('\\', '/'),
  });
  return 0;
}
|
|
205
|
+
|
|
206
|
+
// Script entry: run main() and turn its outcome into the process exit code.
// A thrown error is reported as a `failed` JSON line on stdout and exits 1.
try {
  const exitCode = await main();
  process.exit(exitCode || 0);
} catch (e) {
  emit({ status: 'failed', error: e.message || String(e) });
  process.exit(1);
}
|