kushi-agents 6.0.1 → 6.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/package.json +61 -61
  2. package/plugin/runners/discover.mjs +52 -11
  3. package/plugin/runners/lib/csc-pull.mjs +415 -0
  4. package/plugin/runners/lib/workiq.mjs +5 -2
  5. package/plugin/runners/pull-email.mjs +3 -208
  6. package/plugin/runners/pull-meetings.mjs +4 -207
  7. package/plugin/runners/pull-onenote.mjs +3 -239
  8. package/plugin/runners/pull-sharepoint.mjs +3 -196
  9. package/plugin/runners/pull-teams.mjs +3 -170
  10. package/plugin/runners/test/fixtures/csc-captured.json +5 -0
  11. package/plugin/runners/test/fixtures/csc-citation-tokens.json +5 -0
  12. package/plugin/runners/test/fixtures/csc-empty.json +5 -0
  13. package/plugin/runners/test/fixtures/csc-novel-error.json +5 -0
  14. package/plugin/runners/test/fixtures/csc-throttled.json +5 -0
  15. package/plugin/runners/test/fixtures/refresh-dir/email.json +5 -16
  16. package/plugin/runners/test/fixtures/refresh-dir/teams.json +5 -12
  17. package/plugin/runners/test/integration/csc-pull.integration.test.mjs +160 -0
  18. package/plugin/runners/test/unit/csc-pull.test.mjs +96 -0
  19. package/plugin/runners/test/fixtures/email-abn-amro.json +0 -13
  20. package/plugin/runners/test/fixtures/email-novel-error.json +0 -9
  21. package/plugin/runners/test/fixtures/meetings-abn-amro.json +0 -10
  22. package/plugin/runners/test/fixtures/meetings-body-unavailable.json +0 -10
  23. package/plugin/runners/test/fixtures/onenote-abn-amro.json +0 -30
  24. package/plugin/runners/test/fixtures/onenote-partial.json +0 -21
  25. package/plugin/runners/test/fixtures/sharepoint-abn-amro.json +0 -12
  26. package/plugin/runners/test/fixtures/teams-abn-amro.json +0 -11
  27. package/plugin/runners/test/integration/pull-email.integration.test.mjs +0 -149
  28. package/plugin/runners/test/integration/pull-meetings.integration.test.mjs +0 -92
  29. package/plugin/runners/test/integration/pull-onenote.integration.test.mjs +0 -86
  30. package/plugin/runners/test/integration/pull-sharepoint.integration.test.mjs +0 -93
  31. package/plugin/runners/test/integration/pull-teams.integration.test.mjs +0 -91
package/package.json CHANGED
@@ -1,61 +1,61 @@
1
- {
2
- "name": "kushi-agents",
3
- "version": "6.0.1",
4
- "description": "Install Kushi — multi-source project evidence agent with Comprehensive Structured Capture (CSC) into weekly-only files across Email, Teams, OneNote, Loop, SharePoint, Meetings, CRM, ADO. Meetings retain a sibling verbatim/ audit folder. WorkIQ-only for M365 sources (Graph / m365_* FORBIDDEN as fallbacks; user-paste is first-class). Host-agnostic.",
5
- "type": "module",
6
- "bin": {
7
- "kushi": "./bin/kushi.mjs",
8
- "kushi-agents": "./bin/kushi-agents.mjs"
9
- },
10
- "files": [
11
- "bin/",
12
- "src/",
13
- "plugin/",
14
- ".github/copilot-instructions.kushi.md"
15
- ],
16
- "engines": {
17
- "node": ">=18.0.0"
18
- },
19
- "dependencies": {
20
- "@azure/identity": "^4.5.0",
21
- "@mozilla/readability": "^0.6.0",
22
- "jsdom": "^29.1.1",
23
- "jsonc-parser": "^3.3.1",
24
- "yaml": "^2.6.0"
25
- },
26
- "keywords": [
27
- "vscode",
28
- "copilot",
29
- "agents",
30
- "kushi",
31
- "project-evidence",
32
- "workiq",
33
- "m365",
34
- "ai",
35
- "cli"
36
- ],
37
- "repository": {
38
- "type": "git",
39
- "url": "git+https://github.com/gim-home/kushi.git"
40
- },
41
- "homepage": "https://gim-home.github.io/kushi/",
42
- "bugs": {
43
- "url": "https://github.com/gim-home/kushi/issues"
44
- },
45
- "license": "MIT",
46
- "scripts": {
47
- "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs src/eval-aggregator.test.mjs src/eval-runner.test.mjs src/hooks-dispatcher.test.mjs src/parallel-refresh.test.mjs src/otel-emit.test.mjs src/doctor.test.mjs src/setup-wizard.test.mjs src/cli-no-args.test.mjs src/cli-no-args-tty.test.mjs src/per-user-files.test.mjs src/layout-portable.test.mjs src/profile-coverage.test.mjs src/get-kushi-config.test.mjs src/seed-config-derived.test.mjs src/resolve-alias.test.mjs plugin/runners/test/unit/*.test.mjs",
48
- "test:runners": "node --test plugin/runners/test/unit/*.test.mjs",
49
- "test:runners:integration": "node --test plugin/runners/test/integration/*.test.mjs",
50
- "test:integration:bootstrap": "node src/bootstrap-dryrun.integration.test.mjs",
51
- "smoke": "node scripts/smoke.mjs",
52
- "eval": "pwsh plugin/skills/eval/run-evals.ps1 -Skill",
53
- "eval:all": "pwsh plugin/skills/eval/run-evals.ps1 -All",
54
- "eval:canary": "pwsh plugin/skills/eval/run-evals.ps1 -Canary",
55
- "eval:baseline": "pwsh plugin/skills/eval/run-evals.ps1 -All -UpdateBaseline",
56
- "prepublishOnly": "npm test && npm run smoke"
57
- },
58
- "publishConfig": {
59
- "access": "public"
60
- }
61
- }
1
+ {
2
+ "name": "kushi-agents",
3
+ "version": "6.1.1",
4
+ "description": "Install Kushi — multi-source project evidence agent with Comprehensive Structured Capture (CSC) into weekly-only files across Email, Teams, OneNote, Loop, SharePoint, Meetings, CRM, ADO. Meetings retain a sibling verbatim/ audit folder. WorkIQ-only for M365 sources (Graph / m365_* FORBIDDEN as fallbacks; user-paste is first-class). Host-agnostic.",
5
+ "type": "module",
6
+ "bin": {
7
+ "kushi": "./bin/kushi.mjs",
8
+ "kushi-agents": "./bin/kushi-agents.mjs"
9
+ },
10
+ "files": [
11
+ "bin/",
12
+ "src/",
13
+ "plugin/",
14
+ ".github/copilot-instructions.kushi.md"
15
+ ],
16
+ "engines": {
17
+ "node": ">=18.0.0"
18
+ },
19
+ "dependencies": {
20
+ "@azure/identity": "^4.5.0",
21
+ "@mozilla/readability": "^0.6.0",
22
+ "jsdom": "^29.1.1",
23
+ "jsonc-parser": "^3.3.1",
24
+ "yaml": "^2.6.0"
25
+ },
26
+ "keywords": [
27
+ "vscode",
28
+ "copilot",
29
+ "agents",
30
+ "kushi",
31
+ "project-evidence",
32
+ "workiq",
33
+ "m365",
34
+ "ai",
35
+ "cli"
36
+ ],
37
+ "repository": {
38
+ "type": "git",
39
+ "url": "git+https://github.com/gim-home/kushi.git"
40
+ },
41
+ "homepage": "https://gim-home.github.io/kushi/",
42
+ "bugs": {
43
+ "url": "https://github.com/gim-home/kushi/issues"
44
+ },
45
+ "license": "MIT",
46
+ "scripts": {
47
+ "test": "node --test src/check-workiq.test.mjs src/seed-config.test.mjs src/sanitize-workiq-input.test.mjs src/detect-vertex-repo.test.mjs src/vertex-validate.test.mjs src/emit-vertex.e2e.test.mjs src/config-root-resolve.test.mjs src/forbidden-workiq-phrasings.test.mjs src/multi-host-install.test.mjs src/eval-aggregator.test.mjs src/eval-runner.test.mjs src/hooks-dispatcher.test.mjs src/parallel-refresh.test.mjs src/otel-emit.test.mjs src/doctor.test.mjs src/setup-wizard.test.mjs src/cli-no-args.test.mjs src/cli-no-args-tty.test.mjs src/per-user-files.test.mjs src/layout-portable.test.mjs src/profile-coverage.test.mjs src/get-kushi-config.test.mjs src/seed-config-derived.test.mjs src/resolve-alias.test.mjs plugin/runners/test/unit/*.test.mjs",
48
+ "test:runners": "node --test plugin/runners/test/unit/*.test.mjs",
49
+ "test:runners:integration": "node --test plugin/runners/test/integration/*.test.mjs",
50
+ "test:integration:bootstrap": "node src/bootstrap-dryrun.integration.test.mjs",
51
+ "smoke": "node scripts/smoke.mjs",
52
+ "eval": "pwsh plugin/skills/eval/run-evals.ps1 -Skill",
53
+ "eval:all": "pwsh plugin/skills/eval/run-evals.ps1 -All",
54
+ "eval:canary": "pwsh plugin/skills/eval/run-evals.ps1 -Canary",
55
+ "eval:baseline": "pwsh plugin/skills/eval/run-evals.ps1 -All -UpdateBaseline",
56
+ "prepublishOnly": "npm test && npm run smoke"
57
+ },
58
+ "publishConfig": {
59
+ "access": "public"
60
+ }
61
+ }
@@ -162,7 +162,7 @@ function isPlaceholder(v) {
162
162
  if (!s) return true;
163
163
  if (/^<.*>$/.test(s)) return true; // <value>, <chat_id>, etc.
164
164
  if (/^turn\d+search\d+$/i.test(s)) return true; // WorkIQ web citation tokens
165
- if (/^(unknown|n\/a|none|null|tbd|todo)$/i.test(s)) return true;
165
+ if (/^(unknown|n\/a|none|null|tbd|todo|not\s+explicitly\s+(available|provided|specified)|not\s+(available|provided|specified|applicable)|undisclosed)$/i.test(s)) return true;
166
166
  return false;
167
167
  }
168
168
 
@@ -178,19 +178,26 @@ function isValidValueFor(source, field, raw) {
178
178
  return true;
179
179
  }
180
180
  if (source === 'teams' && field === 'chat_id') {
181
- // Graph chat IDs are long strings ending in '@thread.v2' or similar.
182
- return v.includes('@thread') || v.length > 30;
181
+ // Accept Graph chat IDs ('@thread' / long opaque) OR human-readable
182
+ // chat topics; WorkIQ frequently returns the topic when the Graph ID
183
+ // is unavailable. Refresh stage will resolve to a real chat.
184
+ return v.length >= 2 && v.length <= 200;
183
185
  }
184
186
  if (source === 'meetings' && field === 'join_url') {
185
- return /^https:\/\/teams\.microsoft\.com\/.+meetup-join/i.test(v);
187
+ // Prefer real Teams meetup URLs but also accept any https URL or
188
+ // a meeting subject — refresh stage will resolve.
189
+ if (v.startsWith('http')) return true;
190
+ return v.length >= 3 && v.length <= 200;
186
191
  }
187
192
  if (source === 'onenote' && field === 'section_file_id') {
188
- // OneNote section file IDs are hex strings, typically prefixed `0-` and
189
- // dozens of chars long. Reject short citation tokens.
190
- return /^[0-9a-f][0-9a-f\-]{20,}$/i.test(v) || v.startsWith('0-');
193
+ // Accept hex section IDs OR section/page names. WorkIQ often only has
194
+ // the human-readable name; refresh resolves to the file ID.
195
+ return v.length >= 2 && v.length <= 200;
191
196
  }
192
197
  if (source === 'sharepoint' && field === 'site_url') {
193
- return v.startsWith('https://') && v.includes('.sharepoint.com');
198
+ // Prefer real SharePoint URLs; also accept site names/relative paths.
199
+ if (v.startsWith('http')) return v.includes('.sharepoint.com') || v.includes('/sites/');
200
+ return v.length >= 2 && v.length <= 300;
194
201
  }
195
202
  if (source === 'crm' && (field === 'request_id' || field === 'incident_number')) {
196
203
  // CRM IDs vary: FE-2026-001458, REQ-12345, INC-9, plain numerics. Reject
@@ -228,7 +235,16 @@ function applyRows(source, rows, currentBounds, currentInteg) {
228
235
  }
229
236
  if (source === 'teams') {
230
237
  const existing = currentBounds.teams?.chats || [];
231
- const incoming = rows.map(r => r.chat_id).filter(v => isValidValueFor('teams', 'chat_id', v));
238
+ // WorkIQ rarely has Graph chat IDs; citation tokens like "turn1search1"
239
+ // are common. Prefer chat_id; fall back to topic so refresh has a usable
240
+ // boundary descriptor instead of nothing.
241
+ const incoming = rows.map(r => {
242
+ const id = r.chat_id;
243
+ if (id && !isPlaceholder(id) && isValidValueFor('teams', 'chat_id', id)) return id;
244
+ const topic = r.topic;
245
+ if (topic && !isPlaceholder(topic)) return topic;
246
+ return null;
247
+ }).filter(Boolean);
232
248
  const merged = dedup([...existing, ...incoming]);
233
249
  const added = merged.filter(v => !existing.includes(v));
234
250
  if (added.length) accepted.push(...added);
@@ -236,7 +252,13 @@ function applyRows(source, rows, currentBounds, currentInteg) {
236
252
  }
237
253
  if (source === 'meetings') {
238
254
  const existing = currentBounds.meetings?.joinUrls || [];
239
- const incoming = rows.map(r => r.join_url).filter(v => isValidValueFor('meetings', 'join_url', v));
255
+ const incoming = rows.map(r => {
256
+ const url = r.join_url;
257
+ if (url && !isPlaceholder(url) && isValidValueFor('meetings', 'join_url', url) && url.startsWith('http')) return url;
258
+ const subj = r.subject;
259
+ if (subj && !isPlaceholder(subj)) return subj;
260
+ return null;
261
+ }).filter(Boolean);
240
262
  const merged = dedup([...existing, ...incoming]);
241
263
  const added = merged.filter(v => !existing.includes(v));
242
264
  if (added.length) accepted.push(...added);
@@ -244,7 +266,16 @@ function applyRows(source, rows, currentBounds, currentInteg) {
244
266
  }
245
267
  if (source === 'onenote') {
246
268
  const existing = currentBounds.onenote?.section_file_ids || [];
247
- const incoming = rows.map(r => r.section_file_id).filter(v => isValidValueFor('onenote', 'section_file_id', v));
269
+ // WorkIQ rarely has section_file_id (Graph property); usually returns
270
+ // citation tokens. Prefer real hex IDs; fall back to section_name so
271
+ // refresh has a usable boundary descriptor.
272
+ const incoming = rows.map(r => {
273
+ const id = r.section_file_id;
274
+ if (id && !isPlaceholder(id) && /^[0-9a-f][0-9a-f\-]{20,}$/i.test(String(id).trim())) return id;
275
+ const name = r.section_name;
276
+ if (name && !isPlaceholder(name)) return name;
277
+ return null;
278
+ }).filter(Boolean);
248
279
  const merged = dedup([...existing, ...incoming]);
249
280
  const added = merged.filter(v => !existing.includes(v));
250
281
  if (added.length) accepted.push(...added);
@@ -391,6 +422,16 @@ async function main() {
391
422
  rows = rowsFromBlocks(blocks, source);
392
423
  const elapsed = Date.now() - t0;
393
424
  const bytes = Buffer.byteLength(workiqStdout || '', 'utf8');
425
+ // Persist raw WorkIQ output for diagnosis. Lets users inspect why a
426
+ // source returned 0 accepted rows without re-running the query.
427
+ if (!args.dryRun) {
428
+ try {
429
+ const discoveryDir = path.join(aliasRoot(args.project, args.alias), '_discovery');
430
+ await fs.mkdir(discoveryDir, { recursive: true });
431
+ const header = `# discover ${source} @ ${new Date().toISOString()}\n# elapsed=${elapsed}ms bytes=${bytes} blocks=${blocks.length} rows=${rows.length}\n# prompt:\n${prompt.split('\n').map(l => '# ' + l).join('\n')}\n# --- workiq stdout ---\n`;
432
+ await fs.writeFile(path.join(discoveryDir, `${source}-raw.txt`), header + (workiqStdout || ''), 'utf8');
433
+ } catch { /* best-effort */ }
434
+ }
394
435
  if (rows.length === 0 && bytes < 8) {
395
436
  // Distinguish "workiq returned empty" from "timeout" — both used to look the same.
396
437
  skipReason = 'workiq-empty-response';
@@ -0,0 +1,415 @@
1
+ // plugin/runners/lib/csc-pull.mjs
2
+ // Shared deterministic pull pipeline for the 5 WorkIQ-only M365 sources
3
+ // (email, teams, meetings, onenote, sharepoint).
4
+ //
5
+ // Per `workiq-only.instructions.md` (HARD RULE, kushi v3.11.0+):
6
+ // - WorkIQ is the ONLY path. Graph REST is FORBIDDEN.
7
+ // - On WorkIQ failure: write deferred-retry marker, continue.
8
+ //
9
+ // Doctrine references:
10
+ // - `comprehensive-structured-capture.instructions.md` (CSC block shape)
11
+ // - `weekly-csc.instructions.md` (weekly/<week>_<source>-csc.md + _index/)
12
+ // - `meetings-verbatim-required.instructions.md` (transcript.txt parallel)
13
+ //
14
+ // Output layout (per source per week):
15
+ // Evidence/<alias>/<source>/weekly/<YYYY-MM-DD>_<source>-csc.md
16
+ // Evidence/<alias>/<source>/_index/entities.yml
17
+ //
18
+ // Each puller is a thin wrapper that calls pullSource() with its source name
19
+ // and a per-source prompt builder.
20
+
21
+ import path from 'node:path';
22
+ import { promises as fs } from 'node:fs';
23
+ import YAML from 'yaml';
24
+ import { sourceDir, aliasRoot } from './layout.mjs';
25
+ import { writeAtomic, safeSegment, pathExists } from './evidence.mjs';
26
+ import { ask as workiqAsk, resolveWorkiqBin } from './workiq.mjs';
27
+ import { loadM365Auth, scopeForSource } from './m365-auth.mjs';
28
+ import { updateCell } from './ledger.mjs';
29
+ import { appendRunLog } from './runlog.mjs';
30
+ import { enqueue, clear } from './deferred.mjs';
31
+ import { emitLearningCandidate } from './learnings.mjs';
32
+ import { currentIsoMonday, ymd, parseYmd } from './weeks.mjs';
33
+
34
/**
 * Derive the reporting window for the week that begins on `weekStartYmd`.
 *
 * The upper bound is the parsed start advanced by seven calendar days using
 * `setDate`, i.e. in the runtime's local time zone. `toYmd` is formatted from
 * the instant one millisecond before that bound, so it names the last day that
 * still falls inside the window.
 *
 * NOTE(review): whether `parseYmd` / `ymd` operate in UTC or local time is not
 * visible from this module — confirm they agree with the local-time arithmetic.
 *
 * @param {string} weekStartYmd - Week-start date string; echoed back as `fromYmd`.
 * @returns {{fromIso: string, toIso: string, fromYmd: string, toYmd: string}}
 */
export function weekBounds(weekStartYmd) {
  const windowStart = parseYmd(weekStartYmd);
  const windowEnd = new Date(windowStart);
  windowEnd.setDate(windowEnd.getDate() + 7);

  const fromIso = windowStart.toISOString();
  const toIso = windowEnd.toISOString();
  const lastInstant = new Date(windowEnd.getTime() - 1);

  return { fromIso, toIso, fromYmd: weekStartYmd, toYmd: ymd(lastInstant) };
}
46
+
47
/** Response contract appended to every pull prompt: the CSC block shape WorkIQ must return. */
const CSC_RESPONSE_CONTRACT = [
  '',
  'Return ONLY structured-capture blocks of this exact shape:',
  '> [block: csc]',
  '> entity_id: <stable id>',
  '> display_name: <human-readable title>',
  '> last_touched: <ISO timestamp>',
  '> participants: <comma-separated list>',
  '> topics: <comma-separated short phrases>',
  '> decisions: <comma-separated, or "_none_">',
  '> action_items: <semicolon-separated "<owner> | <due> | <text>" tuples, or "_none_">',
  '> open_questions: <comma-separated, or "_none_">',
  '> risks: <comma-separated, or "_none_">',
  '> next_steps: <comma-separated, or "_none_">',
  '> summary: <one-sentence narrative>',
  '',
  'One block per entity touched in the week. No prose, no commentary.',
  'If nothing was touched, return an empty response.',
  'Skip web-search citation tokens like "turn1search5".',
];

/**
 * Build the canonical CSC prompt for a per-entity weekly pull.
 * Mirrors the doctrine in workiq-only.instructions.md § "CSC canonical prompts".
 *
 * The entity may be a real identifier (join URL, hex section file ID) or a
 * human-readable descriptor (meeting subject, section name). The wording of
 * the first line follows the entity's shape so the prompt never calls a
 * subject a "join URL" or a section name a "section_file_id". Prompts for URL
 * and hex-ID entities are unchanged.
 *
 * @param {object} params
 * @param {string} params.source - One of email, teams, meetings, onenote, sharepoint.
 * @param {string} params.project - Project name quoted in the email prompt.
 * @param {string} params.entity - Folder, chat, meeting, section or site to pull.
 * @param {string} params.weekStart - Week-start date string passed to `weekBounds`.
 * @param {object} [params.scope] - Optional hints: `includeSubfolders`, `dateFloor`, `notebookName`.
 * @param {object} [params.opts] - Accepted for interface compatibility; currently unused.
 * @returns {string} Newline-joined prompt text.
 * @throws {Error} When `source` is not one of the five supported sources.
 */
export function buildPullPrompt({ source, project, entity, weekStart, scope, opts = {} }) {
  const { fromYmd, toYmd } = weekBounds(weekStart);
  const target = String(entity);
  const lines = [];

  if (source === 'email') {
    lines.push(`Find all emails in Outlook folder "${entity}" related to project "${project}" between ${fromYmd} and ${toYmd}, inclusive.`);
    if (scope?.includeSubfolders !== false) lines.push(`Include every nested subfolder beneath "${entity}".`);
    if (scope?.dateFloor) lines.push(`Hard date floor: ${scope.dateFloor} (do not consider mail older than this).`);
    lines.push('Group messages by conversationId. One CSC block per conversation touched in the week.');
  } else if (source === 'teams') {
    lines.push(`Find all Microsoft Teams messages in chat / channel "${entity}" between ${fromYmd} and ${toYmd}, inclusive.`);
    if (scope?.dateFloor) lines.push(`Hard date floor: ${scope.dateFloor}.`);
    lines.push('One CSC block for the chat thread (the whole entity), summarizing all messages in the week.');
  } else if (source === 'meetings') {
    // Only describe the entity as a join URL when it actually is a URL.
    const matchBy = /^https?:\/\//i.test(target) ? 'join URL' : 'subject';
    lines.push(`Find the Teams meeting whose ${matchBy} is "${entity}" with occurrence between ${fromYmd} and ${toYmd}, inclusive.`);
    lines.push('One CSC block per occurrence touched in the week.');
  } else if (source === 'onenote') {
    // Hex section IDs (optionally `0-` prefixed) are addressed by ID; anything else by name.
    const looksLikeFileId = /^[0-9a-f][0-9a-f-]{20,}$/i.test(target.trim()) || target.startsWith('0-');
    const sectionRef = looksLikeFileId ? 'section_file_id' : 'the section named';
    lines.push(`Find OneNote pages in ${sectionRef} "${entity}" modified between ${fromYmd} and ${toYmd}, inclusive.`);
    if (scope?.notebookName) lines.push(`Restrict to notebook "${scope.notebookName}".`);
    lines.push('One CSC block per page touched in the week.');
  } else if (source === 'sharepoint') {
    lines.push(`Find SharePoint files within site "${entity}" modified between ${fromYmd} and ${toYmd}, inclusive.`);
    lines.push('One CSC block per file touched in the week.');
  } else {
    throw new Error(`csc-pull: unsupported source "${source}"`);
  }

  lines.push(...CSC_RESPONSE_CONTRACT);
  return lines.join('\n');
}
98
+
99
/**
 * Build the verbatim transcript prompt for meetings (alongside the CSC block).
 * Per meetings-verbatim-required.instructions.md.
 *
 * The meeting is referred to by "join URL" only when `entity` is an http(s)
 * URL; any other value is presented as the meeting subject. URL entities
 * produce exactly the same prompt as before.
 *
 * @param {object} params
 * @param {string} params.entity - Join URL or subject identifying the meeting.
 * @param {string} params.weekStart - Week-start date string passed to `weekBounds`.
 * @returns {string} Newline-joined prompt text.
 */
export function buildVerbatimTranscriptPrompt({ entity, weekStart }) {
  const { fromYmd, toYmd } = weekBounds(weekStart);
  const matchBy = /^https?:\/\//i.test(String(entity)) ? 'join URL' : 'subject';
  return [
    `For the Teams meeting with ${matchBy} "${entity}", occurrence between ${fromYmd} and ${toYmd}:`,
    'Return the full verbatim transcript with timestamps. Do NOT summarize. Do NOT paraphrase.',
    'If no transcript is available (meeting did not record, or transcription was off), return exactly:',
    '> [block: transcript-unavailable]',
    '> reason: <short explanation>',
    '',
    'If a transcript exists, return ONLY the verbatim text — no prose, no headers.',
  ].join('\n');
}
115
+
116
/**
 * Read a --fixture file and return its parsed content.
 * Fixture is a JSON file with shape:
 *   { "stdout": "...", "stderr": "", "exitCode": 0 }   // success
 *   { "exitCode": 1, "stderr": "throttled" }           // failure
 *   { "timeout": true }                                // timeout simulation
 *
 * A leading UTF-8 byte-order mark is stripped before parsing, because
 * `JSON.parse` rejects it. Parse failures and non-object roots are reported
 * with the fixture path so the caller's error message is actionable.
 *
 * @param {string} p - Path to the fixture file.
 * @returns {Promise<object>} The parsed fixture object.
 * @throws {Error} When the file cannot be read, is not valid JSON, or does not
 *   contain a JSON object at the root.
 */
async function loadFixture(p) {
  const txt = await fs.readFile(p, 'utf8');
  let parsed;
  try {
    parsed = JSON.parse(txt.replace(/^\uFEFF/, ''));
  } catch (cause) {
    throw new Error(`csc-pull: fixture "${p}" is not valid JSON: ${cause.message}`, { cause });
  }
  // The caller reads fx.timeout / fx.exitCode, so the root must be an object.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`csc-pull: fixture "${p}" must contain a JSON object`);
  }
  return parsed;
}
127
+
128
/**
 * Parse CSC blockquote blocks from WorkIQ stdout. Returns array of
 *   { entity_id, display_name, last_touched, raw, fields }
 * Blocks without entity_id are dropped (low-signal).
 * Web-citation tokens like turn1search5 and `<placeholder>` values in
 * entity_id are also dropped.
 *
 * The text is scanned line by line rather than with one multi-line regex so
 * that:
 *   - CRLF output parses (a trailing `\r` used to defeat the field pattern and
 *     silently discard every block);
 *   - consecutive blocks with no blank line between them stay separate instead
 *     of being merged into the first block;
 *   - a block ends at the first line that is not a `>` quote line.
 *
 * @param {string} text - Raw WorkIQ stdout.
 * @returns {Array<{entity_id: string, display_name: string, last_touched: (string|null), raw: string, fields: Object<string, string>}>}
 */
export function parseEntityBlocks(text) {
  if (!text || typeof text !== 'string') return [];

  const headerRe = /^>\s*\[block:\s*csc\]\s*$/;
  const fieldRe = /^([a-zA-Z0-9_.-]+)\s*:\s*(.*)$/;
  const out = [];
  let openBody = null; // body lines of the block being read, or null outside a block

  const closeBlock = () => {
    if (openBody === null) return;
    const bodyLines = openBody;
    openBody = null;

    const fields = {};
    for (const line of bodyLines) {
      const hit = fieldRe.exec(line);
      if (hit) fields[hit[1]] = hit[2].trim();
    }

    const id = fields.entity_id;
    if (!id) return;
    if (/^turn\d+search\d+$/i.test(id)) return;
    if (/^<.*>$/.test(id)) return;

    out.push({
      entity_id: id,
      display_name: fields.display_name || id,
      last_touched: fields.last_touched || null,
      raw: bodyLines.join('\n'),
      fields,
    });
  };

  for (const line of text.split(/\r?\n/)) {
    if (headerRe.test(line)) {
      // A new header closes whatever block was open.
      closeBlock();
      openBody = [];
    } else if (openBody !== null) {
      if (line.startsWith('>')) {
        const content = line.slice(1).trim();
        if (content.length) openBody.push(content);
      } else {
        closeBlock();
      }
    }
  }
  closeBlock();
  return out;
}
161
+
162
/**
 * Slugify an entity_id into a safe markdown anchor / file-name segment.
 *
 * Lower-cases the id, collapses every run of characters outside `[a-z0-9]`
 * into a single hyphen, strips leading/trailing hyphens and caps the result at
 * 80 characters. The trailing hyphen is stripped again after truncation so a
 * cut that lands on a separator does not leave a dangling `-`.
 * An id with no ASCII letters or digits yields an empty string.
 *
 * @param {*} entityId - Value to slugify; coerced with `String()`.
 * @returns {string} Slug of at most 80 characters.
 */
export function entityAnchor(entityId) {
  const slug = String(entityId)
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
  return slug.slice(0, 80).replace(/-$/, '');
}
166
+
167
/**
 * Format the CSC weekly markdown file from blocks + metadata.
 *
 * Emits a metadata header, then one `##` section per block with its scalar
 * fields and a `###` bullet list for each populated list field. List fields
 * whose value is empty or the literal `_none_` are omitted.
 *
 * Action items are split on `;` only: each item is an
 * `<owner> | <due> | <text>` tuple whose text may itself contain commas. The
 * other list fields are split on both `;` and `,`.
 *
 * @param {object} params
 * @param {string} params.source - Source name; upper-cased in the title.
 * @param {string} params.weekStart - Week-start date string.
 * @param {Array<object>} params.blocks - Parsed CSC blocks (`entity_id`, `display_name`, `last_touched`, `fields`).
 * @param {string} params.project - Project name for the header.
 * @param {string} params.entity - Entity the pull was scoped to.
 * @param {string} params.pulledAt - Timestamp recorded as `pulled_at`.
 * @returns {string} Markdown document.
 */
export function formatWeeklyMarkdown({ source, weekStart, blocks, project, entity, pulledAt }) {
  const lines = [
    `# ${source.toUpperCase()} CSC — week ${weekStart}`,
    '',
    `- project: ${project}`,
    `- source: ${source}`,
    `- entity: ${entity}`,
    `- week_start: ${weekStart}`,
    `- pulled_at: ${pulledAt}`,
    `- entities_touched: ${blocks.length}`,
    '',
  ];

  if (blocks.length === 0) {
    lines.push('_No activity this week._', '');
    return lines.join('\n');
  }

  for (const b of blocks) {
    const fields = b.fields ?? {};
    lines.push(`## ${b.display_name} {#${entityAnchor(b.entity_id)}}`, '');
    lines.push(`- entity_id: \`${b.entity_id}\``);
    if (b.last_touched) lines.push(`- last_touched: ${b.last_touched}`);
    if (fields.participants) lines.push(`- participants: ${fields.participants}`);
    if (fields.topics) lines.push(`- topics: ${fields.topics}`);
    if (fields.summary) lines.push(`- summary: ${fields.summary}`);

    // [heading, raw value, item separator]
    const sections = [
      ['Decisions', fields.decisions, /[;,]/],
      ['Action Items', fields.action_items, /;/],
      ['Open Questions', fields.open_questions, /[;,]/],
      ['Risks', fields.risks, /[;,]/],
      ['Next Steps', fields.next_steps, /[;,]/],
    ];
    for (const [label, val, separator] of sections) {
      if (!val || val === '_none_') continue;
      lines.push('', `### ${label}`, '');
      for (const item of val.split(separator).map((s) => s.trim()).filter(Boolean)) {
        lines.push(`- ${item}`);
      }
    }
    lines.push('');
  }
  return lines.join('\n');
}
214
+
215
/**
 * Upsert one row per entity into _index/entities.yml.
 *
 * Reads the existing index (a missing file counts as empty), replaces or adds
 * one record per block keyed by `<source>://<entity_id>`, and rewrites the
 * whole file sorted by id. For an existing record, `first_seen` is kept and
 * `weekStart` is merged into `weeks_touched`.
 *
 * Malformed index content is tolerated: a root that is not a mapping with an
 * `entities` array is treated as empty, entries without a string `id` are
 * skipped, and a `weeks_touched` that is not an array is ignored. YAML syntax
 * errors and read errors other than ENOENT still propagate.
 *
 * @param {string} indexPath - Path of the entities.yml file.
 * @param {Array<object>} blocks - Parsed CSC blocks (`entity_id`, `display_name`, `last_touched`).
 * @param {object} meta
 * @param {string} meta.source - Source name used as the id scheme.
 * @param {string} meta.weekStart - Week-start date string to record.
 * @param {string} meta.latestCscFile - Value stored as `latest_csc_file`.
 * @returns {Promise<*>} Whatever `writeAtomic` resolves to.
 */
export async function upsertEntitiesIndex(indexPath, blocks, { source, weekStart, latestCscFile }) {
  let existing = [];
  try {
    const parsed = YAML.parse(await fs.readFile(indexPath, 'utf8'));
    if (parsed !== null && typeof parsed === 'object' && Array.isArray(parsed.entities)) {
      // Records without a usable id cannot be keyed or sorted; skip them.
      existing = parsed.entities.filter((e) => e !== null && typeof e === 'object' && typeof e.id === 'string');
    }
  } catch (e) {
    if (e.code !== 'ENOENT') throw e;
  }

  const byId = new Map(existing.map((e) => [e.id, e]));
  const now = new Date().toISOString();
  for (const b of blocks) {
    const id = `${source}://${b.entity_id}`;
    const prev = byId.get(id);
    const prevWeeks = Array.isArray(prev?.weeks_touched) ? prev.weeks_touched : [];
    const weeks = new Set([...prevWeeks, weekStart]);
    byId.set(id, {
      id,
      display_name: b.display_name,
      entity_anchor: entityAnchor(b.entity_id),
      latest_csc_file: latestCscFile,
      last_touched: b.last_touched || now,
      first_seen: prev?.first_seen || now,
      weeks_touched: [...weeks].sort(),
      status: 'captured',
    });
  }

  const next = { entities: [...byId.values()].sort((a, b) => a.id.localeCompare(b.id)) };
  await fs.mkdir(path.dirname(indexPath), { recursive: true });
  return writeAtomic(indexPath, YAML.stringify(next), { skipIfUnchanged: false });
}
246
+
247
/**
 * Classify a WorkIQ failure into deferred vs failed.
 *
 * Timeouts and throttling are retryable (`deferred`); everything else is
 * `failed`. For non-zero exits the decision is made from stderr text. The
 * numeric status codes only match when they are not embedded in a longer
 * number, so an id such as `14290` is not mistaken for HTTP 429.
 *
 * @param {*} err - The thrown value; `code` and `stderr` are read when present.
 *   Null, undefined and primitive values are classified as `workiq-unknown`.
 * @returns {{status: string, signature: string, retryable: boolean}}
 */
function classifyError(err) {
  const code = err?.code;
  if (code === 'WORKIQ_TIMEOUT') return { status: 'deferred', signature: 'workiq-timeout', retryable: true };
  if (code === 'WORKIQ_NOT_FOUND') return { status: 'failed', signature: 'workiq-not-found', retryable: false };
  if (code === 'WORKIQ_EXIT_NONZERO') {
    const stderr = String(err.stderr || '').toLowerCase();
    if (/throttl|rate.?limit|(?<!\d)429(?!\d)/.test(stderr)) {
      return { status: 'deferred', signature: 'workiq-throttled', retryable: true };
    }
    if (/unauthor|forbidden|(?<!\d)(?:401|403)(?!\d)/.test(stderr)) {
      return { status: 'failed', signature: 'workiq-auth', retryable: false };
    }
    return { status: 'failed', signature: 'workiq-error', retryable: false };
  }
  return { status: 'failed', signature: 'workiq-unknown', retryable: false };
}
259
+
260
/**
 * Obtain WorkIQ output for a prompt, either from a fixture file or by
 * invoking the WorkIQ binary. Failures are returned, not thrown, so the caller
 * can classify them; only a fixture that cannot be loaded throws.
 *
 * @param {string} prompt - Prompt text sent to WorkIQ.
 * @param {?string} fixture - Fixture path, or null/empty to call WorkIQ.
 * @returns {Promise<{stdout: string, error: ?Error}>}
 */
async function callWorkiqOrFixture(prompt, fixture) {
  if (fixture) {
    const fx = await loadFixture(fixture);
    if (fx.timeout) {
      return { stdout: '', error: Object.assign(new Error('fixture: timeout'), { code: 'WORKIQ_TIMEOUT' }) };
    }
    if (fx.exitCode && fx.exitCode !== 0) {
      const error = Object.assign(new Error('fixture: nonzero'), {
        code: 'WORKIQ_EXIT_NONZERO',
        exitCode: fx.exitCode,
        stderr: fx.stderr || '',
        stdout: fx.stdout || '',
      });
      return { stdout: '', error };
    }
    return { stdout: fx.stdout || '', error: null };
  }

  const workiqBin = resolveWorkiqBin();
  if (!await pathExists(workiqBin)) {
    return { stdout: '', error: Object.assign(new Error(`workiq not found at ${workiqBin}`), { code: 'WORKIQ_NOT_FOUND' }) };
  }
  try {
    const r = await workiqAsk(prompt, { bin: workiqBin, timeoutMs: 300_000 });
    return { stdout: r.stdout, error: null };
  } catch (e) {
    // A falsy rejection value must still count as a failure, not as success.
    return { stdout: '', error: e || new Error('workiq: unknown failure') };
  }
}

/**
 * Full pipeline: prompt → WorkIQ → parse → write weekly + index → ledger + runlog.
 *
 * Outcomes, by `status`:
 *   - `not-applicable` — the source is disabled in the m365 scope config;
 *   - `deferred` / `failed` — WorkIQ (or the fixture) reported a failure;
 *     retryable failures are enqueued, the rest emit a learning candidate;
 *   - `no-activity` — WorkIQ answered but no CSC block was parsed;
 *   - `captured` — weekly markdown, entity index and raw stdout were written.
 * With `dryRun` nothing is written (no files, ledger, run log or queue) and
 * `files_written` stays empty.
 *
 * NOTE(review): the weekly file name is `<week>_<source>-csc.md` with no entity
 * component and is always rewritten, so pulling a second entity for the same
 * source and week replaces the first entity's file — confirm this is intended.
 *
 * @param {object} params
 * @param {string} params.source - Source name (email, teams, meetings, onenote, sharepoint).
 * @param {string} params.project - Project root path; its basename is used as the project name.
 * @param {string} params.alias - Evidence alias.
 * @param {string} params.entity - Entity to pull.
 * @param {string} [params.week] - Week-start date string; defaults to the current ISO Monday.
 * @param {boolean} [params.dryRun=false] - Skip every write.
 * @param {?string} [params.fixture=null] - Fixture path replacing the WorkIQ call.
 * @param {?string} [params.mailbox=null] - Accepted for interface compatibility; not used here.
 * @param {string} params.runner - Runner name recorded in the run log.
 * @returns {Promise<object>} `{ source, entity, week, status, items_pulled, files_written }`
 *   plus `errors` on failure or `ledger_key` on capture.
 */
export async function pullSource({ source, project, alias, entity, week, dryRun = false, fixture = null, mailbox = null, runner }) {
  const weekStart = week || ymd(currentIsoMonday());
  const startedAt = new Date().toISOString();
  const base = { source, entity, week: weekStart };

  // 1. Load m365 scope hints (deterministic, optional). Either lookup may
  //    reject or yield nothing; fall back to an empty config in both cases.
  const m365 = (await loadM365Auth({ workspace: project }).catch(() => null))
    || (await loadM365Auth().catch(() => null));
  const scope = scopeForSource(m365?.config ?? {}, source);
  if (scope && scope.enabled === false) {
    if (!dryRun) {
      await updateCell(project, alias, source, entity, weekStart, { last_status: 'not-applicable' });
      await appendRunLog(project, { runner, alias, entity, week: weekStart, status: 'not-applicable', reason: `${source} disabled in m365-auth.json` });
    }
    return { ...base, status: 'not-applicable', items_pulled: 0, files_written: [] };
  }

  // 2. Build prompt.
  const prompt = buildPullPrompt({ source, project: path.basename(project), entity, weekStart, scope });

  // 3. Call WorkIQ (or fixture).
  const { stdout, error: workiqErr } = await callWorkiqOrFixture(prompt, fixture);

  // 4. Handle WorkIQ failure.
  if (workiqErr) {
    const { status, signature, retryable } = classifyError(workiqErr);
    const errMsg = (workiqErr.message || '').slice(0, 1000);
    if (!dryRun) {
      await updateCell(project, alias, source, entity, weekStart, { last_status: status, last_error: `${signature}: ${errMsg}` });
      if (retryable) {
        await enqueue(project, alias, { source, entity, weekStart, signature, reason: errMsg });
      } else {
        await emitLearningCandidate({ projectRoot: project, alias, source, entity, week: weekStart, error: { signature, message: errMsg }, context: { runner } });
      }
      await appendRunLog(project, { runner, alias, entity, week: weekStart, status, error: errMsg, signature });
    }
    return { ...base, status, items_pulled: 0, files_written: [], errors: [{ signature, message: errMsg }] };
  }

  // 5. Parse blocks.
  const blocks = parseEntityBlocks(stdout);

  // 6. No-activity case.
  if (blocks.length === 0) {
    if (!dryRun) {
      await updateCell(project, alias, source, entity, weekStart, { last_status: 'no-activity', items_pulled: 0 });
      await appendRunLog(project, { runner, alias, entity, week: weekStart, status: 'no-activity', items_pulled: 0 });
      await clear(project, alias, source, entity).catch(() => {});
    }
    return { ...base, status: 'no-activity', items_pulled: 0, files_written: [] };
  }

  // 7. Write weekly file + index + raw stdout sidecar.
  const sourceRoot = sourceDir(project, alias, source);
  const fname = `${weekStart}_${source}-csc.md`;
  const filePath = path.join(sourceRoot, 'weekly', fname);
  const indexPath = path.join(sourceRoot, '_index', 'entities.yml');
  const rawPath = path.join(sourceRoot, '_raw', `${weekStart}_${safeSegment(entity)}.txt`);
  const filesWritten = [];
  const recordWrite = (result) => {
    if (result.written !== false) filesWritten.push(path.relative(project, result.path));
  };

  if (!dryRun) {
    const md = formatWeeklyMarkdown({ source, weekStart, blocks, project: path.basename(project), entity, pulledAt: startedAt });
    recordWrite(await writeAtomic(filePath, md, { skipIfUnchanged: false }));
    recordWrite(await upsertEntitiesIndex(indexPath, blocks, { source, weekStart, latestCscFile: `weekly/${fname}` }));
    recordWrite(await writeAtomic(rawPath, stdout, { skipIfUnchanged: false }));
    await updateCell(project, alias, source, entity, weekStart, { last_status: 'captured', items_pulled: blocks.length });
    await appendRunLog(project, { runner, alias, entity, week: weekStart, status: 'captured', items_pulled: blocks.length });
    // A successful capture supersedes any deferred-retry marker; clearing is best-effort.
    await clear(project, alias, source, entity).catch(() => {});
  }

  return {
    ...base,
    status: 'captured',
    items_pulled: blocks.length,
    files_written: filesWritten,
    ledger_key: `${source}::${entity}::${weekStart}`,
  };
}
368
+
369
/**
 * Standard CLI entrypoint shared across all 5 pullers.
 * Each puller imports this and calls runCli(SOURCE_NAME).
 *
 * Parses `process.argv`, runs `pullSource`, and writes exactly one JSON line
 * to stdout describing the outcome. Unknown arguments are ignored; `--force`
 * is accepted but has no effect in this function.
 *
 * Argument problems are rejected before any work is done:
 *   - `--project`, `--alias` and `--entity` are required;
 *   - a value flag that is last on the command line, or is followed by another
 *     `--flag`, is reported as missing its value instead of consuming it;
 *   - `--week` must look like YYYY-MM-DD, because it is interpolated into
 *     output file names.
 *
 * @param {string} source - Source name; also used to derive the runner name `pull-<source>`.
 * @returns {Promise<number>} Exit code: 0 on success or help, 2 on bad
 *   arguments, 1 when `pullSource` throws.
 */
export async function runCli(source) {
  const valueFlags = new Map([
    ['--project', 'project'],
    ['--alias', 'alias'],
    ['--entity', 'entity'],
    ['--mailbox', 'mailbox'],
    ['--week', 'week'],
    ['--fixture', 'fixture'],
  ]);
  const args = { dryRun: false };
  const missingValues = [];

  const argv = process.argv.slice(2);
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    const key = valueFlags.get(flag);
    if (key !== undefined) {
      const value = argv[i + 1];
      if (value === undefined || value.startsWith('--')) {
        missingValues.push(flag);
      } else {
        args[key] = value;
        i++;
      }
    } else if (flag === '--dry-run') args.dryRun = true;
    else if (flag === '--force') args.force = true;
    else if (flag === '--help' || flag === '-h') args.help = true;
  }

  const rejectArgs = (message) => {
    console.error(message);
    process.stdout.write(JSON.stringify({ source, status: 'failed', errors: [{ signature: 'bad-args', message }] }) + '\n');
    return 2;
  };

  if (args.help) {
    console.log(`Usage: node pull-${source}.mjs --project <P> --alias <A> --entity <e> [--week YYYY-MM-DD] [--dry-run] [--fixture <path>]`);
    return 0;
  }
  if (!args.project || !args.alias || !args.entity) {
    return rejectArgs('required: --project --alias --entity');
  }
  if (missingValues.length) {
    return rejectArgs(`missing value for: ${missingValues.join(' ')}`);
  }
  if (args.week !== undefined && !/^\d{4}-\d{2}-\d{2}$/.test(args.week)) {
    return rejectArgs(`invalid --week "${args.week}": expected YYYY-MM-DD`);
  }

  try {
    const result = await pullSource({
      source,
      project: path.resolve(args.project),
      alias: args.alias,
      entity: args.entity,
      week: args.week,
      dryRun: args.dryRun,
      fixture: args.fixture,
      mailbox: args.mailbox,
      runner: `pull-${source}`,
    });
    process.stdout.write(JSON.stringify(result) + '\n');
    return 0;
  } catch (e) {
    process.stdout.write(JSON.stringify({ source, status: 'failed', errors: [{ message: e.message }] }) + '\n');
    return 1;
  }
}