@abraca/cli 1.9.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,471 @@
1
+ /**
2
+ * `abracadabra wiki <title>` — fetch Wikipedia articles and seed them into
3
+ * the active space as a graph of docs.
4
+ *
5
+ * Streaming flow (no buffering): every newly-discovered title becomes a
6
+ * shell doc immediately (visible in the dashboard right away), then bodies
7
+ * are filled in one fetch at a time. The user sees the tree skeleton appear
8
+ * before the first body is written.
9
+ *
10
+ * The command authenticates separately from the parent CLI (it's listed in
11
+ * NO_CONNECT_COMMANDS in src/index.ts) so the parent harness doesn't open a
12
+ * second connection. We use the modern DocumentManager API from @abraca/dabra.
13
+ */
14
+ import type { DocumentManager } from '@abraca/dabra'
15
+ import { registerCommand } from '../../command.ts'
16
+ import type { CLIConnection } from '../../connection.ts'
17
+ import type { ParsedArgs } from '../../parser.ts'
18
+ import { WikipediaClient } from './wikipedia.ts'
19
+ import { snapshotArticle, canonicalTitle, prettyCategoryLabel } from './snapshot.ts'
20
+ import {
21
+ ICONS,
22
+ pickSectionType,
23
+ renderArticleLead,
24
+ renderArticleSingleDoc,
25
+ renderInfoboxBody,
26
+ renderCategoryBody,
27
+ rewriteLinks,
28
+ } from './render.ts'
29
+ import { openSession } from './connect.ts'
30
+ import type { WikiOptions, ExtractMode, ExtractedArticle, ExtractedSection } from './types.ts'
31
+
32
registerCommand({
  name: 'wiki',
  aliases: ['wikipedia'],
  description: 'Fetch Wikipedia articles into a graph of docs (streaming).',
  usage: [
    'wiki "<Article Title>"',
    ' mode=single|split single doc per article OR split into sections+infobox [split]',
    ' depth=<n> follow internal links to depth N [1]',
    ' category-depth=<n> recurse into sub-categories [1]',
    ' lang=<code> wiki language [en]',
    ' domain=<host> 3rd-party MediaWiki host (overrides lang)',
    ' parent=<docId> parent doc for the new graph [active space root]',
    ' user-agent=<str> Api-User-Agent header (REQUIRED by Wikimedia etiquette)',
    ' rate=<rps> max wikipedia requests per second [3]',
    ' --include-categories expand each article\'s categories into nested graphs',
    ' --dry-run fetch only the entry article, print outline, no writes',
  ].join('\n'),
  /**
   * Entry point of the `wiki` command.
   *
   * Returns the text to print: either an option-parsing error, the dry-run
   * outline, a missing-`ABRA_URL` notice, or the summary of a completed
   * streaming import. The connection passed by the harness is not used; the
   * command opens its own session through `openSession`.
   */
  async run(_conn: CLIConnection | null, args: ParsedArgs): Promise<string> {
    const opts = parseOptions(args)
    // parseOptions reports problems as a ready-to-print message.
    if (typeof opts === 'string') return opts

    const quiet = args.flags.has('quiet') || args.flags.has('q')
    // Progress goes to stderr so it never mixes with the returned result text.
    const log = (msg: string): void => {
      if (quiet) return
      console.error(`[wiki] ${msg}`)
    }

    const { lang, domain, userAgent, rate } = opts
    const wp = new WikipediaClient({ lang, domain, userAgent, rate })

    if (opts.dryRun) {
      // Dry-run never touches the server: fetch the entry and describe it.
      log(`fetch ${opts.title}`)
      const doc = await wp.fetchArticle(opts.title)
      if (!doc) return `Article not found: "${opts.title}"`
      const snap = snapshotArticle(doc, canonicalTitle(doc.title?.() ?? opts.title))
      const hasInfobox = snap.infobox && snap.infobox.length > 0 ? 'yes' : 'no'
      const outline = [
        `Entry: ${snap.title}`,
        `URL: ${snap.url ?? '(none)'}`,
        `Internal links: ${snap.linkTitles.length}`,
        `Categories: ${snap.categories.length}`,
        `Sections: ${snap.sections.length}`,
        `Has infobox: ${hasInfobox}`,
        '',
        '── Sections ──',
        printSections(snap.sections, ''),
      ]
      return outline.join('\n')
    }

    // ── Connect ──────────────────────────────────────────────────────────
    // The environment is read through globalThis with an empty-object
    // fallback, so a runtime without `process` yields "ABRA_URL is required"
    // instead of a crash.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const env = (globalThis as any).process?.env ?? {}
    const {
      ABRA_URL: url,
      ABRA_NAME: name,
      ABRA_COLOR: color,
      ABRA_INVITE_CODE: inviteCode,
      ABRA_KEY_FILE: keyFile,
    } = env
    if (!url) {
      return 'ABRA_URL is required to write to the server. Set it or pass --dry-run.'
    }
    const session = await openSession({ url, name, color, inviteCode, keyFile, quiet })
    const dm = session.dm

    try {
      const { articleCount, categoryCount, rootDocId } = await runStreaming(dm, wp, opts, log)
      const categoriesNote = categoryCount > 0 ? ` + ${categoryCount} categories` : ''
      const summary = `Done. Created ${articleCount} articles${categoriesNote}.`
      return `${summary}\nRoot: ${rootDocId}`
    } finally {
      // Teardown is best-effort: a failed destroy must not mask the result or
      // the error from the import itself.
      await dm.destroy().catch(() => {})
    }
  },
})
115
+
116
+ // ─────────────────────────────────────────────────────────────────────────
117
+ // Streaming orchestrator
118
+ // ─────────────────────────────────────────────────────────────────────────
119
+
120
/** Outcome of one streaming import, as returned by `runStreaming`. */
interface StreamResult {
  /** Id of the root graph doc created for the entry article. */
  rootDocId: string
  /** Number of articles that were processed to the end of the main loop. */
  articleCount: number
  /** Number of categories whose member list was fetched without an error. */
  categoryCount: number
}
125
+
126
/**
 * Import the entry article, and every article reachable within `opts.depth`
 * link hops, into the document tree.
 *
 * Order of work for each queued article: fetch (unless already fetched),
 * create section/infobox shells (split mode), allocate shells for newly
 * discovered links and categories, then write the article body with its
 * `[[Title]]` placeholders rewritten against the shells known at that moment.
 * Category bodies are filled in a final pass once the article queue is empty.
 *
 * Fetch and write failures for individual articles or categories are logged
 * and skipped; they do not abort the run.
 *
 * @param dm   Document manager used to create tree entries and write bodies.
 * @param wp   Client used to fetch articles and category members.
 * @param opts Parsed command options.
 * @param log  Progress sink.
 * @returns Root doc id plus the number of articles and categories completed.
 * @throws Error when the entry article cannot be found.
 */
async function runStreaming(
  dm: DocumentManager,
  wp: WikipediaClient,
  opts: WikiOptions,
  log: (msg: string) => void,
): Promise<StreamResult> {
  // Anything can be thrown: report an Error's message, otherwise the value itself.
  const errText = (err: unknown): string => (err instanceof Error ? err.message : String(err))

  // Title → docId map. Drives [[Title]] → [[docId|label]] rewriting at write time.
  const titleToDocId = new Map<string, string>()
  // Snapshots of articles we've already fetched (avoid re-fetching).
  const fetched = new Map<string, ExtractedArticle>()
  // Articles whose section/infobox children have been created (split mode).
  const childrenCreated = new Set<string>()
  // Categories whose shells have been created.
  const categoryToDocId = new Map<string, string>()
  let categoriesContainerId: string | null = null

  // ── Fetch entry first; we need its title to label the root graph ─────
  log(`fetch ${opts.title}`)
  const entryDoc = await wp.fetchArticle(opts.title)
  if (!entryDoc) {
    throw new Error(`Article not found: "${opts.title}"`)
  }
  const entryTitle = canonicalTitle(entryDoc.title?.() ?? opts.title)
  const entrySnap = snapshotArticle(entryDoc, entryTitle)
  fetched.set(entryTitle, entrySnap)

  // ── Step 1: create root graph doc (visible immediately) ──────────────
  const rootEntry = dm.tree.create({
    parentId: opts.parentDocId ?? null,
    label: entryTitle,
    type: 'graph',
    meta: { icon: ICONS.graph },
  })
  log(`+ ${rootEntry.id.slice(0, 8)}… ${entryTitle} (graph)`)

  // ── Step 2: create the entry article shell ───────────────────────────
  const entryArticleId = createArticleShell(dm, entrySnap, rootEntry.id, log)
  titleToDocId.set(entryTitle, entryArticleId)

  // Work list of (title, depth). Every entry already has a shell doc in
  // titleToDocId. It is walked with a cursor rather than Array#shift, so
  // dequeuing never re-indexes the remaining entries.
  const queue: Array<{ title: string; depth: number }> = [{ title: entryTitle, depth: 0 }]
  let articleCount = 0

  // ── Step 3: streaming process ───────────────────────────────────────
  for (let head = 0; head < queue.length; head++) {
    const next = queue[head]
    if (!next) continue
    const { title, depth } = next
    const articleDocId = titleToDocId.get(title)
    // Shells are allocated before a title is queued; skip defensively if not.
    if (articleDocId === undefined) continue

    // Ensure we've fetched this article.
    let snap = fetched.get(title)
    if (!snap) {
      log(`fetch [d${depth}] ${title}`)
      try {
        const doc = await wp.fetchArticle(title)
        if (!doc) {
          log(` not found — leaving stub`)
          continue
        }
        snap = snapshotArticle(doc, canonicalTitle(doc.title?.() ?? title))
        fetched.set(title, snap)
      } catch (err: unknown) {
        log(`! fetch failed: ${errText(err)}`)
        continue
      }
    }

    // Create this article's section/infobox children (split mode only,
    // and only once per article).
    if (opts.mode === 'split' && !childrenCreated.has(title)) {
      createArticleChildren(dm, snap, articleDocId, log)
      childrenCreated.add(title)
    }

    // Discover new linked titles and pre-allocate shells immediately.
    if (depth < opts.depth) {
      for (const linkTitle of snap.linkTitles) {
        if (titleToDocId.has(linkTitle)) continue
        const shell = dm.tree.create({
          parentId: rootEntry.id,
          label: linkTitle,
          type: 'doc',
          meta: { icon: ICONS.article },
        })
        titleToDocId.set(linkTitle, shell.id)
        queue.push({ title: linkTitle, depth: depth + 1 })
        log(`+ ${shell.id.slice(0, 8)}… ${linkTitle} (doc, shell)`)
      }
    }

    // Pre-allocate category shells when first discovered.
    if (opts.includeCategories && snap.categories.length > 0) {
      if (!categoriesContainerId) {
        const container = dm.tree.create({
          parentId: rootEntry.id,
          label: 'Categories',
          type: 'graph',
          meta: { icon: ICONS.categories },
        })
        categoriesContainerId = container.id
        log(`+ ${container.id.slice(0, 8)}… Categories (graph)`)
      }
      for (const catTitle of snap.categories) {
        if (categoryToDocId.has(catTitle)) continue
        const catLabel = prettyCategoryLabel(catTitle)
        const cat = dm.tree.create({
          parentId: categoriesContainerId,
          label: catLabel,
          type: 'graph',
          meta: { icon: ICONS.category },
        })
        categoryToDocId.set(catTitle, cat.id)
        log(`+ ${cat.id.slice(0, 8)}… ${catLabel} (graph, cat)`)
      }
    }

    // Write this article's body now. Links resolve against the shells
    // allocated so far; titles without a shell (e.g. links of an article at
    // the depth limit) fall back to plain text in rewriteLinks.
    const body =
      opts.mode === 'split' ? renderArticleLead(snap) : renderArticleSingleDoc(snap)
    if (body.trim().length > 0) {
      const rewritten = rewriteLinks(body, titleToDocId)
      try {
        await dm.content.write(articleDocId, rewritten)
        log(`✓ body ${title}`)
      } catch (err: unknown) {
        log(`! body write failed for ${title}: ${errText(err)}`)
      }
    }

    // In split mode, also write each section/infobox doc body.
    if (opts.mode === 'split') {
      await writeChildrenBodies(dm, snap, articleDocId, titleToDocId, log)
    }

    articleCount++
  }

  // ── Step 4: fill in category bodies ─────────────────────────────────
  // categoryToDocId is only populated when includeCategories is set, so an
  // import without categories simply iterates nothing here.
  let categoryCount = 0
  for (const [catTitle, catDocId] of categoryToDocId) {
    log(`category ${catTitle}`)
    try {
      const members = await wp.fetchCategoryPages(
        catTitle,
        opts.categoryDepth > 0,
        Math.max(0, opts.categoryDepth),
      )
      const memberArticles: string[] = []
      const subcats: string[] = []
      for (const m of members) {
        if (m.type === 'subcat') subcats.push(prettyCategoryLabel(m.title))
        else memberArticles.push(m.title)
      }
      const body = renderCategoryBody(memberArticles, subcats)
      const rewritten = rewriteLinks(body, titleToDocId)
      if (rewritten.trim().length > 0) {
        await dm.content.write(catDocId, rewritten)
        log(`✓ body category ${catTitle}`)
      }
      categoryCount++
    } catch (err: unknown) {
      log(`! category ${catTitle}: ${errText(err)}`)
    }
  }

  return { rootDocId: rootEntry.id, articleCount, categoryCount }
}
296
+
297
+ // ─────────────────────────────────────────────────────────────────────────
298
+ // Shell + body helpers
299
+ // ─────────────────────────────────────────────────────────────────────────
300
+
301
/**
 * Create the (still empty) doc that will hold an article and log its creation.
 *
 * The article URL is stored in the doc's meta only when the snapshot has one.
 *
 * @returns The id of the newly created doc.
 */
function createArticleShell(
  dm: DocumentManager,
  article: ExtractedArticle,
  parentId: string,
  log: (msg: string) => void,
): string {
  const { title, url } = article
  const meta: Record<string, unknown> = url
    ? { icon: ICONS.article, url }
    : { icon: ICONS.article }
  const { id } = dm.tree.create({ parentId, label: title, type: 'doc', meta })
  log(`+ ${id.slice(0, 8)}… ${title} (doc)`)
  return id
}
318
+
319
/**
 * Doc id of the "Infobox" child created for an article, keyed by the article
 * snapshot object. A side table is used instead of stamping ad-hoc
 * properties onto the snapshot, which keeps the snapshot types honest.
 */
const infoboxDocIds = new WeakMap<ExtractedArticle, string>()

/** Doc id of the shell created for a section, keyed by the section object. */
const sectionDocIds = new WeakMap<ExtractedSection, string>()

/** Text to log for a caught value: an Error's message, otherwise the value itself. */
function describeWriteError(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * Create section + infobox child docs for a split-mode article. Returns
 * nothing — the created ids are remembered per snapshot object and the
 * bodies are written later by writeChildrenBodies.
 */
function createArticleChildren(
  dm: DocumentManager,
  article: ExtractedArticle,
  articleDocId: string,
  log: (msg: string) => void,
): void {
  if (article.infobox && article.infobox.length > 0) {
    const ib = dm.tree.create({
      parentId: articleDocId,
      label: 'Infobox',
      type: 'outline',
      meta: { icon: ICONS.infobox },
    })
    log(` + ${ib.id.slice(0, 8)}… Infobox (outline)`)
    // Remember the id so writeChildrenBodies can find it without a tree query.
    infoboxDocIds.set(article, ib.id)
  }
  for (const section of article.sections) {
    createSectionShell(dm, section, articleDocId, log)
  }
}

/**
 * Create the shell doc for one section and, recursively, for its
 * sub-sections. Sections with neither body text nor children get no doc.
 */
function createSectionShell(
  dm: DocumentManager,
  section: ExtractedSection,
  parentDocId: string,
  log: (msg: string) => void,
): void {
  const hasChildren = section.children.length > 0
  if (!section.body.trim() && !hasChildren) return
  const { type, icon } = pickSectionType(section)
  const entry = dm.tree.create({
    parentId: parentDocId,
    label: section.title || 'Untitled section',
    type,
    meta: { icon },
  })
  log(` + ${entry.id.slice(0, 8)}… ${entry.label} (${type})`)
  sectionDocIds.set(section, entry.id)
  for (const child of section.children) {
    createSectionShell(dm, child, entry.id, log)
  }
}

/**
 * Write the bodies of the infobox and section docs previously created by
 * createArticleChildren for this article. Write failures are logged and do
 * not stop the remaining writes.
 *
 * The infobox body goes through rewriteLinks like every other body, so any
 * `[[Title]]` placeholder in an infobox value resolves the same way it does
 * when the infobox is inlined in single-doc mode.
 */
async function writeChildrenBodies(
  dm: DocumentManager,
  article: ExtractedArticle,
  _articleDocId: string,
  titleToDocId: Map<string, string>,
  log: (msg: string) => void,
): Promise<void> {
  const infoboxDocId = infoboxDocIds.get(article)
  if (infoboxDocId && article.infobox && article.infobox.length > 0) {
    try {
      const infoboxBody = rewriteLinks(renderInfoboxBody(article.infobox), titleToDocId)
      await dm.content.write(infoboxDocId, infoboxBody)
    } catch (err: unknown) {
      log(`! infobox body write failed: ${describeWriteError(err)}`)
    }
  }
  for (const section of article.sections) {
    await writeSectionBody(dm, section, titleToDocId, log)
  }
}

/**
 * Write one section's body (when it has a shell doc and non-blank text),
 * then do the same for each of its sub-sections.
 */
async function writeSectionBody(
  dm: DocumentManager,
  section: ExtractedSection,
  titleToDocId: Map<string, string>,
  log: (msg: string) => void,
): Promise<void> {
  const docId = sectionDocIds.get(section)
  if (docId && section.body.trim().length > 0) {
    try {
      await dm.content.write(docId, rewriteLinks(section.body, titleToDocId))
    } catch (err: unknown) {
      log(`! section body write failed for ${section.title}: ${describeWriteError(err)}`)
    }
  }
  for (const child of section.children) {
    await writeSectionBody(dm, child, titleToDocId, log)
  }
}
406
+
407
+ // ─────────────────────────────────────────────────────────────────────────
408
+ // Argument parsing + dry-run printing
409
+ // ─────────────────────────────────────────────────────────────────────────
410
+
411
/**
 * Turn parsed CLI arguments into `WikiOptions`.
 *
 * @returns The options, or a ready-to-print error message when the title or
 *   user agent is missing or the mode is not `single`/`split`.
 */
function parseOptions(args: ParsedArgs): WikiOptions | string {
  // Both sources are trimmed so a whitespace-only title counts as missing.
  const title = args.positional[0]?.trim() || args.params['title']?.trim()
  if (!title) return 'Missing required positional argument: <title>. Example: abracadabra wiki "Toronto Raptors"'

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const env = (globalThis as any).process?.env ?? {}
  const userAgent = args.params['user-agent'] || args.params['userAgent'] || env['ABRA_WIKI_USER_AGENT']
  if (!userAgent) {
    return [
      'Missing required parameter: user-agent="your-name (you@example.com)"',
      '(Wikimedia etiquette requires an Api-User-Agent header. Pass user-agent=... or set ABRA_WIKI_USER_AGENT.)',
    ].join('\n')
  }

  // Validate the raw string first; only a checked value is typed as ExtractMode.
  const rawMode = args.params['mode'] ?? 'split'
  if (rawMode !== 'single' && rawMode !== 'split') {
    return `Invalid mode "${rawMode}". Use mode=single or mode=split.`
  }
  const mode: ExtractMode = rawMode

  const depth = parseIntOr(args.params['depth'], 1)
  const categoryDepth = parseIntOr(args.params['category-depth'] ?? args.params['categoryDepth'], 1)
  const rate = parseFloatOr(args.params['rate'], 3)

  return {
    title,
    mode,
    depth,
    categoryDepth,
    includeCategories: args.flags.has('include-categories') || args.flags.has('includeCategories'),
    lang: args.params['lang'] ?? 'en',
    domain: args.params['domain'],
    parentDocId: args.params['parent'],
    userAgent,
    rate,
    dryRun: args.flags.has('dry-run') || args.flags.has('dryRun'),
  }
}
448
+
449
/**
 * Parse a non-negative decimal integer option.
 *
 * The whole (trimmed) token must be digits, optionally preceded by `+`.
 * Anything else — missing, empty, negative, fractional, or digits followed
 * by other characters such as `2abc` — yields `fallback` rather than a
 * silently truncated number.
 */
function parseIntOr(s: string | undefined, fallback: number): number {
  const text = s?.trim()
  if (!text || !/^\+?\d+$/.test(text)) return fallback
  const n = Number.parseInt(text, 10)
  // Extremely long digit strings overflow to Infinity.
  return Number.isFinite(n) ? n : fallback
}
454
+
455
/**
 * Parse a strictly positive decimal number option.
 *
 * The whole (trimmed) token must be a decimal literal (optional `+`, optional
 * fraction, optional exponent). Anything else — missing, empty, zero,
 * negative, non-finite, or a number followed by other characters such as
 * `2rps` — yields `fallback`.
 */
function parseFloatOr(s: string | undefined, fallback: number): number {
  const text = s?.trim()
  if (!text || !/^\+?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i.test(text)) return fallback
  const n = Number.parseFloat(text)
  return Number.isFinite(n) && n > 0 ? n : fallback
}
460
+
461
/**
 * Render a section tree as an indented bullet outline for the dry-run report.
 *
 * Each line shows the section title, the body length in characters when the
 * body is non-empty, and the number of direct sub-sections when there are
 * any. Sub-sections follow their parent, indented one level deeper.
 */
function printSections(sections: ExtractedSection[], indent: string): string {
  return sections
    .flatMap((section) => {
      const sizeNote = section.body ? ` (${section.body.length}b)` : ''
      const subCount = section.children.length
      const subNote = subCount > 0 ? ` [${subCount} sub]` : ''
      const line = `${indent}- ${section.title}${sizeNote}${subNote}`
      return subCount > 0 ? [line, printSections(section.children, indent + ' ')] : [line]
    })
    .join('\n')
}
@@ -0,0 +1,91 @@
1
+ /**
2
+ * Body rendering + page-type decisions for the streaming orchestrator.
3
+ *
4
+ * All rendering is title-driven: bodies are rendered with `[[Title]]`
5
+ * placeholders, and `rewriteLinks` rewrites them to `[[docId|label]]`
6
+ * using the live title→docId map at write time.
7
+ */
8
+ import type { ExtractedArticle, ExtractedSection } from './types.ts'
9
+
10
/**
 * Icon identifiers stored in the `meta.icon` of the docs this command
 * creates, keyed by the role the doc plays in the generated graph.
 */
export const ICONS = {
  // Containers
  graph: 'git-fork',
  // Articles and the pieces they are split into
  article: 'book-open',
  category: 'tag',
  infobox: 'info',
  outline: 'list',
  gallery: 'images',
  section: 'pilcrow',
  // The "Categories" container
  categories: 'tags',
} as const
20
+
21
/**
 * Choose the page type (and matching icon) for a section's doc.
 *
 * A section becomes an `outline` when it has sub-sections, or when it is a
 * list with at least five items; every other section is a plain `doc`.
 */
export function pickSectionType(section: ExtractedSection): { type: string; icon: string } {
  const wantsOutline =
    section.children.length > 0 || (section.isList && section.listLength >= 5)
  if (wantsOutline) {
    return { type: 'outline', icon: ICONS.outline }
  }
  return { type: 'doc', icon: ICONS.section }
}
27
+
28
/**
 * Body of the article doc in split mode: the article's lead text, or an
 * empty string when the snapshot has none.
 */
export function renderArticleLead(article: ExtractedArticle): string {
  const { lead } = article
  if (lead === undefined || lead === null) return ''
  return lead
}
32
+
33
/**
 * Body of the article doc in single mode: lead, then an "Infobox" heading
 * with the infobox rows (when present), then every section rendered inline
 * starting at heading level 2. Parts are separated by a blank line.
 */
export function renderArticleSingleDoc(article: ExtractedArticle): string {
  const leadPart = article.lead ? [article.lead] : []
  const infoboxPart =
    article.infobox && article.infobox.length > 0
      ? ['## Infobox', renderInfoboxBody(article.infobox)]
      : []
  const sectionParts = article.sections.flatMap((section) => renderSectionInline(section, 2))
  return [...leadPart, ...infoboxPart, ...sectionParts].join('\n\n')
}
45
+
46
/**
 * Flatten a section and its sub-sections into markdown parts: a heading
 * (when the section has a title), the body (when not blank), then the parts
 * of each child one heading level deeper. Heading depth is capped at six `#`.
 */
function renderSectionInline(section: ExtractedSection, level: number): string[] {
  const hashes = '#'.repeat(Math.min(6, level))
  const heading = section.title ? [`${hashes} ${section.title}`] : []
  const text = section.body.trim() ? [section.body] : []
  const nested = section.children.flatMap((child) => renderSectionInline(child, level + 1))
  return [...heading, ...text, ...nested]
}
56
+
57
/** Render infobox rows as a markdown bullet list, one `- **key:** value` line per row. */
export function renderInfoboxBody(rows: Array<{ key: string; value: string }>): string {
  const lines: string[] = []
  for (const { key, value } of rows) {
    lines.push(`- **${key}:** ${value}`)
  }
  return lines.join('\n')
}
60
+
61
/**
 * Render a category doc body: a "Pages" list of `[[Title]]` placeholders for
 * member pages and a "Sub-categories" list of plain labels. A list that
 * would be empty is omitted together with its heading.
 */
export function renderCategoryBody(members: string[], subcategories: string[]): string {
  const blocks: string[] = []
  if (members.length > 0) {
    const pageLines = members.map((member) => `- [[${member}]]`)
    blocks.push('## Pages', pageLines.join('\n'))
  }
  if (subcategories.length > 0) {
    const subLines = subcategories.map((label) => `- ${label}`)
    blocks.push('## Sub-categories', subLines.join('\n'))
  }
  return blocks.join('\n\n')
}
73
+
74
/**
 * Resolve `[[Title]]` / `[[Title|Alias]]` placeholders against the
 * title→docId map.
 *
 * A placeholder whose (trimmed) title is in the map becomes
 * `[[docId|label]]`; one that is not becomes just the label as plain text.
 * The label is the trimmed alias when it is non-blank, otherwise the title.
 */
export function rewriteLinks(
  markdown: string,
  titleToDocId: Map<string, string>,
): string {
  const placeholder = /\[\[([^\]|]+?)(?:\|([^\]]+?))?\]\]/g
  return markdown.replace(placeholder, (_whole, rawTarget: string, rawAlias?: string) => {
    const title = rawTarget.trim()
    const aliasText = rawAlias?.trim()
    const label = aliasText ? aliasText : title
    const docId = titleToDocId.get(title)
    return docId ? `[[${docId}|${label}]]` : label
  })
}