@abraca/wiki 2.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -0
- package/dist/abracadabra-wiki.cjs +1418 -0
- package/dist/abracadabra-wiki.cjs.map +1 -0
- package/dist/abracadabra-wiki.esm.js +1387 -0
- package/dist/abracadabra-wiki.esm.js.map +1 -0
- package/dist/index.d.ts +27 -0
- package/package.json +44 -0
- package/src/connect.ts +69 -0
- package/src/crypto.ts +70 -0
- package/src/index.ts +508 -0
- package/src/parser.ts +62 -0
- package/src/render.ts +91 -0
- package/src/snapshot.ts +210 -0
- package/src/types.ts +45 -0
- package/src/wikipedia.ts +154 -0
package/src/index.ts
ADDED
|
@@ -0,0 +1,508 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* `abracadabra-wiki <title>` — fetch Wikipedia articles and seed them into the
|
|
4
|
+
* active Abracadabra space as a graph of docs.
|
|
5
|
+
*
|
|
6
|
+
* Streaming flow (no buffering): every newly-discovered title becomes a shell
|
|
7
|
+
* doc immediately (visible in the dashboard right away), then bodies are filled
|
|
8
|
+
* in one fetch at a time. The user sees the tree skeleton appear before the
|
|
9
|
+
* first body is written.
|
|
10
|
+
*
|
|
11
|
+
* Authenticates with its own Ed25519 key (`ABRA_KEY_FILE`, default
|
|
12
|
+
* `~/.abracadabra/cli.key`) via the modern `DocumentManager` API — the same
|
|
13
|
+
* identity the `abracadabra` CLI uses, so both tools share one account.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* abracadabra-wiki "<Article Title>" user-agent="you (you@example.com)" [options]
|
|
17
|
+
*
|
|
18
|
+
* Environment:
|
|
19
|
+
* ABRA_URL Server URL (required unless --dry-run)
|
|
20
|
+
* ABRA_KEY_FILE Ed25519 key file (~/.abracadabra/cli.key)
|
|
21
|
+
* ABRA_NAME / ABRA_COLOR Presence identity
|
|
22
|
+
* ABRA_INVITE_CODE Invite code for first-run registration
|
|
23
|
+
* ABRA_WIKI_USER_AGENT Default Api-User-Agent (or pass user-agent=...)
|
|
24
|
+
*/
|
|
25
|
+
import type { DocumentManager } from '@abraca/dabra'
|
|
26
|
+
import { parseArgs, type ParsedArgs } from './parser.ts'
|
|
27
|
+
import { WikipediaClient } from './wikipedia.ts'
|
|
28
|
+
import { snapshotArticle, canonicalTitle, prettyCategoryLabel } from './snapshot.ts'
|
|
29
|
+
import {
|
|
30
|
+
ICONS,
|
|
31
|
+
pickSectionType,
|
|
32
|
+
renderArticleLead,
|
|
33
|
+
renderArticleSingleDoc,
|
|
34
|
+
renderInfoboxBody,
|
|
35
|
+
renderCategoryBody,
|
|
36
|
+
rewriteLinks,
|
|
37
|
+
} from './render.ts'
|
|
38
|
+
import { openSession } from './connect.ts'
|
|
39
|
+
import type { WikiOptions, ExtractMode, ExtractedArticle, ExtractedSection } from './types.ts'
|
|
40
|
+
|
|
41
|
+
/**
 * Help text printed by the bin when `--help` is passed or no article title is
 * given. One array entry per output line; joined with newlines.
 *
 * Lists every flag the bin reacts to, including `--quiet` (silences the
 * `[wiki]` progress lines) and `--help`.
 */
const USAGE = [
  'abracadabra-wiki "<Article Title>" user-agent="<name (email)>" [options]',
  '',
  'Options:',
  ' mode=single|split single doc per article OR split into sections+infobox [split]',
  ' depth=<n> follow internal links to depth N [1]',
  ' category-depth=<n> recurse into sub-categories [1]',
  ' lang=<code> wiki language [en]',
  ' domain=<host> 3rd-party MediaWiki host (overrides lang)',
  ' parent=<docId> parent doc for the new graph [active space root]',
  ' user-agent=<str> Api-User-Agent header (REQUIRED by Wikimedia etiquette)',
  ' rate=<rps> max wikipedia requests per second [3]',
  ' --include-categories expand each article\'s categories into nested graphs',
  ' --dry-run fetch only the entry article, print outline, no writes',
  ' --quiet suppress [wiki] progress lines on stderr',
  ' --help print this help and exit',
  '',
  'Environment: ABRA_URL (required unless --dry-run), ABRA_KEY_FILE, ABRA_NAME,',
  ' ABRA_COLOR, ABRA_INVITE_CODE, ABRA_WIKI_USER_AGENT.',
].join('\n')
|
|
59
|
+
|
|
60
|
+
/**
 * Run a Wikipedia import for already-parsed CLI arguments.
 *
 * Option problems, a missing `ABRA_URL`, and (in dry-run mode) a missing
 * article are reported through the returned string rather than by throwing.
 * In dry-run mode only the entry article is fetched and its outline returned;
 * no session is opened. Otherwise a session is opened, the streaming import
 * runs, and the document manager is destroyed afterwards whether or not the
 * import succeeded.
 *
 * Exported for programmatic use.
 *
 * @param args Parsed arguments (positional title, key=value params, flags).
 * @returns Human-readable summary, or an error/usage message.
 */
export async function runWiki(args: ParsedArgs): Promise<string> {
  const opts = parseOptions(args)
  // parseOptions hands back a ready-to-print message when the args are unusable.
  if (typeof opts === 'string') return opts

  const quiet = args.flags.has('quiet') || args.flags.has('q')
  const log = (msg: string) => {
    if (quiet) return
    // Progress goes to stderr so stdout carries only the final summary.
    console.error(`[wiki] ${msg}`)
  }

  const wp = new WikipediaClient({
    lang: opts.lang,
    domain: opts.domain,
    userAgent: opts.userAgent,
    rate: opts.rate,
  })

  if (opts.dryRun) {
    // Dry-run: fetch only the entry, print its outline, no server.
    log(`fetch ${opts.title}`)
    const doc = await wp.fetchArticle(opts.title)
    if (!doc) return `Article not found: "${opts.title}"`

    const snap = snapshotArticle(doc, canonicalTitle(doc.title?.() ?? opts.title))
    const hasInfobox = snap.infobox && snap.infobox.length > 0
    const outline = [
      `Entry: ${snap.title}`,
      `URL: ${snap.url ?? '(none)'}`,
      `Internal links: ${snap.linkTitles.length}`,
      `Categories: ${snap.categories.length}`,
      `Sections: ${snap.sections.length}`,
      `Has infobox: ${hasInfobox ? 'yes' : 'no'}`,
      '',
      '── Sections ──',
      printSections(snap.sections, ''),
    ]
    return outline.join('\n')
  }

  // ── Connect ──────────────────────────────────────────────────────────
  // process.env access uses bracket notation to satisfy noUncheckedIndexedAccess.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const env = (globalThis as any).process?.env ?? {}
  const url = env['ABRA_URL']
  if (!url) {
    return 'ABRA_URL is required to write to the server. Set it or pass --dry-run.'
  }

  const { dm } = await openSession({
    url,
    name: env['ABRA_NAME'],
    color: env['ABRA_COLOR'],
    inviteCode: env['ABRA_INVITE_CODE'],
    keyFile: env['ABRA_KEY_FILE'],
    quiet,
  })

  try {
    const result = await runStreaming(dm, wp, opts, log)
    const categoriesNote =
      result.categoryCount > 0 ? ` + ${result.categoryCount} categories` : ''
    const summary = [
      `Done. Created ${result.articleCount} articles${categoriesNote}.`,
      `Root: ${result.rootDocId}`,
    ]
    return summary.join('\n')
  } finally {
    // Best-effort teardown: a failing destroy must not mask the import result.
    await dm.destroy().catch(() => {})
  }
}
|
|
129
|
+
|
|
130
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
131
|
+
// Streaming orchestrator
|
|
132
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
133
|
+
|
|
134
|
+
/** Outcome of one streaming import, as reported back to `runWiki`. */
interface StreamResult {
  /** Id of the root `graph` doc created for the entry article. */
  rootDocId: string
  /** Number of articles that were fetched and processed (stubs excluded). */
  articleCount: number
  /** Number of categories whose member list was fetched without error. */
  categoryCount: number
}
|
|
139
|
+
|
|
140
|
+
/**
 * Breadth-first import of the entry article and everything reachable from it.
 *
 * Work is strictly sequential so docs appear in the tree as soon as they are
 * discovered:
 *  1. Fetch the entry article, create the root `graph` doc and the entry's
 *     article shell.
 *  2. Walk the queue. For each title: fetch it (unless already snapshotted),
 *     create its section/infobox children in split mode, allocate shells for
 *     newly discovered links while `depth < opts.depth`, allocate category
 *     shells when `opts.includeCategories` is set, then write the article body
 *     (and the child bodies in split mode).
 *  3. Fill in category bodies from their member lists.
 *
 * Articles that are missing or fail to fetch keep their empty shell and are
 * not counted. Body-write failures are logged and do not abort the import.
 *
 * @param dm   Document manager used for `tree.create` and `content.write`.
 * @param wp   Wikipedia client used for `fetchArticle` / `fetchCategoryPages`.
 * @param opts Parsed import options.
 * @param log  Progress sink; receives one message per step.
 * @returns Root doc id plus the number of articles and categories processed.
 * @throws Error when the entry article cannot be found.
 */
async function runStreaming(
  dm: DocumentManager,
  wp: WikipediaClient,
  opts: WikiOptions,
  log: (msg: string) => void,
): Promise<StreamResult> {
  // Title → docId map. Drives [[Title]] → [[docId|label]] rewriting at write time.
  const titleToDocId = new Map<string, string>()
  // Snapshots of articles we've already fetched (avoid re-fetching).
  const fetched = new Map<string, ExtractedArticle>()
  // Articles whose section/infobox children have been created (split mode).
  const childrenCreated = new Set<string>()
  // Categories whose shells have been created.
  const categoryToDocId = new Map<string, string>()
  let categoriesContainerId: string | null = null

  // ── Fetch entry first; we need its title to label the root graph ─────
  log(`fetch ${opts.title}`)
  const entryDoc = await wp.fetchArticle(opts.title)
  if (!entryDoc) {
    throw new Error(`Article not found: "${opts.title}"`)
  }
  const entryTitle = canonicalTitle(entryDoc.title?.() ?? opts.title)
  const entrySnap = snapshotArticle(entryDoc, entryTitle)
  fetched.set(entryTitle, entrySnap)

  // ── Step 1: create root graph doc (visible immediately) ──────────────
  const rootEntry = dm.tree.create({
    parentId: opts.parentDocId ?? null,
    label: entryTitle,
    type: 'graph',
    meta: { icon: ICONS.graph },
  })
  log(`+ ${rootEntry.id.slice(0, 8)}… ${entryTitle} (graph)`)

  // ── Step 2: create the entry article shell ───────────────────────────
  const entryArticleId = createArticleShell(dm, entrySnap, rootEntry.id, log)
  titleToDocId.set(entryTitle, entryArticleId)

  // Queue of (title, depth) to process. Each entry is guaranteed to have
  // a shell doc already in titleToDocId.
  const queue: Array<{ title: string; depth: number }> = [{ title: entryTitle, depth: 0 }]
  let articleCount = 0

  // ── Step 3: streaming process ───────────────────────────────────────
  // Iterate the queue in place instead of `shift()`-ing it: the array iterator
  // re-reads the length on every step, so items pushed inside the loop are
  // still visited in FIFO order, and each dequeue is O(1) rather than O(n).
  for (const { title, depth } of queue) {
    const articleDocId = titleToDocId.get(title)
    // Invariant: a shell is allocated before a title is queued.
    if (articleDocId === undefined) continue

    // Ensure we've fetched this article.
    let snap = fetched.get(title)
    if (!snap) {
      log(`fetch [d${depth}] ${title}`)
      try {
        const doc = await wp.fetchArticle(title)
        if (!doc) {
          log(` not found — leaving stub`)
          continue
        }
        snap = snapshotArticle(doc, canonicalTitle(doc.title?.() ?? title))
        fetched.set(title, snap)
      } catch (err: any) {
        log(`! fetch failed: ${err?.message ?? err}`)
        continue
      }
    }

    // Create this article's section/infobox children (split mode only,
    // and only once per article).
    if (opts.mode === 'split' && !childrenCreated.has(title)) {
      createArticleChildren(dm, snap, articleDocId, log)
      childrenCreated.add(title)
    }

    // Discover new linked titles and pre-allocate shells immediately.
    if (depth < opts.depth) {
      for (const linkTitle of snap.linkTitles) {
        if (titleToDocId.has(linkTitle)) continue
        const shell = dm.tree.create({
          parentId: rootEntry.id,
          label: linkTitle,
          type: 'doc',
          meta: { icon: ICONS.article },
        })
        titleToDocId.set(linkTitle, shell.id)
        queue.push({ title: linkTitle, depth: depth + 1 })
        log(`+ ${shell.id.slice(0, 8)}… ${linkTitle} (doc, shell)`)
      }
    }

    // Pre-allocate category shells when first discovered.
    if (opts.includeCategories && snap.categories.length > 0) {
      if (!categoriesContainerId) {
        const c = dm.tree.create({
          parentId: rootEntry.id,
          label: 'Categories',
          type: 'graph',
          meta: { icon: ICONS.categories },
        })
        categoriesContainerId = c.id
        log(`+ ${c.id.slice(0, 8)}… Categories (graph)`)
      }
      for (const catTitle of snap.categories) {
        if (categoryToDocId.has(catTitle)) continue
        const cat = dm.tree.create({
          parentId: categoriesContainerId,
          label: prettyCategoryLabel(catTitle),
          type: 'graph',
          meta: { icon: ICONS.category },
        })
        categoryToDocId.set(catTitle, cat.id)
        log(`+ ${cat.id.slice(0, 8)}… ${prettyCategoryLabel(catTitle)} (graph, cat)`)
      }
    }

    // Write this article's body NOW (links resolve to whatever shells we
    // have allocated so far — that's all of this article's links since we
    // just allocated them above).
    const body =
      opts.mode === 'split' ? renderArticleLead(snap) : renderArticleSingleDoc(snap)
    if (body.trim().length > 0) {
      const rewritten = rewriteLinks(body, titleToDocId)
      try {
        await dm.content.write(articleDocId, rewritten)
        log(`✓ body ${title}`)
      } catch (err: any) {
        log(`! body write failed for ${title}: ${err?.message ?? err}`)
      }
    }

    // In split mode, also write each section/infobox doc body.
    if (opts.mode === 'split') {
      await writeChildrenBodies(dm, snap, articleDocId, titleToDocId, log)
    }

    articleCount++
  }

  // ── Step 4: fill in category bodies ─────────────────────────────────
  let categoryCount = 0
  if (opts.includeCategories && categoryToDocId.size > 0) {
    for (const [catTitle, catDocId] of categoryToDocId) {
      log(`category ${catTitle}`)
      try {
        const members = await wp.fetchCategoryPages(
          catTitle,
          opts.categoryDepth > 0,
          Math.max(0, opts.categoryDepth),
        )
        const memberArticles: string[] = []
        const subcats: string[] = []
        for (const m of members) {
          if (m.type === 'subcat') subcats.push(prettyCategoryLabel(m.title))
          else memberArticles.push(m.title)
        }
        const body = renderCategoryBody(memberArticles, subcats)
        const rewritten = rewriteLinks(body, titleToDocId)
        if (rewritten.trim().length > 0) {
          await dm.content.write(catDocId, rewritten)
          log(`✓ body category ${catTitle}`)
        }
        // Counted even when the rendered body is empty: the fetch succeeded.
        categoryCount++
      } catch (err: any) {
        log(`! category ${catTitle}: ${err?.message ?? err}`)
      }
    }
  }

  return { rootDocId: rootEntry.id, articleCount, categoryCount }
}
|
|
310
|
+
|
|
311
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
312
|
+
// Shell + body helpers
|
|
313
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
314
|
+
|
|
315
|
+
/**
 * Create an empty `doc` tree entry for an article and log the new id.
 *
 * The entry carries the article icon and, when the snapshot has one, the
 * article URL in its meta. No body is written here.
 *
 * @param dm       Document manager whose tree receives the entry.
 * @param article  Snapshot supplying the label and optional URL.
 * @param parentId Doc id the shell is created under.
 * @param log      Progress sink.
 * @returns Id of the created entry.
 */
function createArticleShell(
  dm: DocumentManager,
  article: ExtractedArticle,
  parentId: string,
  log: (msg: string) => void,
): string {
  const shellMeta: Record<string, unknown> = { icon: ICONS.article }
  if (article.url) {
    shellMeta.url = article.url
  }

  const { id } = dm.tree.create({
    parentId,
    label: article.title,
    type: 'doc',
    meta: shellMeta,
  })

  log(`+ ${id.slice(0, 8)}… ${article.title} (doc)`)
  return id
}
|
|
332
|
+
|
|
333
|
+
/**
 * Create the child docs of a split-mode article: one `outline` doc for the
 * infobox (when the article has infobox rows) followed by one shell per
 * top-level section (which recurses into sub-sections).
 *
 * Nothing is returned and no bodies are written; the infobox doc id is stashed
 * on the article object as `_infoboxDocId` so writeChildrenBodies can find it
 * later without querying the tree.
 */
function createArticleChildren(
  dm: DocumentManager,
  article: ExtractedArticle,
  articleDocId: string,
  log: (msg: string) => void,
): void {
  const hasInfobox = Boolean(article.infobox && article.infobox.length > 0)
  if (hasInfobox) {
    const infoboxEntry = dm.tree.create({
      parentId: articleDocId,
      label: 'Infobox',
      type: 'outline',
      meta: { icon: ICONS.infobox },
    })
    log(` + ${infoboxEntry.id.slice(0, 8)}… Infobox (outline)`)
    // Side channel read back by writeChildrenBodies.
    ;(article as any)._infoboxDocId = infoboxEntry.id
  }

  article.sections.forEach((section) => {
    createSectionShell(dm, section, articleDocId, log)
  })
}
|
|
359
|
+
|
|
360
|
+
/**
 * Recursively create tree entries for a section and its sub-sections.
 *
 * A section with a blank body and no children is skipped entirely. Otherwise
 * an entry is created under `parentDocId` with the type/icon chosen by
 * `pickSectionType`, its id is stashed on the section object as `_docId`
 * (read back by writeSectionBody), and each child is created beneath it.
 */
function createSectionShell(
  dm: DocumentManager,
  section: ExtractedSection,
  parentDocId: string,
  log: (msg: string) => void,
): void {
  const bodyIsBlank = !section.body.trim()
  const subsections = section.children
  if (bodyIsBlank && subsections.length === 0) return

  const { type, icon } = pickSectionType(section)
  const label = section.title || 'Untitled section'
  const entry = dm.tree.create({ parentId: parentDocId, label, type, meta: { icon } })
  log(` + ${entry.id.slice(0, 8)}… ${entry.label} (${type})`)

  // Side channel read back when the body is written.
  ;(section as any)._docId = entry.id

  subsections.forEach((child) => {
    createSectionShell(dm, child, entry.id, log)
  })
}
|
|
381
|
+
|
|
382
|
+
/**
 * Write the bodies of a split-mode article's child docs: first the infobox
 * (if a doc id was recorded for it and the article has infobox rows), then
 * every section in order, one write at a time.
 *
 * A failed infobox write is logged and does not stop the section writes.
 *
 * @param dm            Document manager used for `content.write`.
 * @param article       Snapshot whose children were created earlier.
 * @param _articleDocId Unused; kept for signature compatibility.
 * @param titleToDocId  Title → docId map used to rewrite wiki links.
 * @param log           Progress sink for failures.
 */
async function writeChildrenBodies(
  dm: DocumentManager,
  article: ExtractedArticle,
  _articleDocId: string,
  titleToDocId: Map<string, string>,
  log: (msg: string) => void,
): Promise<void> {
  const infoboxDocId = (article as any)._infoboxDocId as string | undefined
  const infoboxRows = article.infobox

  if (infoboxDocId && infoboxRows && infoboxRows.length > 0) {
    try {
      const infoboxBody = renderInfoboxBody(infoboxRows)
      await dm.content.write(infoboxDocId, infoboxBody)
    } catch (err: any) {
      log(`! infobox body write failed: ${err?.message ?? err}`)
    }
  }

  // Sequential on purpose: one write in flight at a time.
  for (const topLevelSection of article.sections) {
    await writeSectionBody(dm, topLevelSection, titleToDocId, log)
  }
}
|
|
401
|
+
|
|
402
|
+
/**
 * Write one section's body (with wiki links rewritten to doc ids) and then
 * recurse into its children, depth-first and sequentially.
 *
 * The write is skipped when no `_docId` was recorded for the section or its
 * body is blank; children are visited either way. Write failures are logged
 * and do not stop the traversal.
 */
async function writeSectionBody(
  dm: DocumentManager,
  section: ExtractedSection,
  titleToDocId: Map<string, string>,
  log: (msg: string) => void,
): Promise<void> {
  const sectionDocId = (section as any)._docId as string | undefined

  if (sectionDocId && section.body.trim().length > 0) {
    try {
      const linkedBody = rewriteLinks(section.body, titleToDocId)
      await dm.content.write(sectionDocId, linkedBody)
    } catch (err: any) {
      log(`! section body write failed for ${section.title}: ${err?.message ?? err}`)
    }
  }

  for (const subsection of section.children) {
    await writeSectionBody(dm, subsection, titleToDocId, log)
  }
}
|
|
420
|
+
|
|
421
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
422
|
+
// Argument parsing + dry-run printing
|
|
423
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
424
|
+
|
|
425
|
+
/**
 * Turn parsed CLI arguments into `WikiOptions`, or return a printable error
 * message when a required value is missing or invalid.
 *
 * - Title: first positional (trimmed), falling back to `title=...`.
 * - User agent: `user-agent=` / `userAgent=` param, falling back to the
 *   `ABRA_WIKI_USER_AGENT` environment variable; required.
 * - Mode: `single` or `split` (default `split`); anything else is an error.
 * - `depth` / `category-depth` default to 1, `rate` to 3, `lang` to `en`.
 *
 * @param args Parsed arguments.
 * @returns The options object, or an error string describing what is wrong.
 */
function parseOptions(args: ParsedArgs): WikiOptions | string {
  const title = args.positional[0]?.trim() || args.params['title']
  if (!title) return 'Missing required positional argument: <title>. Example: abracadabra-wiki "Toronto Raptors"'

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const env = (globalThis as any).process?.env ?? {}
  const userAgent = args.params['user-agent'] || args.params['userAgent'] || env['ABRA_WIKI_USER_AGENT']
  if (!userAgent) {
    return [
      'Missing required parameter: user-agent="your-name (you@example.com)"',
      '(Wikimedia etiquette requires an Api-User-Agent header. Pass user-agent=... or set ABRA_WIKI_USER_AGENT.)',
    ].join('\n')
  }

  // Validate the raw string first and let narrowing produce the ExtractMode,
  // instead of asserting the type before it has been checked.
  const rawMode = args.params['mode'] ?? 'split'
  if (rawMode !== 'single' && rawMode !== 'split') {
    return `Invalid mode "${rawMode}". Use mode=single or mode=split.`
  }
  const mode: ExtractMode = rawMode

  const depth = parseIntOr(args.params['depth'], 1)
  const categoryDepth = parseIntOr(args.params['category-depth'] ?? args.params['categoryDepth'], 1)
  const rate = parseFloatOr(args.params['rate'], 3)

  return {
    title,
    mode,
    depth,
    categoryDepth,
    includeCategories: args.flags.has('include-categories') || args.flags.has('includeCategories'),
    // `||` rather than `??`: an empty `lang=` is not a usable language code.
    lang: args.params['lang'] || 'en',
    domain: args.params['domain'],
    parentDocId: args.params['parent'],
    userAgent,
    rate,
    dryRun: args.flags.has('dry-run') || args.flags.has('dryRun'),
  }
}
|
|
462
|
+
|
|
463
|
+
/**
 * Parse a base-10, non-negative integer from a CLI value.
 *
 * @param s        Raw value; `undefined` or empty yields the fallback.
 * @param fallback Returned when the value is missing, not a number, not
 *                 finite, or negative.
 * @returns The parsed integer, or `fallback`.
 */
function parseIntOr(s: string | undefined, fallback: number): number {
  if (s === undefined || s === '') return fallback

  const parsed = Number.parseInt(s, 10)
  if (!Number.isFinite(parsed) || parsed < 0) {
    return fallback
  }
  return parsed
}
|
|
468
|
+
|
|
469
|
+
/**
 * Parse a strictly positive, finite number from a CLI value.
 *
 * @param s        Raw value; `undefined` or empty yields the fallback.
 * @param fallback Returned when the value is missing, not a number, not
 *                 finite, zero, or negative.
 * @returns The parsed number, or `fallback`.
 */
function parseFloatOr(s: string | undefined, fallback: number): number {
  if (s === undefined || s === '') return fallback

  const parsed = Number.parseFloat(s)
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallback
  }
  return parsed
}
|
|
474
|
+
|
|
475
|
+
/**
 * Render a section tree as an indented bullet outline for dry-run output.
 *
 * Each line is `<indent>- <title>`, followed by ` (<n>b)` when the section has
 * a body (n = body length) and ` [<k> sub]` when it has k children. Children
 * are rendered recursively with one extra level of indent.
 *
 * @param sections Sections at the current level.
 * @param indent   Prefix for every line at this level.
 * @returns The outline joined with newlines; empty string for no sections.
 */
function printSections(sections: ExtractedSection[], indent: string): string {
  return sections
    .flatMap((section) => {
      const childCount = section.children.length
      const sizeHint = section.body ? ` (${section.body.length}b)` : ''
      const subHint = childCount > 0 ? ` [${childCount} sub]` : ''
      const headline = `${indent}- ${section.title}${sizeHint}${subHint}`
      return childCount > 0
        ? [headline, printSections(section.children, indent + ' ')]
        : [headline]
    })
    .join('\n')
}
|
|
486
|
+
|
|
487
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
488
|
+
// Bin entry
|
|
489
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
490
|
+
|
|
491
|
+
/**
 * Bin entry point: parse `process.argv`, print the usage text when help is
 * requested or no title was supplied, otherwise run the import and print
 * whatever non-empty string it returns to stdout.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv)

  const wantsHelp = args.flags.has('help') || args.flags.has('h')
  const hasTitle = Boolean(args.positional[0]?.trim() || args.params['title'])
  if (wantsHelp || !hasTitle) {
    console.log(USAGE)
    return
  }

  const output = await runWiki(args)
  if (output) {
    console.log(output)
  }
}

// Anything that escapes main() is fatal: report it on stderr and exit non-zero.
main().catch((error) => {
  console.error(`Fatal: ${error?.message ?? error}`)
  process.exit(1)
})
|
package/src/parser.ts
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal argument parser for the `abracadabra-wiki` bin.
|
|
3
|
+
*
|
|
4
|
+
* Single-purpose: there is no subcommand — the first bare word is the article
|
|
5
|
+
* title (`positional[0]`). Supports `key=value`, `key="value with spaces"`,
|
|
6
|
+
* and `--flag` / `--key=value`.
|
|
7
|
+
*
|
|
8
|
+
* abracadabra-wiki "<Article Title>" [key=value ...] [--flags]
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
export interface ParsedArgs {
  /** Positional arguments — `positional[0]` is the article title. */
  positional: string[]
  /** Key-value parameters, e.g. `{ lang: "en", "user-agent": "..." }`. */
  params: Record<string, string>
  /** Boolean flags, e.g. `dry-run`, `include-categories`, `quiet`. */
  flags: Set<string>
}

/**
 * Remove one pair of matching surrounding quotes (`"…"` or `'…'`), if present.
 * A lone quote character is left untouched.
 */
function stripMatchingQuotes(value: string): string {
  if (value.length < 2) return value
  const first = value[0]
  const last = value[value.length - 1]
  const isQuoted = (first === '"' || first === "'") && first === last
  return isQuoted ? value.slice(1, -1) : value
}

/**
 * Parse CLI arguments into a structured object.
 *
 * Recognised forms, checked in this order for every argument:
 *  - `--key=value`  → `params[key] = value` (surrounding quotes stripped)
 *  - `--flag`       → `flags` gains `flag`
 *  - `-x`           → `flags` gains `x` (single ASCII letter, e.g. `-h`, `-q`)
 *  - `key=value`    → `params[key] = value` (surrounding quotes stripped)
 *  - anything else  → pushed onto `positional`
 *
 * @param argv Raw `process.argv` (includes node path and script path).
 * @returns The positional arguments, key/value params, and flags found.
 */
export function parseArgs(argv: string[]): ParsedArgs {
  const result: ParsedArgs = { positional: [], params: {}, flags: new Set() }

  // Skip node + script.
  for (const arg of argv.slice(2)) {
    // --flag or --key=value
    if (arg.startsWith('--')) {
      const stripped = arg.slice(2)
      const eqIdx = stripped.indexOf('=')
      if (eqIdx !== -1) {
        result.params[stripped.slice(0, eqIdx)] = stripMatchingQuotes(stripped.slice(eqIdx + 1))
      } else {
        result.flags.add(stripped)
      }
      continue
    }

    // Short flag such as -h / -q. Without this branch `-h` would be treated
    // as the article title.
    if (/^-[A-Za-z]$/.test(arg)) {
      result.flags.add(arg.slice(1))
      continue
    }

    // key=value pair (a leading `=` is not a key, so require eqIdx > 0)
    const eqIdx = arg.indexOf('=')
    if (eqIdx > 0) {
      result.params[arg.slice(0, eqIdx)] = stripMatchingQuotes(arg.slice(eqIdx + 1))
      continue
    }

    // Every bare word is positional (no subcommand for this single-purpose bin).
    result.positional.push(arg)
  }

  return result
}
|
package/src/render.ts
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Body rendering + page-type decisions for the streaming orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* All rendering is title-driven: bodies are rendered with `[[Title]]`
|
|
5
|
+
* placeholders, and `rewriteLinks` rewrites them to `[[docId|label]]`
|
|
6
|
+
* using the live title→docId map at write time.
|
|
7
|
+
*/
|
|
8
|
+
import type { ExtractedArticle, ExtractedSection } from './types.ts'
|
|
9
|
+
|
|
10
|
+
/**
 * Icon names attached to created docs via `meta.icon`, keyed by the role the
 * doc plays in the imported graph.
 */
export const ICONS = {
  // Containers
  graph: 'git-fork',
  // Article docs and their shells
  article: 'book-open',
  // A single category graph
  category: 'tag',
  // Infobox child doc
  infobox: 'info',
  // Sections rendered as outlines
  outline: 'list',
  gallery: 'images',
  // Plain section docs
  section: 'pilcrow',
  // The "Categories" container graph
  categories: 'tags',
} as const
|
|
20
|
+
|
|
21
|
+
/**
 * Choose the page type and icon for a section from its shape.
 *
 * A section becomes an `outline` when it has sub-sections, or when it is a
 * list with at least five items; every other section is a plain `doc`.
 */
export function pickSectionType(section: ExtractedSection): { type: string; icon: string } {
  const hasSubsections = section.children.length > 0
  const isLongList = section.isList && section.listLength >= 5

  return hasSubsections || isLongList
    ? { type: 'outline', icon: ICONS.outline }
    : { type: 'doc', icon: ICONS.section }
}
|
|
27
|
+
|
|
28
|
+
/**
 * Body for an article doc in split mode: the article's lead text, or an empty
 * string when the snapshot has no lead.
 */
export function renderArticleLead(article: ExtractedArticle): string {
  const { lead } = article
  return lead == null ? '' : lead
}
|
|
32
|
+
|
|
33
|
+
/**
 * Body for an article doc in single mode: lead, then an `## Infobox` block
 * (when there are infobox rows), then every section inlined starting at
 * heading level 2. Parts are separated by a blank line.
 */
export function renderArticleSingleDoc(article: ExtractedArticle): string {
  const { lead, infobox, sections } = article

  const leadPart = lead ? [lead] : []
  const infoboxPart =
    infobox && infobox.length > 0 ? ['## Infobox', renderInfoboxBody(infobox)] : []
  const sectionParts = sections.flatMap((section) => renderSectionInline(section, 2))

  return [...leadPart, ...infoboxPart, ...sectionParts].join('\n\n')
}
|
|
45
|
+
|
|
46
|
+
/**
 * Flatten a section and its descendants into markdown fragments.
 *
 * Emits a heading (when the section has a title) using `level` `#` characters
 * capped at six, then the body (when it is not blank), then the fragments of
 * each child one level deeper.
 *
 * @param section Section to render.
 * @param level   Heading level for this section.
 * @returns Fragments in document order; the caller joins them.
 */
function renderSectionInline(section: ExtractedSection, level: number): string[] {
  const hashes = '#'.repeat(Math.min(6, level))

  const heading = section.title ? [`${hashes} ${section.title}`] : []
  const body = section.body.trim() ? [section.body] : []
  const nested = section.children.flatMap((child) => renderSectionInline(child, level + 1))

  return [...heading, ...body, ...nested]
}
|
|
56
|
+
|
|
57
|
+
/**
 * Render infobox rows as a markdown bullet list, one `- **key:** value` line
 * per row, joined with newlines.
 *
 * @param rows Key/value pairs in display order.
 * @returns The list; empty string when there are no rows.
 */
export function renderInfoboxBody(rows: Array<{ key: string; value: string }>): string {
  const bullets: string[] = []
  for (const { key, value } of rows) {
    bullets.push(`- **${key}:** ${value}`)
  }
  return bullets.join('\n')
}
|
|
60
|
+
|
|
61
|
+
/**
 * Render a category doc body.
 *
 * Member pages are listed under `## Pages` as `[[Title]]` placeholders;
 * sub-categories are listed under `## Sub-categories` as plain text. A heading
 * is emitted only when its list is non-empty.
 *
 * @param members       Titles of member pages.
 * @param subcategories Display labels of sub-categories.
 * @returns Markdown with blocks separated by a blank line; empty string when
 *          both lists are empty.
 */
export function renderCategoryBody(members: string[], subcategories: string[]): string {
  const bulletList = (items: string[], render: (item: string) => string): string =>
    items.map(render).join('\n')

  const pagesBlock =
    members.length > 0 ? ['## Pages', bulletList(members, (title) => `- [[${title}]]`)] : []
  const subcatsBlock =
    subcategories.length > 0
      ? ['## Sub-categories', bulletList(subcategories, (label) => `- ${label}`)]
      : []

  return [...pagesBlock, ...subcatsBlock].join('\n\n')
}
|
|
73
|
+
|
|
74
|
+
/**
 * Rewrite wiki-style links in markdown to doc-id links.
 *
 * Every `[[Title]]` or `[[Title|Alias]]` whose trimmed title is a key of
 * `titleToDocId` becomes `[[docId|label]]`, where the label is the trimmed
 * alias if it is non-blank and the trimmed title otherwise. Links whose title
 * is not in the map are replaced by the bare label.
 *
 * @param markdown     Text containing `[[…]]` placeholders.
 * @param titleToDocId Title → docId lookup (exact match on the trimmed title).
 * @returns The markdown with all `[[…]]` placeholders rewritten.
 */
export function rewriteLinks(
  markdown: string,
  titleToDocId: Map<string, string>,
): string {
  const wikiLink = /\[\[([^\]|]+?)(?:\|([^\]]+?))?\]\]/g

  const replaceLink = (_whole: string, target: string, alias?: string): string => {
    const title = target.trim()
    const hasAlias = Boolean(alias && alias.trim().length > 0)
    const label = (hasAlias ? (alias as string) : title).trim()

    const docId = titleToDocId.get(title)
    if (docId) {
      return `[[${docId}|${label}]]`
    }
    // Unresolved: degrade to plain text rather than leave a dangling link.
    return label
  }

  return markdown.replace(wikiLink, replaceLink)
}
|