@abraca/cli 2.25.0 → 2.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/abracadabra-cli.cjs +1 -696
- package/dist/abracadabra-cli.cjs.map +1 -1
- package/dist/abracadabra-cli.esm.js +2 -695
- package/dist/abracadabra-cli.esm.js.map +1 -1
- package/package.json +3 -5
- package/src/index.ts +0 -5
- package/src/commands/wiki/connect.ts +0 -69
- package/src/commands/wiki/index.ts +0 -471
- package/src/commands/wiki/render.ts +0 -91
- package/src/commands/wiki/snapshot.ts +0 -210
- package/src/commands/wiki/types.ts +0 -45
- package/src/commands/wiki/wikipedia.ts +0 -154
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Snapshot a wtf_wikipedia Document into a plain-data shape that's easy to
|
|
3
|
-
* work with downstream. No BFS / no plan-building here — just a pure read
|
|
4
|
-
* of one parsed page.
|
|
5
|
-
*/
|
|
6
|
-
import type { ExtractedArticle, ExtractedSection } from './types.ts'
|
|
7
|
-
import { canonicalTitle, isCategoryTitle, stripCategoryPrefix } from './wikipedia.ts'
|
|
8
|
-
|
|
9
|
-
export { canonicalTitle, isCategoryTitle, stripCategoryPrefix }
|
|
10
|
-
|
|
11
|
-
/**
 * Build a plain-data snapshot of one parsed page.
 *
 * Optional accessors on `doc` (`sections`, `infobox`, `url`) are probed before
 * use, so a document lacking them still yields a complete record.
 *
 * @param doc   Parsed page object (untyped here; presumably a wtf_wikipedia Document).
 * @param title Title to store on the snapshot, recorded exactly as given.
 * @returns The extracted article; `url` is null when `doc.url` is not a function.
 */
export function snapshotArticle(doc: any, title: string): ExtractedArticle {
  const linkTitles = collectLinkTitles(doc)
  const categories = collectCategories(doc)
  const sections = snapshotSections(doc.sections?.() ?? [])
  const infobox = snapshotInfobox(doc.infobox?.())
  const lead = leadParagraph(doc)
  const url = typeof doc.url === 'function' ? doc.url() : null

  return { title, linkTitles, categories, sections, infobox, lead, url }
}
|
|
22
|
-
|
|
23
|
-
/**
 * Display label for a category title: the title passed through
 * `stripCategoryPrefix`.
 */
export function prettyCategoryLabel(catTitle: string): string {
  const label = stripCategoryPrefix(catTitle)
  return label
}
|
|
26
|
-
|
|
27
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
28
|
-
// Link / category extraction
|
|
29
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
30
|
-
|
|
31
|
-
/**
 * Gather the distinct, canonicalised page titles linked from `doc`.
 *
 * Links that are falsy, have no callable `page()`, yield a non-string or empty
 * page, or point at a category title are skipped. First-seen order is kept.
 */
function collectLinkTitles(doc: any): string[] {
  const unique = new Set<string>()
  const allLinks = doc.links?.() ?? []

  for (const link of allLinks) {
    const target = link && typeof link.page === 'function' ? link.page() : null
    const usable = typeof target === 'string' && target.length > 0 && !isCategoryTitle(target)
    if (usable) unique.add(canonicalTitle(target))
  }

  return Array.from(unique)
}
|
|
43
|
-
|
|
44
|
-
/**
 * Canonicalise every category reported by `doc.categories()` and drop the
 * ones that normalise to an empty string. Duplicates are NOT removed.
 */
function collectCategories(doc: any): string[] {
  const reported = (doc.categories?.() as string[] | undefined) ?? []
  return Array.from(reported, (name) => canonicalTitle(name)).filter((name) => Boolean(name))
}
|
|
52
|
-
|
|
53
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
54
|
-
// Sections — flatten wtf's parent-child references into a real tree
|
|
55
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
56
|
-
|
|
57
|
-
/**
 * Turn a flat list of raw sections into a tree of ExtractedSection.
 *
 * A section is attached to its parent when `section.parent()` returns an
 * object that is itself present in `rawSections`; otherwise it becomes a root.
 * Each child is materialised at the moment it is attached, but the resulting
 * object keeps a reference to the same `children` array, so descendants that
 * are attached later still appear in the final tree.
 */
function snapshotSections(rawSections: any[]): ExtractedSection[] {
  type Pending = { raw: any; title: string; parentRef: any; children: ExtractedSection[] }

  const pending: Pending[] = []
  const byRawSection = new Map<any, Pending>()
  for (const section of rawSections) {
    const entry: Pending = {
      raw: section,
      title: section.title?.() || '',
      parentRef: typeof section.parent === 'function' ? section.parent() : null,
      children: [],
    }
    pending.push(entry)
    byRawSection.set(section, entry)
  }

  const topLevel: Pending[] = []
  for (const entry of pending) {
    const parent = entry.parentRef ? byRawSection.get(entry.parentRef) : undefined
    if (parent) {
      parent.children.push(materialize(entry))
    } else {
      topLevel.push(entry)
    }
  }

  return topLevel.map((entry) => materialize(entry))
}
|
|
78
|
-
|
|
79
|
-
/**
 * Convert one pending section node into its final ExtractedSection shape.
 *
 * - `listLength` is the total number of lines over all lists in the section.
 * - `isList` is true when the section has at least one list and either has no
 *   paragraphs or has at least twice as many list lines as paragraphs.
 * - `body` is every non-empty paragraph (as markdown) followed by every
 *   non-empty list line rendered as `- text`, separated by blank lines.
 * - `children` is passed through by reference, not copied.
 */
function materialize(node: {
  raw: any
  title: string
  children: ExtractedSection[]
}): ExtractedSection {
  const rawLists: any[] = node.raw.lists?.() ?? []
  const rawParagraphs: any[] = node.raw.paragraphs?.() ?? []

  const listLength: number = rawLists.reduce(
    (total: number, list: any) => total + (list.lines?.() ?? []).length,
    0,
  )
  const hasLists = rawLists.length > 0
  const listHeavy = rawParagraphs.length === 0 || listLength >= rawParagraphs.length * 2
  const isList = hasLists && listHeavy

  const paragraphBlocks: string[] = rawParagraphs
    .map((paragraph) => paragraphMarkdown(paragraph))
    .filter((md) => Boolean(md))

  const bulletBlocks: string[] = []
  for (const list of rawLists) {
    for (const line of (list.lines?.() ?? []) as any[]) {
      const text = lineText(line)
      if (text) bulletBlocks.push(`- ${text}`)
    }
  }

  return {
    title: node.title,
    body: [...paragraphBlocks, ...bulletBlocks].join('\n\n'),
    isList,
    listLength,
    children: node.children,
  }
}
|
|
116
|
-
|
|
117
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
118
|
-
// Infobox
|
|
119
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
120
|
-
|
|
121
|
-
/**
 * Flatten an infobox into `{ key, value }` rows.
 *
 * Reads `box.json()` (when callable), stringifies each entry, skips entries
 * whose string form is empty, and humanises the remaining keys.
 *
 * @returns The rows, or `undefined` when there is no box, no object-shaped
 *          JSON, or no non-empty row.
 */
function snapshotInfobox(box: any | null | undefined): Array<{ key: string; value: string }> | undefined {
  const data = box && typeof box.json === 'function' ? box.json() : null
  if (!data || typeof data !== 'object') return undefined

  const rows: Array<{ key: string; value: string }> = []
  Object.keys(data).forEach((field) => {
    const value = stringifyInfoboxValue(data[field])
    if (value) rows.push({ key: humanKey(field), value })
  })

  return rows.length === 0 ? undefined : rows
}
|
|
133
|
-
|
|
134
|
-
/**
 * Best-effort string form of one infobox value.
 *
 * - null / undefined → ''
 * - string → itself; number / boolean → `String(val)`
 * - array → non-empty string forms of its items joined with ', '
 * - object → its string `text`, else its numeric `number`, else ''
 * - anything else → ''
 */
function stringifyInfoboxValue(val: unknown): string {
  if (val === null || val === undefined) return ''

  switch (typeof val) {
    case 'string':
      return val
    case 'number':
    case 'boolean':
      return String(val)
    case 'object': {
      if (Array.isArray(val)) {
        const parts: string[] = []
        for (const item of val) {
          const piece = stringifyInfoboxValue(item)
          if (piece) parts.push(piece)
        }
        return parts.join(', ')
      }
      const fields = val as Record<string, unknown>
      // `text` wins over `number` when both are present and well-typed.
      if (typeof fields.text === 'string') return fields.text
      if (typeof fields.number === 'number') return String(fields.number)
      return ''
    }
    default:
      return ''
  }
}
|
|
148
|
-
|
|
149
|
-
/** Turn a snake_case key into a label: underscores become spaces, first character is upper-cased. */
function humanKey(k: string): string {
  const spaced = k.split('_').join(' ')
  if (spaced.length === 0) return spaced
  return spaced.charAt(0).toUpperCase() + spaced.slice(1)
}
|
|
152
|
-
|
|
153
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
154
|
-
// Markdown rendering
|
|
155
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
156
|
-
|
|
157
|
-
/**
 * Markdown for the first paragraph reported by `doc.paragraphs()`, or '' when
 * the document has no (truthy) first paragraph.
 */
function leadParagraph(doc: any): string {
  const opening = (doc.paragraphs?.() ?? [])[0]
  return opening ? paragraphMarkdown(opening) : ''
}
|
|
163
|
-
|
|
164
|
-
/**
 * Render a paragraph as markdown with internal links written as `[[Title]]`
 * (or `[[Title|label]]`). Sentences are rendered individually, joined with a
 * single space, and the result is trimmed.
 *
 * Per the original note, a later link-rewriting step (not visible here) is
 * expected to swap `[[Title]]` for `[[docId|label]]` once IDs are known.
 */
function paragraphMarkdown(paragraph: any): string {
  const sentences = paragraph.sentences?.() ?? []
  const rendered: string[] = Array.from(sentences, (sentence) => sentenceWithWikilinks(sentence))
  return rendered.join(' ').trim()
}
|
|
177
|
-
|
|
178
|
-
/**
 * Render one sentence as text with its internal links rewritten to wikilink
 * markup: `[[Page]]` when the visible text equals the canonical page title,
 * `[[Page|visible text]]` otherwise.
 *
 * Links without a usable string page, and links to category titles, are
 * ignored. Longer visible texts are placed first so a long label wins over a
 * shorter one it contains.
 *
 * Each link claims the first occurrence of its visible text in the ORIGINAL
 * sentence that does not overlap a range already claimed by another link, and
 * all replacements are spliced in one pass. This avoids two problems of
 * replacing in the progressively rewritten string:
 *   - a shorter or repeated label matching inside markup inserted earlier
 *     (producing nested output such as `[[[[New York]] City]]`);
 *   - `$`-sequences in a title (`$&`, `$1`, `$$`) being interpreted as
 *     replacement patterns by `String.prototype.replace`.
 */
function sentenceWithWikilinks(sentence: any): string {
  const text: string = (sentence.text?.() ?? '').toString()
  const links: any[] = sentence.links?.() ?? []
  if (links.length === 0) return text

  const candidates = links
    .map((l: any) => {
      const page = typeof l.page === 'function' ? l.page() : null
      const display = typeof l.text === 'function' ? l.text() : null
      if (typeof page !== 'string' || page.length === 0) return null
      if (isCategoryTitle(page)) return null
      const shown: string = typeof display === 'string' && display.length > 0 ? display : page
      return { page: canonicalTitle(page), shown }
    })
    .filter((x): x is { page: string; shown: string } => x !== null)
    .sort((a, b) => b.shown.length - a.shown.length)

  // Ranges of the original text that have been assigned to a link.
  const claimed: Array<{ start: number; end: number; markup: string }> = []

  for (const { page, shown } of candidates) {
    const markup = shown === page ? `[[${page}]]` : `[[${page}|${shown}]]`
    let from = 0
    for (;;) {
      const start = text.indexOf(shown, from)
      if (start === -1) break // label not present (any more): leave this link out
      const end = start + shown.length
      const clash = claimed.find((c) => start < c.end && c.start < end)
      if (clash === undefined) {
        claimed.push({ start, end, markup })
        break
      }
      // Every occurrence starting before clash.end would overlap it too.
      from = clash.end
    }
  }

  claimed.sort((a, b) => a.start - b.start)

  let result = ''
  let cursor = 0
  for (const c of claimed) {
    result += text.slice(cursor, c.start) + c.markup
    cursor = c.end
  }
  return result + text.slice(cursor)
}
|
|
203
|
-
|
|
204
|
-
/**
 * Text of one list line. Accepts a plain string, an object with a string
 * `text` property, or an object with a `text()` method; anything else
 * (including falsy input) yields ''.
 */
function lineText(line: any): string {
  if (!line) return ''
  if (typeof line === 'string') return line

  switch (typeof line.text) {
    case 'string':
      return line.text
    case 'function':
      return line.text()
    default:
      return ''
  }
}
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Internal types for the Wikipedia extractor command.
|
|
3
|
-
*/
|
|
4
|
-
|
|
5
|
-
/** Extraction mode selector; exactly one of the two literals below. */
export type ExtractMode =
  | 'single'
  | 'split'
|
|
6
|
-
|
|
7
|
-
/**
 * Options for the Wikipedia extractor command.
 * NOTE(review): field meanings below are inferred from names where marked;
 * the consuming command is not visible from this file — confirm against it.
 */
export interface WikiOptions {
  /** Title of the page to extract (presumably the starting article). */
  title: string
  mode: ExtractMode

  /** Presumably the link-following depth — confirm against the command. */
  depth: number
  includeCategories: boolean
  /** Presumably the sub-category traversal depth — confirm against the command. */
  categoryDepth: number

  lang: string
  domain?: string
  parentDocId?: string

  userAgent: string
  /** Presumably requests per second — confirm against the command. */
  rate: number

  /** Print plan-tree to stderr without writing (no server connection). */
  dryRun: boolean
}
|
|
21
|
-
|
|
22
|
-
/** A simplified article snapshot extracted from a wtf_wikipedia Document. */
export interface ExtractedArticle {
  title: string

  /** Wikipedia internal links (page titles) referenced by this article. */
  linkTitles: string[]

  /** Categories this article belongs to. */
  categories: string[]

  /** Top-level sections; each section has nested subsections. */
  sections: ExtractedSection[]

  /** Infobox key-value rows (first infobox only). */
  infobox?: { key: string; value: string }[]

  /**
   * Lead text for the single-doc body. As produced by the snapshot code this
   * is the first paragraph only, rendered as markdown with `[[Title]]`
   * wikilinks rather than plain text.
   */
  lead: string

  /** Source URL on Wikipedia. */
  url: string | null
}
|
|
38
|
-
|
|
39
|
-
/** One section of an extracted article, with its subsections nested under it. */
export interface ExtractedSection {
  /** Section heading. */
  title: string
  /** Rendered section content. */
  body: string

  /** Whether the section is considered list-dominated. */
  isList: boolean
  /** Number of list lines counted for the section. */
  listLength: number

  /** Nested subsections. */
  children: ExtractedSection[]
}
|
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Rate-limited wrapper around wtf_wikipedia + wtf-plugin-api.
|
|
3
|
-
*
|
|
4
|
-
* Responsibilities:
|
|
5
|
-
* - Throttle requests to respect Wikimedia API etiquette
|
|
6
|
-
* - Cache parsed Documents by canonical title
|
|
7
|
-
* - Resolve redirects so callers always see the redirect target
|
|
8
|
-
* - Expose getCategoryPages via wtf-plugin-api
|
|
9
|
-
*/
|
|
10
|
-
// @ts-ignore — wtf_wikipedia ships its own types but they are imprecise; we
|
|
11
|
-
// cast Link/Section APIs as needed below.
|
|
12
|
-
import wtf from 'wtf_wikipedia'
|
|
13
|
-
// @ts-ignore — wtf-plugin-api is JS-only with no types; treat as opaque.
|
|
14
|
-
import wtfApiPlugin from 'wtf-plugin-api'
|
|
15
|
-
|
|
16
|
-
// Whether wtf-plugin-api has already been registered on the wtf export.
let pluginExtended = false

/**
 * Register wtf-plugin-api on `wtf` so getCategoryPages becomes available.
 * Runs lazily on first call (not at module load) and is a no-op afterwards.
 * The flag is only set after `extend` returns, so a throwing `extend` will be
 * retried on the next call.
 */
function ensurePlugin(): void {
  if (!pluginExtended) {
    // @ts-ignore — extend is dynamically attached to the wtf default export.
    wtf.extend(wtfApiPlugin)
    pluginExtended = true
  }
}
|
|
24
|
-
|
|
25
|
-
/** Settings for constructing a WikipediaClient. */
export interface WikipediaClientConfig {
  /** Language passed to the fetch calls as `lang`. */
  lang: string
  /** Optional domain; forwarded to the fetch calls only when set. */
  domain?: string

  /** Sent with requests as the `Api-User-Agent` option. */
  userAgent: string

  /** Max requests per second. */
  rate: number
}
|
|
32
|
-
|
|
33
|
-
/** Option bag handed to `wtf.fetch`; every field is optional. */
interface FetchOpts {
  lang?: string
  domain?: string

  // Quoted because the key contains hyphens.
  'Api-User-Agent'?: string

  follow_redirects?: boolean
}
|
|
39
|
-
|
|
40
|
-
/**
 * Throttle that spaces calls at least `intervalMs` apart, in call order.
 *
 * Each `wait()` reserves its release slot synchronously, BEFORE sleeping, so
 * callers that arrive concurrently are queued one interval apart. (Updating
 * the bookkeeping only after the sleep lets every concurrent caller read the
 * same stale timestamp and be released at once.)
 */
class RateLimiter {
  /** Release time (ms since epoch) of the most recently reserved slot. */
  private lastTickMs = 0
  constructor(private intervalMs: number) {}

  /** Resolve once this caller's slot has arrived; resolves immediately if the slot is now. */
  async wait(): Promise<void> {
    const now = Date.now()
    const slot = Math.max(now, this.lastTickMs + this.intervalMs)
    // Reserve before awaiting so the next caller queues behind this one.
    this.lastTickMs = slot
    if (slot > now) {
      await new Promise<void>((resolve) => setTimeout(resolve, slot - now))
    }
  }
}
|
|
54
|
-
|
|
55
|
-
/**
 * Throttled, caching client for fetching parsed articles and category
 * listings through wtf_wikipedia (+ wtf-plugin-api).
 */
export class WikipediaClient {
  /** Parsed documents keyed by their resolved canonical title. */
  private cache = new Map<string, any>()
  /** Canonical requested title → canonical title it was found to resolve to. */
  private redirects = new Map<string, string>()
  private limiter: RateLimiter
  private fetchOpts: FetchOpts

  constructor(private config: WikipediaClientConfig) {
    ensurePlugin()
    // `rate` is clamped to >= 0.1 req/s and the spacing to >= 50 ms.
    const intervalMs = Math.max(50, Math.floor(1000 / Math.max(0.1, config.rate)))
    this.limiter = new RateLimiter(intervalMs)
    this.fetchOpts = {
      lang: config.lang,
      'Api-User-Agent': config.userAgent,
      follow_redirects: true,
    }
    if (config.domain) this.fetchOpts.domain = config.domain
  }

  /**
   * Fetch and parse a Wikipedia article.
   * - Returns the cached Document if this title (or a title it is known to
   *   redirect to, through any number of recorded hops) was seen before.
   * - Caches the Document under its resolved title and records a redirect
   *   entry from the requested title when the two differ.
   * - Returns null when the fetch yields nothing, or when the title has a
   *   recorded redirect whose chain does not end at a cached Document.
   *
   * @throws Error when the underlying fetch throws.
   */
  async fetchArticle(rawTitle: string): Promise<any | null> {
    const title = canonicalTitle(rawTitle)
    if (this.cache.has(title)) return this.cache.get(title)
    if (this.redirects.has(title)) {
      return this.followRedirects(title) ?? null
    }

    await this.limiter.wait()
    let doc: any
    try {
      doc = await (wtf as any).fetch(title, this.fetchOpts)
    } catch (err: any) {
      throw new Error(`Wikipedia fetch failed for "${title}": ${err?.message ?? err}`)
    }
    if (!doc) return null

    // wtf usually follows redirects automatically when follow_redirects=true,
    // but defensively handle the case where it surfaces a redirect doc.
    if (typeof doc.isRedirect === 'function' && doc.isRedirect()) {
      const target = doc.redirectTo?.()?.page
      if (typeof target === 'string') {
        // Recorded before recursing so a redirect cycle ends at the lookup above.
        this.redirects.set(title, canonicalTitle(target))
        return await this.fetchArticle(target)
      }
    }

    const resolvedTitle = canonicalTitle(doc.title?.() ?? title)
    this.cache.set(resolvedTitle, doc)
    if (resolvedTitle !== title) this.redirects.set(title, resolvedTitle)
    return doc
  }

  /**
   * Walk recorded redirects from `title` until a cached Document is found.
   * A single-hop lookup misses chains (A→B recorded, B→C recorded, Document
   * cached under C), so the whole chain is followed; `seen` guards against
   * cycles.
   *
   * @returns The cached Document, or undefined if the chain ends or loops
   *          without reaching one.
   */
  private followRedirects(title: string): any | undefined {
    const seen = new Set<string>()
    let current = title
    while (!seen.has(current)) {
      if (this.cache.has(current)) return this.cache.get(current)
      seen.add(current)
      const next = this.redirects.get(current)
      if (next === undefined) return undefined
      current = next
    }
    return undefined
  }

  /**
   * Fetch the member pages of a category (and optionally sub-categories).
   * Results are not cached; every call waits on the rate limiter first.
   *
   * @param category  Category title (with or without "Category:" prefix).
   * @param recursive Whether to traverse sub-categories.
   * @param maxDepth  Recursion depth when recursive=true.
   * @returns Members with canonicalised titles; any member whose `type` is
   *          not 'subcat' is reported as 'page'.
   */
  async fetchCategoryPages(
    category: string,
    recursive: boolean,
    maxDepth: number,
  ): Promise<Array<{ title: string; type: 'page' | 'subcat' }>> {
    await this.limiter.wait()
    const opts: Record<string, unknown> = {
      lang: this.config.lang,
      'Api-User-Agent': this.config.userAgent,
      recursive,
      maxDepth,
    }
    if (this.config.domain) opts.domain = this.config.domain
    // @ts-ignore — getCategoryPages is attached at runtime via wtf.extend.
    const list: any[] = await wtf.getCategoryPages(category, opts)
    return (list ?? []).map((m) => ({
      title: canonicalTitle(m.title),
      type: m.type === 'subcat' ? 'subcat' : 'page',
    }))
  }
}
|
|
139
|
-
|
|
140
|
-
/**
 * Normalize a Wikipedia title: underscores become spaces, runs of whitespace
 * collapse to one space, and the ends are trimmed. Null/undefined input
 * yields ''. Colons are left untouched.
 */
export function canonicalTitle(s: string): string {
  const text = (s ?? '').toString()
  const spaced = text.split('_').join(' ')
  return spaced.replace(/\s+/g, ' ').trim()
}
|
|
144
|
-
|
|
145
|
-
/**
 * Category namespace prefixes recognised at the very start of a title,
 * matched case-insensitively and including the trailing colon.
 */
const CATEGORY_PREFIX = /^(Category|Catégorie|Kategorie|Categoría|Categoria|Categorie|Kategoria):/i

/** Detect a category-namespaced title (one that starts with a recognised prefix). */
export function isCategoryTitle(title: string): boolean {
  const hit = CATEGORY_PREFIX.exec(title)
  return hit !== null
}
|
|
150
|
-
|
|
151
|
-
/**
 * Strip a leading category namespace prefix (e.g. "Category:") for display
 * and trim the result. Titles without such a prefix are only trimmed.
 */
export function stripCategoryPrefix(title: string): string {
  const prefix = CATEGORY_PREFIX.exec(title)
  const remainder = prefix === null ? title : title.slice(prefix[0].length)
  return remainder.trim()
}
|