@abraca/cli 2.26.0 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,210 +0,0 @@
1
- /**
2
- * Snapshot a wtf_wikipedia Document into a plain-data shape that's easy to
3
- * work with downstream. No BFS / no plan-building here — just a pure read
4
- * of one parsed page.
5
- */
6
- import type { ExtractedArticle, ExtractedSection } from './types.ts'
7
- import { canonicalTitle, isCategoryTitle, stripCategoryPrefix } from './wikipedia.ts'
8
-
9
- export { canonicalTitle, isCategoryTitle, stripCategoryPrefix }
10
-
11
/**
 * Capture one parsed document as a plain `ExtractedArticle`.
 *
 * Each field is read through the document's accessor methods; accessors that
 * are missing on `doc` fall back to an empty list, `undefined` or `null`.
 *
 * @param doc   Parsed document object (untyped; accessed defensively).
 * @param title Title to record on the snapshot; stored as given.
 * @returns The plain-data snapshot.
 */
export function snapshotArticle(doc: any, title: string): ExtractedArticle {
  const linkTitles = collectLinkTitles(doc)
  const categories = collectCategories(doc)
  const sections = snapshotSections(doc.sections?.() ?? [])
  const infobox = snapshotInfobox(doc.infobox?.())
  const lead = leadParagraph(doc)
  // `url` is only called when it really is a method; otherwise the field is null.
  const url = typeof doc.url === 'function' ? doc.url() : null

  return { title, linkTitles, categories, sections, infobox, lead, url }
}
22
-
23
/**
 * Display label for a category title: the title with its category prefix
 * removed by `stripCategoryPrefix`.
 *
 * @param catTitle Category title, with or without the prefix.
 * @returns The label without the prefix.
 */
export function prettyCategoryLabel(catTitle: string): string {
  const label = stripCategoryPrefix(catTitle)
  return label
}
26
-
27
- // ─────────────────────────────────────────────────────────────────────────
28
- // Link / category extraction
29
- // ─────────────────────────────────────────────────────────────────────────
30
-
31
/**
 * Gather the distinct canonical page titles linked from a document.
 *
 * Skips falsy link entries, links without a non-empty string `page()`, and
 * links whose target is a category title. First-seen order is preserved.
 *
 * @param doc Parsed document object; `links()` is optional.
 * @returns De-duplicated canonical titles.
 */
function collectLinkTitles(doc: any): string[] {
  const seen = new Set<string>()
  for (const link of doc.links?.() ?? []) {
    const target = link && typeof link.page === 'function' ? link.page() : null
    const usable = typeof target === 'string' && target.length > 0 && !isCategoryTitle(target)
    if (usable) {
      seen.add(canonicalTitle(target))
    }
  }
  return Array.from(seen)
}
43
-
44
/**
 * List the document's categories as canonical titles.
 *
 * Entries that normalize to an empty string are dropped; duplicates are kept.
 *
 * @param doc Parsed document object; `categories()` is optional.
 * @returns Canonical category titles in document order.
 */
function collectCategories(doc: any): string[] {
  const raw = (doc.categories?.() as string[] | undefined) ?? []
  return [...raw]
    .map((category) => canonicalTitle(category))
    .filter((normalized) => Boolean(normalized))
}
52
-
53
- // ─────────────────────────────────────────────────────────────────────────
54
- // Sections — flatten wtf's parent-child references into a real tree
55
- // ─────────────────────────────────────────────────────────────────────────
56
-
57
/**
 * Turn a flat list of raw sections into a tree of `ExtractedSection`s.
 *
 * A section is nested under its parent when its `parent()` result is one of
 * the objects in `rawSections` (matched by identity); otherwise it becomes a
 * root. Children are materialized as they are attached, but each materialized
 * node shares its `children` array with the working entry, so descendants
 * attached later still show up.
 *
 * @param rawSections Raw section objects for one document.
 * @returns The root sections, each with nested children.
 */
function snapshotSections(rawSections: any[]): ExtractedSection[] {
  const entries = rawSections.map((section) => {
    const title: string = section.title?.() || ''
    const parentRef = typeof section.parent === 'function' ? section.parent() : null
    return { raw: section, title, parentRef, children: [] as ExtractedSection[] }
  })
  type Entry = (typeof entries)[number]

  // Identity lookup from a raw section object to its working entry.
  const lookup = new Map<any, Entry>()
  entries.forEach((entry) => {
    lookup.set(entry.raw, entry)
  })

  const topLevel: Entry[] = []
  for (const entry of entries) {
    const parent = entry.parentRef ? lookup.get(entry.parentRef) : undefined
    if (parent) {
      parent.children.push(materialize(entry))
    } else {
      topLevel.push(entry)
    }
  }
  return topLevel.map((entry) => materialize(entry))
}
78
-
79
/**
 * Convert one working section entry into its final `ExtractedSection`.
 *
 * The body is every non-empty paragraph (rendered via `paragraphMarkdown`)
 * followed by every non-empty list line prefixed with `- `, joined by blank
 * lines. `listLength` counts all list lines, including empty ones. A section
 * counts as a list when it has at least one list and either no paragraphs or
 * at least twice as many list lines as paragraphs.
 *
 * @param node Working entry; `children` is passed through by reference.
 * @returns The plain-data section.
 */
function materialize(node: {
  raw: any
  title: string
  children: ExtractedSection[]
}): ExtractedSection {
  const lists = node.raw.lists?.() ?? []
  const paragraphs = node.raw.paragraphs?.() ?? []

  // Read each list's lines once; they feed both the count and the body.
  const linesPerList: any[][] = []
  let listLength = 0
  for (const list of lists) {
    const lines = (list.lines?.() ?? []) as any[]
    linesPerList.push(lines)
    listLength += lines.length
  }

  const listHeavy = paragraphs.length === 0 || listLength >= paragraphs.length * 2
  const isList = lists.length > 0 && listHeavy

  const bodyParts: string[] = []
  for (const paragraph of paragraphs) {
    const rendered = paragraphMarkdown(paragraph)
    if (rendered) bodyParts.push(rendered)
  }
  for (const lines of linesPerList) {
    for (const line of lines) {
      const text = lineText(line)
      if (text) bodyParts.push(`- ${text}`)
    }
  }

  return {
    title: node.title,
    body: bodyParts.join('\n\n'),
    isList,
    listLength,
    children: node.children,
  }
}
116
-
117
- // ─────────────────────────────────────────────────────────────────────────
118
- // Infobox
119
- // ─────────────────────────────────────────────────────────────────────────
120
-
121
/**
 * Flatten an infobox into display rows.
 *
 * Reads `box.json()`, stringifies each value, drops entries whose value comes
 * out empty, and humanizes the remaining keys.
 *
 * @param box Infobox object, or null/undefined when the page has none.
 * @returns The rows, or `undefined` when there is no box, no object data, or
 *          no non-empty row.
 */
function snapshotInfobox(box: any | null | undefined): Array<{ key: string; value: string }> | undefined {
  const data = box && typeof box.json === 'function' ? box.json() : null
  if (!data || typeof data !== 'object') {
    return undefined
  }

  const rows = Object.entries(data)
    .map(([key, val]) => ({ key, value: stringifyInfoboxValue(val) }))
    .filter((row) => Boolean(row.value))
    .map((row) => ({ key: humanKey(row.key), value: row.value }))

  return rows.length > 0 ? rows : undefined
}
133
-
134
/**
 * Reduce an infobox value to a display string.
 *
 * - null / undefined → `''`
 * - string → unchanged; number / boolean → `String(value)`
 * - array → non-empty stringified items joined with `', '`
 * - object → its string `text`, else its numeric `number`, else `''`
 * - anything else → `''`
 *
 * @param val Arbitrary value from the infobox data.
 * @returns The display string (possibly empty).
 */
function stringifyInfoboxValue(val: unknown): string {
  if (val === null || val === undefined) return ''

  switch (typeof val) {
    case 'string':
      return val
    case 'number':
    case 'boolean':
      return String(val)
    case 'object': {
      if (Array.isArray(val)) {
        return val
          .map((item) => stringifyInfoboxValue(item))
          .filter((piece) => piece !== '')
          .join(', ')
      }
      const record = val as Record<string, unknown>
      // `text` wins over `number` when both are present.
      if (typeof record.text === 'string') return record.text
      return typeof record.number === 'number' ? String(record.number) : ''
    }
    default:
      return ''
  }
}
148
-
149
/**
 * Make a raw infobox key readable: underscores become spaces and the first
 * character is upper-cased.
 *
 * @param k Raw key, e.g. `birth_date`.
 * @returns The readable key, e.g. `Birth date`.
 */
function humanKey(k: string): string {
  const spaced = k.split('_').join(' ')
  return spaced.charAt(0).toUpperCase() + spaced.slice(1)
}
152
-
153
- // ─────────────────────────────────────────────────────────────────────────
154
- // Markdown rendering
155
- // ─────────────────────────────────────────────────────────────────────────
156
-
157
/**
 * Render the document's first paragraph via `paragraphMarkdown`.
 *
 * @param doc Parsed document object; `paragraphs()` is optional.
 * @returns The rendered first paragraph, or `''` when there is none.
 */
function leadParagraph(doc: any): string {
  const opening = (doc.paragraphs?.() ?? [])[0]
  return opening ? paragraphMarkdown(opening) : ''
}
163
-
164
/**
 * Render a paragraph as markdown with internal links written as `[[Title]]`
 * (or `[[Title|label]]`).
 *
 * Each sentence is rendered by `sentenceWithWikilinks`; the results are joined
 * with single spaces and trimmed. Per the original note, a later link-rewriting
 * stage swaps `[[Title]]` for `[[docId|label]]` once IDs are known.
 *
 * @param paragraph Paragraph object; `sentences()` is optional.
 * @returns The rendered paragraph, possibly empty.
 */
function paragraphMarkdown(paragraph: any): string {
  const sentences = paragraph.sentences?.() ?? []
  const rendered: string[] = [...sentences].map((sentence) => sentenceWithWikilinks(sentence))
  return rendered.join(' ').trim()
}
177
-
178
/**
 * Render one sentence as text with its internal links rewritten as wikilinks.
 *
 * Each usable link (non-empty string `page()`, not a category title) turns one
 * occurrence of its visible label into `[[Page]]`, or `[[Page|label]]` when the
 * label differs from the canonical page title. Longer labels are placed first.
 *
 * Occurrences are located in the ORIGINAL sentence text and may not overlap a
 * span that another link already claimed. This avoids two defects of
 * replacing sequentially in the evolving string:
 *  - a repeated link, or a shorter label, matching inside markup that was
 *    already inserted (producing `[[[[X]]]]` or `[[New [[York]] City]]`);
 *  - `$`-sequences in a title being interpreted as replacement patterns.
 *
 * @param sentence Sentence object; `text()` and `links()` are optional.
 * @returns The sentence text with wikilink markup inserted.
 */
function sentenceWithWikilinks(sentence: any): string {
  const text: string = (sentence.text?.() ?? '').toString()
  const links = sentence.links?.() ?? []
  if (links.length === 0) return text

  const replacements: Array<{ page: string; shown: string }> = links
    .map((l: any) => {
      const page = typeof l?.page === 'function' ? l.page() : null
      const display = typeof l?.text === 'function' ? l.text() : null
      if (typeof page !== 'string' || page.length === 0) return null
      if (isCategoryTitle(page)) return null
      const shown = typeof display === 'string' && display.length > 0 ? display : page
      return { page: canonicalTitle(page), shown }
    })
    .filter((x: any): x is { page: string; shown: string } => x !== null)
    // Longest label first so "New York City" is claimed before "York".
    .sort((a: any, b: any) => b.shown.length - a.shown.length)

  // Spans of the original text already assigned to a link.
  const spans: Array<{ start: number; end: number; markup: string }> = []
  const isFree = (start: number, end: number): boolean =>
    spans.every((s) => end <= s.start || start >= s.end)

  for (const { page, shown } of replacements) {
    // First occurrence of the label that does not overlap a claimed span.
    let at = text.indexOf(shown)
    while (at !== -1 && !isFree(at, at + shown.length)) {
      at = text.indexOf(shown, at + 1)
    }
    if (at === -1) continue
    const markup = shown === page ? `[[${page}]]` : `[[${page}|${shown}]]`
    spans.push({ start: at, end: at + shown.length, markup })
  }

  // Stitch the untouched text and the markup back together, left to right.
  spans.sort((a, b) => a.start - b.start)
  let result = ''
  let cursor = 0
  for (const span of spans) {
    result += text.slice(cursor, span.start) + span.markup
    cursor = span.end
  }
  return result + text.slice(cursor)
}
203
-
204
/**
 * Extract the text of a list line.
 *
 * Accepts a plain string, an object with a string `text` property, or an
 * object with a `text()` method; anything else (including falsy input)
 * yields `''`.
 *
 * @param line List line in any of the shapes above.
 * @returns The line's text, or `''`.
 */
function lineText(line: any): string {
  if (typeof line === 'string') return line
  if (!line) return ''

  switch (typeof line.text) {
    case 'string':
      return line.text
    case 'function':
      // Called as a method so `this` stays bound to the line object.
      return line.text()
    default:
      return ''
  }
}
@@ -1,45 +0,0 @@
1
- /**
2
- * Internal types for the Wikipedia extractor command.
3
- */
4
-
5
/**
 * Output layout selected for an extraction run.
 * NOTE(review): the exact meaning of each mode is defined by the command
 * implementation, which is not visible here — presumably one document per
 * article vs. an article split across several documents; confirm.
 */
export type ExtractMode =
  | 'single'
  | 'split'
6
-
7
/**
 * Options for one run of the Wikipedia extractor command.
 * NOTE(review): field semantics marked "presumably" are inferred from names
 * only; confirm against the command implementation.
 */
export interface WikiOptions {
  /** Title of the article to start from. */
  title: string
  /** Output layout, see `ExtractMode`. */
  mode: ExtractMode
  /** Presumably how many link hops to follow from the starting article. */
  depth: number
  /** Presumably whether category pages are included in the extraction. */
  includeCategories: boolean
  /** Presumably how deep to descend into sub-categories. */
  categoryDepth: number
  /** Wiki language code. */
  lang: string
  /** Optional wiki domain override. */
  domain?: string
  /** Optional ID of the document to attach results under — TODO confirm. */
  parentDocId?: string
  /** User-Agent string to identify the client to the API. */
  userAgent: string
  /** Request rate; presumably requests per second. */
  rate: number
  /** Print plan-tree to stderr without writing (no server connection). */
  dryRun: boolean
}
21
-
22
/** Plain-data snapshot of one parsed wtf_wikipedia Document. */
export interface ExtractedArticle {
  /** Title recorded for the article. */
  title: string
  /** Distinct page titles of the article's internal links (category links excluded). */
  linkTitles: string[]
  /** Titles of the categories the article belongs to. */
  categories: string[]
  /** Top-level sections; subsections are nested under `children`. */
  sections: ExtractedSection[]
  /** Infobox rows as key/value pairs; absent when there is no usable infobox. */
  infobox?: { key: string; value: string }[]
  /** First paragraph of the article, rendered with `[[Title]]` wikilinks. */
  lead: string
  /** Source URL on Wikipedia, or null when the document exposes none. */
  url: string | null
}
38
-
39
/** One section of an article, with its subsections nested beneath it. */
export interface ExtractedSection {
  /** Section heading; empty string when the section has no title. */
  title: string
  /** Rendered section content: paragraphs followed by `- ` list lines. */
  body: string
  /** True when the section is predominantly list content. */
  isList: boolean
  /** Total number of list lines in the section. */
  listLength: number
  /** Direct subsections. */
  children: ExtractedSection[]
}
@@ -1,154 +0,0 @@
1
- /**
2
- * Rate-limited wrapper around wtf_wikipedia + wtf-plugin-api.
3
- *
4
- * Responsibilities:
5
- * - Throttle requests to respect Wikimedia API etiquette
6
- * - Cache parsed Documents by canonical title
7
- * - Resolve redirects so callers always see the redirect target
8
- * - Expose getCategoryPages via wtf-plugin-api
9
- */
10
- // @ts-ignore — wtf_wikipedia ships its own types but they are imprecise; we
11
- // cast Link/Section APIs as needed below.
12
- import wtf from 'wtf_wikipedia'
13
- // @ts-ignore — wtf-plugin-api is JS-only with no types; treat as opaque.
14
- import wtfApiPlugin from 'wtf-plugin-api'
15
-
16
// Registers the API plugin on wtf the first time it is needed (not at module
// load), so getCategoryPages becomes available. The flag makes later calls no-ops.
let pluginExtended = false
function ensurePlugin(): void {
  if (!pluginExtended) {
    // @ts-ignore — extend is dynamically attached to the wtf default export.
    wtf.extend(wtfApiPlugin)
    // Only flipped after extend() returns, so a throwing extend is retried next time.
    pluginExtended = true
  }
}
24
-
25
/** Settings for constructing a `WikipediaClient`. */
export interface WikipediaClientConfig {
  /** Wiki language code, forwarded as the `lang` request option. */
  lang: string
  /** Optional wiki domain, forwarded as the `domain` request option when set. */
  domain?: string
  /** Sent as the `Api-User-Agent` request option. */
  userAgent: string
  /** Max requests per second. */
  rate: number
}
32
-
33
/** Options bag handed to `wtf.fetch` for every article request. */
interface FetchOpts {
  /** Wiki language code. */
  lang?: string
  /** Wiki domain override. */
  domain?: string
  /** Identifies this client to the API. */
  'Api-User-Agent'?: string
  /** Ask the library to resolve redirects itself. */
  follow_redirects?: boolean
}
39
-
40
/**
 * A simple FIFO throttle: consecutive `wait()` calls are released at least
 * `intervalMs` apart.
 *
 * Each call reserves its release slot synchronously, before sleeping. The
 * earlier version recorded the slot only after the sleep finished, so callers
 * that arrived while another was still sleeping all computed the same release
 * time and fired together.
 */
class RateLimiter {
  /** Release time (epoch ms) of the most recently reserved slot. */
  private lastTickMs = 0
  constructor(private intervalMs: number) {}

  /**
   * Resolve once this caller's slot has arrived.
   *
   * @returns A promise that resolves immediately when the limiter is idle,
   *          otherwise after the delay needed to keep slots `intervalMs` apart.
   */
  async wait(): Promise<void> {
    const now = Date.now()
    const slot = Math.max(now, this.lastTickMs + this.intervalMs)
    // Reserve before awaiting so concurrent callers queue behind this slot.
    this.lastTickMs = slot
    if (slot > now) {
      await new Promise<void>((resolve) => setTimeout(resolve, slot - now))
    }
  }
}
54
-
55
/**
 * Throttled, caching client for fetching parsed Wikipedia articles and
 * category members through wtf_wikipedia.
 */
export class WikipediaClient {
  /** Parsed documents keyed by the canonical title the document reports. */
  private cache = new Map<string, any>()
  /** Requested canonical title → canonical title it resolved to. */
  private redirects = new Map<string, string>()
  private limiter: RateLimiter
  private fetchOpts: FetchOpts

  /**
   * @param config Language, optional domain, user agent and request rate.
   *   The interval between requests is `1000 / rate` ms, with `rate` floored
   *   at 0.1 and the interval floored at 50 ms. A `rate` that is not a number
   *   falls back to 1 request/second; previously NaN produced a NaN interval,
   *   which silently disabled throttling.
   */
  constructor(private config: WikipediaClientConfig) {
    ensurePlugin()
    const requestedRate = Number(config.rate)
    const rate = Number.isNaN(requestedRate) ? 1 : requestedRate
    const intervalMs = Math.max(50, Math.floor(1000 / Math.max(0.1, rate)))
    this.limiter = new RateLimiter(intervalMs)
    this.fetchOpts = {
      lang: config.lang,
      'Api-User-Agent': config.userAgent,
      follow_redirects: true,
    }
    if (config.domain) this.fetchOpts.domain = config.domain
  }

  /**
   * Fetch and parse a Wikipedia article.
   * - Returns the cached Document if we've seen this title before.
   * - Records redirects so a redirecting title resolves to the cached target.
   * - Returns null when the page does not exist, and also when a title is a
   *   known redirect whose target is not in the cache.
   *
   * @param rawTitle Article title in any spelling accepted by `canonicalTitle`.
   * @returns The parsed document, or null.
   * @throws Error wrapping the underlying failure when the fetch itself fails.
   */
  async fetchArticle(rawTitle: string): Promise<any | null> {
    const title = canonicalTitle(rawTitle)
    if (this.cache.has(title)) return this.cache.get(title)
    const knownTarget = this.redirects.get(title)
    if (knownTarget !== undefined) {
      return this.cache.get(knownTarget) ?? null
    }

    await this.limiter.wait()
    let doc: any
    try {
      doc = await (wtf as any).fetch(title, this.fetchOpts)
    } catch (err: any) {
      throw new Error(`Wikipedia fetch failed for "${title}": ${err?.message ?? err}`)
    }
    if (!doc) return null

    // wtf usually follows redirects automatically when follow_redirects=true,
    // but defensively handle the case where it surfaces a redirect doc.
    if (typeof doc.isRedirect === 'function' && doc.isRedirect()) {
      const target = doc.redirectTo?.()?.page
      if (typeof target === 'string') {
        // Recorded before recursing, so a redirect cycle ends in the
        // known-redirect branch above instead of looping forever.
        this.redirects.set(title, canonicalTitle(target))
        return await this.fetchArticle(target)
      }
    }

    const resolvedTitle = canonicalTitle(doc.title?.() ?? title)
    this.cache.set(resolvedTitle, doc)
    if (resolvedTitle !== title) this.redirects.set(title, resolvedTitle)
    return doc
  }

  /**
   * Fetch the member pages of a category (and optionally sub-categories).
   * @param category  Category title (with or without "Category:" prefix).
   * @param recursive Whether to traverse sub-categories.
   * @param maxDepth  Recursion depth when recursive=true.
   * @returns Members with canonical titles; any member type other than
   *          'subcat' is reported as 'page'.
   */
  async fetchCategoryPages(
    category: string,
    recursive: boolean,
    maxDepth: number,
  ): Promise<Array<{ title: string; type: 'page' | 'subcat' }>> {
    await this.limiter.wait()
    const opts: Record<string, unknown> = {
      lang: this.config.lang,
      'Api-User-Agent': this.config.userAgent,
      recursive,
      maxDepth,
    }
    if (this.config.domain) opts.domain = this.config.domain
    // @ts-ignore — getCategoryPages is attached at runtime via wtf.extend.
    const list: any[] = await wtf.getCategoryPages(category, opts)
    return (list ?? []).map((m) => ({
      title: canonicalTitle(m.title),
      type: m.type === 'subcat' ? 'subcat' : 'page',
    }))
  }
}
139
-
140
/**
 * Normalize a Wikipedia title: underscores become spaces, runs of whitespace
 * collapse to a single space, and the ends are trimmed. Colons are left as is.
 *
 * @param s Raw title; null/undefined are treated as the empty string and
 *          non-strings are converted with `toString()`.
 * @returns The normalized title.
 */
export function canonicalTitle(s: string): string {
  const raw = (s ?? '').toString()
  // One pass: any run of underscores and/or whitespace becomes a single space.
  const collapsed = raw.replace(/[_\s]+/g, ' ')
  return collapsed.trim()
}
144
-
145
/** Matches a leading category-namespace prefix in several languages, case-insensitively. */
const CATEGORY_PREFIX = /^(Category|Catégorie|Kategorie|Categoría|Categoria|Categorie|Kategoria):/i

/**
 * Detect a category-namespaced title.
 *
 * @param title Title to inspect; the prefix must be at the very start.
 * @returns True when the title begins with a known category prefix and colon.
 */
export function isCategoryTitle(title: string): boolean {
  const prefixMatch = CATEGORY_PREFIX.exec(title)
  return prefixMatch !== null
}
150
-
151
/**
 * Strip the "Category:" prefix (or a recognised localized equivalent) for
 * display, then trim surrounding whitespace.
 *
 * @param title Category title, with or without the prefix.
 * @returns The title without the prefix; titles without one are only trimmed.
 */
export function stripCategoryPrefix(title: string): string {
  const prefixMatch = CATEGORY_PREFIX.exec(title)
  // The pattern is anchored at the start, so a match always begins at index 0.
  const remainder = prefixMatch ? title.slice(prefixMatch[0].length) : title
  return remainder.trim()
}