@refract-org/ingestion 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -0
- package/dist/src/index.d.ts +49 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +5 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/mediawiki-client.d.ts +24 -0
- package/dist/src/mediawiki-client.d.ts.map +1 -0
- package/dist/src/mediawiki-client.js +292 -0
- package/dist/src/mediawiki-client.js.map +1 -0
- package/dist/src/rate-limiter.d.ts +8 -0
- package/dist/src/rate-limiter.d.ts.map +1 -0
- package/dist/src/rate-limiter.js +28 -0
- package/dist/src/rate-limiter.js.map +1 -0
- package/dist/src/wikidata-mapper.d.ts +29 -0
- package/dist/src/wikidata-mapper.d.ts.map +1 -0
- package/dist/src/wikidata-mapper.js +138 -0
- package/dist/src/wikidata-mapper.js.map +1 -0
- package/dist/src/xml-dump-source.d.ts +8 -0
- package/dist/src/xml-dump-source.d.ts.map +1 -0
- package/dist/src/xml-dump-source.js +77 -0
- package/dist/src/xml-dump-source.js.map +1 -0
- package/dist/tsconfig 2.tsbuildinfo +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +28 -0
- package/src/__tests__/auth-integration.test.ts +59 -0
- package/src/__tests__/integration.test.ts +95 -0
- package/src/__tests__/mediawiki-client.test.ts +113 -0
- package/src/__tests__/page-move.test.ts +31 -0
- package/src/__tests__/rate-limiter.test.ts +30 -0
- package/src/__tests__/talk-page.test.ts +46 -0
- package/src/__tests__/wikidata-mapper.test.ts +134 -0
- package/src/__tests__/xml-dump-source.test.ts +151 -0
- package/src/index.ts +63 -0
- package/src/mediawiki-client.ts +420 -0
- package/src/rate-limiter.ts +29 -0
- package/src/wikidata-mapper.ts +197 -0
- package/src/xml-dump-source.ts +89 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import type { Revision } from "@refract-org/evidence-graph";
|
|
3
|
+
import type { RevisionOptions, RevisionSource } from "./index.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Decodes XML character escapes in a string.
 *
 * Handles the five predefined XML entities (amp, lt, gt, quot, apos) as well
 * as decimal (`&#NNN;`) and hexadecimal (`&#xHHH;`) character references.
 *
 * All references are decoded in a single pass over the input, so the output
 * of one substitution is never re-scanned. This prevents double decoding:
 * an escaped ampersand followed by `lt;` yields the literal text of an `lt`
 * entity rather than a `<` character.
 *
 * @param text - Raw text taken from an XML document.
 * @returns The text with every recognised reference replaced by the character
 *   it denotes. Unrecognised references, and numeric references that are not
 *   valid Unicode code points, are left exactly as they appear in the input.
 */
function decodeXmlEntities(text: string): string {
  const namedEntities: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };

  return text.replace(
    /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (whole: string, decimal?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        return namedEntities[name] ?? whole;
      }
      const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex ?? "", 16);
      // String.fromCodePoint throws a RangeError above U+10FFFF; keep such
      // malformed references verbatim instead of aborting the whole decode.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}
|
|
14
|
+
|
|
15
|
+
/**
 * Locates the first `<page>` element whose title equals `title`.
 *
 * Pages are scanned in document order. For each page the first `<title>`
 * element is read, entity-decoded, and compared to `title` with strict
 * equality.
 *
 * @param text - Full XML text to search.
 * @param title - Page title to look for, in decoded form.
 * @returns The inner XML of the matching page (everything between the
 *   `<page>` and `</page>` tags), or `null` when no page matches.
 */
function extractPageXml(text: string, title: string): string | null {
  const pagePattern = /<page>([\s\S]*?)<\/page>/g;

  for (let page = pagePattern.exec(text); page !== null; page = pagePattern.exec(text)) {
    const pageBody = page[1];
    const titleHit = pageBody.match(/<title>(.*?)<\/title>/);
    if (!titleHit) {
      continue;
    }
    if (decodeXmlEntities(titleHit[1]) === title) {
      return pageBody;
    }
  }

  return null;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Parses the `<revision>` elements of a single page into `Revision` records.
 *
 * Revisions are returned in document order. A revision is skipped when it has
 * no numeric `<id>` or no `<timestamp>`, or when its timestamp falls outside
 * the optional `start`/`end` window.
 *
 * @param pageXml - Inner XML of one `<page>` element.
 * @param pageTitle - Title copied onto every returned revision.
 * @param pageId - Page id copied onto every returned revision.
 * @param options - Optional filters: `start` and `end` bound the revision
 *   timestamp (a revision exactly on a bound is kept), and `limit` caps the
 *   number of revisions returned. A falsy `limit` (including `0`) means no cap.
 * @returns The matching revisions. `comment` and `content` are entity-decoded;
 *   `size` is the length of the decoded content as a JavaScript string
 *   (UTF-16 code units), not a byte count; `minor` is `true` when the revision
 *   contains a `<minor/>` element.
 */
function extractRevisions(pageXml: string, pageTitle: string, pageId: number, options?: RevisionOptions): Revision[] {
  const revs: Revision[] = [];
  const revisionPattern = /<revision>([\s\S]*?)<\/revision>/g;

  for (let hit = revisionPattern.exec(pageXml); hit !== null; hit = revisionPattern.exec(pageXml)) {
    const xml = hit[1];

    // The first <id> inside a revision is taken as the revision id.
    const idMatch = xml.match(/<id>(\d+)<\/id>/);
    const timestampMatch = xml.match(/<timestamp>(.*?)<\/timestamp>/);
    if (!idMatch || !timestampMatch) continue;

    // Apply the date window before decoding comment/text so that revisions
    // which are filtered out cost no entity-decoding work.
    const ts = timestampMatch[1];
    const when = new Date(ts);
    if (options?.start && when < options.start) continue;
    if (options?.end && when > options.end) continue;

    const commentRaw = xml.match(/<comment>(.*?)<\/comment>/);
    const comment = commentRaw ? decodeXmlEntities(commentRaw[1]) : "";

    const textMatch = xml.match(/<text[^>]*>([\s\S]*?)<\/text>/);
    const content = textMatch ? decodeXmlEntities(textMatch[1]) : "";

    // Minor edits are flagged by an empty <minor/> element. Literal "<" in
    // comment or text is escaped in well-formed XML, so a raw <minor tag can
    // only be the element itself.
    const minor = /<minor\s*(?:\/>|>\s*<\/minor>)/.test(xml);

    revs.push({
      revId: parseInt(idMatch[1], 10),
      pageId,
      pageTitle,
      timestamp: ts,
      comment,
      content,
      size: content.length,
      minor,
    });

    if (options?.limit && revs.length >= options.limit) break;
  }

  return revs;
}
|
|
68
|
+
|
|
69
|
+
/**
 * A `RevisionSource` that reads revisions from an XML dump file on disk.
 */
export class XmlDumpRevisionSource implements RevisionSource {
  /** Path of the dump file; read on every call to `revisions`. */
  private filePath: string;

  /**
   * @param filePath - Location of the XML dump file.
   */
  constructor(filePath: string) {
    this.filePath = filePath;
  }

  /**
   * Yields the revisions of one page from the dump.
   *
   * The whole file is read synchronously as UTF-8 each time this generator is
   * started. Nothing is yielded when no page with the given title is found.
   * The page id is taken from the first `<id>` element inside the page, and
   * defaults to `0` when there is none.
   *
   * @param pageTitle - Title of the page whose revisions are wanted.
   * @param options - Optional filters forwarded to the revision extraction.
   */
  async *revisions(pageTitle: string, options?: RevisionOptions): AsyncIterable<Revision> {
    const dumpText = readFileSync(this.filePath, "utf-8");
    const page = extractPageXml(dumpText, pageTitle);
    if (!page) {
      return;
    }

    const firstId = page.match(/<id>(\d+)<\/id>/);
    let pageId = 0;
    if (firstId) {
      pageId = parseInt(firstId[1], 10);
    }

    yield* extractRevisions(page, pageTitle, pageId, options);
  }
}
|