synset 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -39
- package/dist/cli.cjs +534 -382
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +542 -383
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +301 -166
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +26 -7
- package/dist/index.d.ts +26 -7
- package/dist/index.js +307 -167
- package/dist/index.js.map +1 -1
- package/dist/schema.sql +22 -0
- package/package.json +10 -5
package/dist/index.js
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
// src/export-sqlite.ts
|
|
2
|
+
import { Database } from "bun:sqlite";
|
|
3
|
+
|
|
1
4
|
// src/types.ts
|
|
2
5
|
import { z } from "zod";
|
|
3
6
|
var LexiconId = z.string();
|
|
@@ -163,7 +166,290 @@ var Lexicon = z.object({
|
|
|
163
166
|
synsets: z.array(Synset).min(0),
|
|
164
167
|
syntacticBehaviors: z.array(SyntacticBehavior).min(0)
|
|
165
168
|
});
|
|
166
|
-
var partsOfSpeechList = PartsOfSpeech.options.map(
|
|
169
|
+
// The `value` of every entry in `PartsOfSpeech.options`, in declaration order.
var partsOfSpeechList = PartsOfSpeech.options.map(({ value }) => value);
|
|
172
|
+
|
|
173
|
+
// src/helpers.ts
|
|
174
|
+
/**
 * Converts a Pronunciation XML node into a validated object.
 *
 * Picks up the optional `variety` attribute and the node's inner text, folds
 * in every attribute not already captured, and hands the result to
 * `Pronunciation.parse`.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `Pronunciation.parse` returns for the collected fields
 */
function PronunciationNode(node) {
  const variety = optAttr(node, "variety");
  const inner = node.innerText;
  const fields = extendWithRestAttr(node, { variety, inner }, (name) => name);
  return Pronunciation.parse(fields);
}
|
|
181
|
+
/**
 * Converts a Lemma XML node into a validated object.
 *
 * `writtenForm` and `partOfSpeech` are required attributes (a missing one makes
 * `attr` throw); each `Pronunciation` child is converted by PronunciationNode.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Lemma.parse` returns for the collected fields
 */
function LemmaNode(node) {
  const writtenForm = attr(node, "writtenForm");
  const partOfSpeech = PartsOfSpeech.parse(attr(node, "partOfSpeech"));
  const pronunciations = children(node, "Pronunciation", PronunciationNode);
  const fields = extendWithRestAttr(
    node,
    { writtenForm, partOfSpeech, pronunciations },
    (name) => name
  );
  return Lemma.parse(fields);
}
|
|
192
|
+
/**
 * Converts a SenseRelation XML node into a validated object.
 *
 * `relType` and `target` are required; the optional `dc:type` attribute is
 * stored under the key `dcType`. Any other attribute is carried over under its
 * XML name.
 *
 * @param node - parsed XML node exposing `attributes`
 * @returns whatever `SenseRelation.parse` returns for the collected fields
 */
function SenseRelationNode(node) {
  const relType = SenseRelationRelType.parse(attr(node, "relType"));
  const target = attr(node, "target");
  const dcType = optAttr(node, "dc:type");
  // Maps an XML attribute name to the object key that already holds it.
  const toFieldName = (name) => (name === "dc:type" ? "dcType" : name);
  return SenseRelation.parse(
    extendWithRestAttr(node, { relType, target, dcType }, toFieldName)
  );
}
|
|
202
|
+
/**
 * Converts a Sense XML node into a validated object.
 *
 * Required attributes: `id`, `synset`. Optional attributes: `subcat` (stored as
 * `subCat`) and `adjposition` (stored as `adjPosition`, validated with
 * `AdjPosition.parse` only when it is a non-empty string). `SenseRelation`
 * children are converted by SenseRelationNode.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Sense.parse` returns for the collected fields
 */
function SenseNode(node) {
  const adjPos = optAttr(node, "adjposition");
  const id = attr(node, "id");
  const synset = SynsetId.parse(attr(node, "synset"));
  const senseRelations = children(node, "SenseRelation", SenseRelationNode);
  const subCat = optAttr(node, "subcat");
  const adjPosition = adjPos ? AdjPosition.parse(adjPos) : void 0;

  // Maps an XML attribute name to the object key that already holds it.
  const toFieldName = (name) => {
    if (name === "subcat") return "subCat";
    if (name === "adjposition") return "adjPosition";
    return name;
  };

  return Sense.parse(
    extendWithRestAttr(
      node,
      { id, synset, senseRelations, subCat, adjPosition },
      toFieldName
    )
  );
}
|
|
219
|
+
/**
 * Converts a Form XML node into a validated object.
 *
 * @param node - parsed XML node exposing `attributes`; `writtenForm` is required
 * @returns whatever `Form.parse` returns for the collected fields
 */
function FormNode(node) {
  const writtenForm = attr(node, "writtenForm");
  const fields = extendWithRestAttr(node, { writtenForm }, (name) => name);
  return Form.parse(fields);
}
|
|
225
|
+
/**
 * Converts a LexicalEntry XML node into a validated object.
 *
 * Reads the required `id` attribute and converts the `Lemma`, `Sense` and
 * `Form` children with their respective node converters.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `LexicalEntry.parse` returns for the collected fields
 */
function LexicalEntryNode(node) {
  const id = attr(node, "id");
  const lemmas = children(node, "Lemma", LemmaNode);
  const senses = children(node, "Sense", SenseNode);
  const forms = children(node, "Form", FormNode);
  const fields = extendWithRestAttr(
    node,
    { id, lemmas, senses, forms },
    (name) => name
  );
  return LexicalEntry.parse(fields);
}
|
|
234
|
+
/**
 * Converts a Definition XML node into a validated object.
 *
 * The node's inner text is stored as `inner`; all attributes are carried over.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `Definition.parse` returns for the collected fields
 */
function DefinitionNode(node) {
  const fields = extendWithRestAttr(
    node,
    { inner: node.innerText },
    (name) => name
  );
  return Definition.parse(fields);
}
|
|
240
|
+
/**
 * Converts an Example XML node into a validated object.
 *
 * The node's inner text is stored as `inner` and the optional `dc:source`
 * attribute as `dcSource`; any other attribute keeps its XML name.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `Example.parse` returns for the collected fields
 */
function ExampleNode(node) {
  const inner = node.innerText;
  const dcSource = optAttr(node, "dc:source");
  // Maps an XML attribute name to the object key that already holds it.
  const toFieldName = (name) => (name === "dc:source" ? "dcSource" : name);
  return Example.parse(
    extendWithRestAttr(node, { inner, dcSource }, toFieldName)
  );
}
|
|
249
|
+
/**
 * Converts an ILIDefinition XML node into a validated object.
 *
 * The node's inner text is stored as `inner`; all attributes are carried over.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `ILIDefinition.parse` returns for the collected fields
 */
function ILIDefinitionNode(node) {
  const fields = extendWithRestAttr(
    node,
    { inner: node.innerText },
    (name) => name
  );
  return ILIDefinition.parse(fields);
}
|
|
255
|
+
/**
 * Converts a SynsetRelation XML node into a validated object.
 *
 * @param node - parsed XML node exposing `attributes`; `relType` and `target`
 *   are required
 * @returns whatever `SynsetRelation.parse` returns for the collected fields
 */
function SynsetRelationNode(node) {
  const relType = SynsetRelationRelType.parse(attr(node, "relType"));
  const target = attr(node, "target");
  const fields = extendWithRestAttr(node, { relType, target }, (name) => name);
  return SynsetRelation.parse(fields);
}
|
|
262
|
+
/**
 * Converts a syntactic-behaviour XML node into a validated object.
 *
 * @param node - parsed XML node exposing `attributes`; `id` and
 *   `subcategorizationFrame` are required
 * @returns whatever `SyntacticBehavior.parse` returns for the collected fields
 */
function SyntacticBehaviorNode(node) {
  const id = attr(node, "id");
  const subcategorizationFrame = attr(node, "subcategorizationFrame");
  const fields = extendWithRestAttr(
    node,
    { id, subcategorizationFrame },
    (name) => name
  );
  return SyntacticBehavior.parse(fields);
}
|
|
269
|
+
/**
 * Converts a Synset XML node into a validated object.
 *
 * Required attributes: `id`, `ili`, `lexfile`, `members` (split on single
 * spaces into an array) and `partOfSpeech`. The optional `dc:source` attribute
 * is stored as `dcSource`. `Definition`, `Example`, `ILIDefinition` and
 * `SynsetRelation` children are converted by their respective node converters.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Synset.parse` returns for the collected fields
 */
function SynsetNode(node) {
  const id = attr(node, "id");
  const ili = attr(node, "ili");
  const lexfile = attr(node, "lexfile");
  const members = attr(node, "members").split(" ");
  const dcSource = optAttr(node, "dc:source");
  const partOfSpeech = PartsOfSpeech.parse(attr(node, "partOfSpeech"));
  const definitions = children(node, "Definition", DefinitionNode);
  const examples = children(node, "Example", ExampleNode);
  const iliDefinitions = children(node, "ILIDefinition", ILIDefinitionNode);
  const synsetRelations = children(node, "SynsetRelation", SynsetRelationNode);

  // Maps an XML attribute name to the object key that already holds it.
  const toFieldName = (name) => (name === "dc:source" ? "dcSource" : name);

  return Synset.parse(
    extendWithRestAttr(
      node,
      {
        id,
        ili,
        lexfile,
        members,
        dcSource,
        partOfSpeech,
        definitions,
        examples,
        iliDefinitions,
        synsetRelations,
      },
      toFieldName
    )
  );
}
|
|
286
|
+
/**
 * Converts a Lexicon XML node into a validated object.
 *
 * Required attributes: `id`, `label`, `language`, `email`, `license`,
 * `version`, `url`; `citation` is optional. `LexicalEntry`, `Synset` and
 * `SyntacticBehaviour` children are converted by their respective node
 * converters.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Lexicon.parse` returns for the collected fields
 */
function LexiconNode(node) {
  const id = attr(node, "id");
  const label = attr(node, "label");
  const language = attr(node, "language");
  const email = attr(node, "email");
  const license = attr(node, "license");
  const version = attr(node, "version");
  const citation = optAttr(node, "citation");
  const url = attr(node, "url");
  const lexicalEntries = children(node, "LexicalEntry", LexicalEntryNode);
  const synsets = children(node, "Synset", SynsetNode);
  // The child element name uses the "Behaviour" spelling.
  const syntacticBehaviors = children(
    node,
    "SyntacticBehaviour",
    SyntacticBehaviorNode
  );

  return Lexicon.parse(
    extendWithRestAttr(
      node,
      {
        id,
        label,
        language,
        email,
        license,
        version,
        citation,
        url,
        lexicalEntries,
        synsets,
        syntacticBehaviors,
      },
      (name) => name
    )
  );
}
|
|
305
|
+
/**
 * Decodes the five predefined XML entities: &amp; &lt; &gt; &apos; &quot;.
 *
 * Decoding is done in a single pass, so text produced by one replacement is
 * never scanned again: "&amp;lt;" becomes "&lt;", not "<". Anything that is not
 * one of the five entities is left untouched.
 *
 * @param s - text to decode, or undefined
 * @returns the decoded text, or undefined when `s` is undefined
 */
var decodeXmlEntities = (s) => {
  if (s === void 0) return void 0;
  return s.replace(/&(amp|lt|gt|apos|quot);/g, (entity, name) => {
    switch (name) {
      case "amp":
        return "&";
      case "lt":
        return "<";
      case "gt":
        return ">";
      case "apos":
        return "'";
      case "quot":
        return '"';
      default:
        return entity;
    }
  });
};
|
|
309
|
+
/**
 * Reads a required attribute from an XML node and entity-decodes it.
 *
 * @param node - parsed XML node exposing `attributes` and `type`
 * @param attrName - attribute name to look up
 * @returns the decoded attribute value
 * @throws Error when the attribute is not present on the node
 */
var attr = (node, attrName) => {
  const raw = node.attributes[attrName];
  if (raw === void 0) {
    throw new Error(
      `Missing required attribute "${attrName}" on node "${node.type}"`
    );
  }
  return decodeXmlEntities(raw);
};
|
|
318
|
+
/**
 * Reads an optional attribute from an XML node and entity-decodes it.
 *
 * @param node - parsed XML node exposing `attributes`
 * @param attrName - attribute name to look up
 * @returns the decoded value, or undefined when the attribute is absent
 */
var optAttr = (node, attrName) => {
  const raw = node.attributes[attrName];
  return raw === void 0 ? void 0 : decodeXmlEntities(raw);
};
|
|
321
|
+
/**
 * Collects the attributes of `node` that `obj` does not already hold.
 *
 * An attribute is skipped when `proxy(attributeName)` is an own key of `obj`.
 * Only own keys count, so an attribute that happens to share a name with an
 * inherited Object member (e.g. "constructor") is kept rather than dropped.
 * Kept attributes are stored under their original XML name, entity-decoded.
 *
 * @param node - parsed XML node exposing `attributes`
 * @param obj - object whose keys mark attributes that were already consumed
 * @param proxy - maps an XML attribute name to the corresponding key of `obj`
 * @returns a new object holding the remaining attributes
 */
var restAttrs = (node, obj, proxy) => {
  const result = {};
  for (const name of Object.keys(node.attributes)) {
    if (Object.prototype.hasOwnProperty.call(obj, proxy(name))) continue;
    const raw = node.attributes[name];
    result[name] = decodeXmlEntities(raw) ?? raw;
  }
  return result;
};
|
|
328
|
+
/**
 * Adds every not-yet-consumed attribute of `node` to `obj`.
 *
 * `obj` is modified in place and also returned.
 *
 * @param node - parsed XML node exposing `attributes`
 * @param obj - object to extend
 * @param proxy - maps an XML attribute name to the corresponding key of `obj`
 * @returns `obj` itself, now including the extra attributes
 */
var extendWithRestAttr = (node, obj, proxy) => {
  const extras = restAttrs(node, obj, proxy);
  return Object.assign(obj, extras);
};
|
|
331
|
+
/**
 * Converts the child nodes of a given type.
 *
 * @param node - parsed XML node exposing a `children` array
 * @param type - node type to keep (compared with strict equality)
 * @param fn - converter, called with each matching child as its only argument
 * @returns the converted children, in document order
 */
var children = (node, type, fn) => {
  const matching = node.children.filter((child) => child.type === type);
  return matching.map((child) => fn(child));
};
|
|
334
|
+
|
|
335
|
+
// src/export-sqlite.ts
|
|
336
|
+
// SQL schema that exportToSQLite runs with db.exec before inserting rows.
// It declares three tables: words (id, word, word_display), synsets
// (id, pos, definition) and word_synsets (word_id, synset_id), plus indexes on
// words(word) and word_synsets(word_id). Every statement uses IF NOT EXISTS.
var SCHEMA = `
|
|
337
|
+
CREATE TABLE IF NOT EXISTS words (
|
|
338
|
+
id INTEGER PRIMARY KEY,
|
|
339
|
+
word TEXT NOT NULL,
|
|
340
|
+
word_display TEXT NOT NULL
|
|
341
|
+
);
|
|
342
|
+
CREATE INDEX IF NOT EXISTS idx_words_word ON words(word);
|
|
343
|
+
|
|
344
|
+
CREATE TABLE IF NOT EXISTS synsets (
|
|
345
|
+
id TEXT PRIMARY KEY,
|
|
346
|
+
pos TEXT NOT NULL,
|
|
347
|
+
definition TEXT NOT NULL
|
|
348
|
+
);
|
|
349
|
+
|
|
350
|
+
CREATE TABLE IF NOT EXISTS word_synsets (
|
|
351
|
+
word_id INTEGER NOT NULL,
|
|
352
|
+
synset_id TEXT NOT NULL,
|
|
353
|
+
PRIMARY KEY (word_id, synset_id)
|
|
354
|
+
);
|
|
355
|
+
CREATE INDEX IF NOT EXISTS idx_ws_word ON word_synsets(word_id);
|
|
356
|
+
`;
|
|
357
|
+
/**
 * Writes a lexicon to a SQLite database file.
 *
 * Three tables are filled, each phase inside its own transaction:
 *   1. `words`        - one row per distinct lower-cased written form of an
 *                       entry's first lemma, inserted in sorted order
 *   2. `synsets`      - every synset referenced by a sense of those entries
 *                       that is also present in `lexicon.synsets`
 *   3. `word_synsets` - one (word_id, synset_id) row per sense
 *
 * The database handle is closed even when an insert or the progress callback
 * throws. The rollback journal is switched off for speed, so a failed export
 * cannot be rolled back and may leave a partially written file behind.
 *
 * @param lexicon - object with `lexicalEntries` and `synsets` arrays
 * @param outputPath - database file path; the file is created when missing
 * @param options - optional settings
 * @param options.onProgress - called with `{ phase, current, total }` where
 *   `phase` is "words", "synsets" or "relations"; fired on every 10,000th item
 * @returns nothing
 */
function exportToSQLite(lexicon, outputPath, options = {}) {
  const { onProgress } = options;
  const db = new Database(outputPath, { create: true });
  try {
    db.exec("PRAGMA journal_mode = OFF");
    db.exec("PRAGMA synchronous = OFF");
    db.exec(SCHEMA);

    // Group entries by the lower-cased written form of their first lemma.
    const wordToEntries = new Map();
    for (const entry of lexicon.lexicalEntries) {
      const word = entry.lemmas[0]?.writtenForm;
      if (!word) continue;
      const lower = word.toLowerCase();
      const bucket = wordToEntries.get(lower);
      if (bucket) {
        bucket.push(entry);
      } else {
        wordToEntries.set(lower, [entry]);
      }
    }

    const synsetMap = new Map();
    for (const synset of lexicon.synsets) {
      synsetMap.set(synset.id, synset);
    }

    // Phase 1: words.
    const insertWord = db.prepare(
      "INSERT INTO words (word, word_display) VALUES (?, ?)"
    );
    const wordIds = new Map();
    const words = Array.from(wordToEntries.keys()).sort();
    const totalWords = words.length;
    db.exec("BEGIN TRANSACTION");
    // NOTE(review): this counter mirrors the ids SQLite assigns only when the
    // `words` table starts out empty; presumably callers always export to a
    // fresh file - confirm.
    let wordId = 0;
    for (let i = 0; i < words.length; i++) {
      const word = words[i];
      const entries = wordToEntries.get(word);
      if (!entries) continue;
      const display = entries[0].lemmas[0]?.writtenForm || word;
      insertWord.run(word, display);
      wordId++;
      wordIds.set(word, wordId);
      if (onProgress && i % 1e4 === 0) {
        onProgress({ phase: "words", current: i, total: totalWords });
      }
    }
    db.exec("COMMIT");

    // Phase 2: synsets referenced by at least one sense.
    const usedSynsetIds = new Set();
    for (const entries of wordToEntries.values()) {
      for (const entry of entries) {
        for (const sense of entry.senses) {
          usedSynsetIds.add(sense.synset);
        }
      }
    }
    const insertSynset = db.prepare(
      "INSERT OR IGNORE INTO synsets (id, pos, definition) VALUES (?, ?, ?)"
    );
    const synsetList = Array.from(usedSynsetIds);
    const totalSynsets = synsetList.length;
    db.exec("BEGIN TRANSACTION");
    for (let i = 0; i < synsetList.length; i++) {
      const synsetId = synsetList[i];
      const synset = synsetMap.get(synsetId);
      if (synset) {
        // First definition only; empty string when the synset has none.
        const def = decodeXmlEntities(synset.definitions[0]?.inner) || "";
        insertSynset.run(synsetId, synset.partOfSpeech, def);
      }
      if (onProgress && i % 1e4 === 0) {
        onProgress({ phase: "synsets", current: i, total: totalSynsets });
      }
    }
    db.exec("COMMIT");

    // Phase 3: word <-> synset links.
    const insertRelation = db.prepare(
      "INSERT OR IGNORE INTO word_synsets (word_id, synset_id) VALUES (?, ?)"
    );
    let relationCount = 0;
    let totalRelations = 0;
    for (const entries of wordToEntries.values()) {
      for (const entry of entries) {
        totalRelations += entry.senses.length;
      }
    }
    db.exec("BEGIN TRANSACTION");
    for (const [word, entries] of wordToEntries) {
      const linkedWordId = wordIds.get(word);
      if (!linkedWordId) continue;
      for (const entry of entries) {
        for (const sense of entry.senses) {
          insertRelation.run(linkedWordId, sense.synset);
          relationCount++;
          if (onProgress && relationCount % 1e4 === 0) {
            onProgress({
              phase: "relations",
              current: relationCount,
              total: totalRelations,
            });
          }
        }
      }
    }
    db.exec("COMMIT");
  } finally {
    // Always release the file handle, including on failure.
    db.close();
  }
}
|
|
167
453
|
|
|
168
454
|
// src/literals.ts
|
|
169
455
|
var PartsOfSpeech2 = {
|
|
@@ -316,8 +602,16 @@ var AdjPosition2 = {
|
|
|
316
602
|
};
|
|
317
603
|
|
|
318
604
|
// src/loader.ts
|
|
319
|
-
import {
|
|
605
|
+
import {
|
|
606
|
+
createReadStream,
|
|
607
|
+
existsSync,
|
|
608
|
+
mkdirSync,
|
|
609
|
+
readdirSync,
|
|
610
|
+
statSync,
|
|
611
|
+
writeFileSync
|
|
612
|
+
} from "fs";
|
|
320
613
|
import path from "path";
|
|
614
|
+
import { Readable } from "stream";
|
|
321
615
|
|
|
322
616
|
// node_modules/@dbushell/xml-streamify/src/node.ts
|
|
323
617
|
var Node = class {
|
|
@@ -561,166 +855,6 @@ async function* parse(input, options) {
|
|
|
561
855
|
return document;
|
|
562
856
|
}
|
|
563
857
|
|
|
564
|
-
// src/helpers.ts
|
|
565
|
-
/**
 * Converts a Pronunciation XML node into a validated object.
 *
 * Picks up the optional `variety` attribute and the node's inner text, folds
 * in every attribute not already captured, and hands the result to
 * `Pronunciation.parse`.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `Pronunciation.parse` returns for the collected fields
 */
function PronunciationNode(node) {
  const variety = optAttr(node, "variety");
  const inner = node.innerText;
  const fields = extendWithRestAttr(node, { variety, inner }, (name) => name);
  return Pronunciation.parse(fields);
}
|
|
572
|
-
/**
 * Converts a Lemma XML node into a validated object.
 *
 * `writtenForm` and `partOfSpeech` are required attributes (a missing one makes
 * `attr` throw); each `Pronunciation` child is converted by PronunciationNode.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Lemma.parse` returns for the collected fields
 */
function LemmaNode(node) {
  const writtenForm = attr(node, "writtenForm");
  const partOfSpeech = PartsOfSpeech.parse(attr(node, "partOfSpeech"));
  const pronunciations = children(node, "Pronunciation", PronunciationNode);
  const fields = extendWithRestAttr(
    node,
    { writtenForm, partOfSpeech, pronunciations },
    (name) => name
  );
  return Lemma.parse(fields);
}
|
|
583
|
-
/**
 * Converts a SenseRelation XML node into a validated object.
 *
 * `relType` and `target` are required; the optional `dc:type` attribute is
 * stored under the key `dcType`. Any other attribute is carried over under its
 * XML name.
 *
 * @param node - parsed XML node exposing `attributes`
 * @returns whatever `SenseRelation.parse` returns for the collected fields
 */
function SenseRelationNode(node) {
  const relType = SenseRelationRelType.parse(attr(node, "relType"));
  const target = attr(node, "target");
  const dcType = optAttr(node, "dc:type");
  // Strict comparison: attribute names are always strings.
  const toFieldName = (name) => (name === "dc:type" ? "dcType" : name);
  return SenseRelation.parse(
    extendWithRestAttr(node, { relType, target, dcType }, toFieldName)
  );
}
|
|
593
|
-
/**
 * Converts a Sense XML node into a validated object.
 *
 * Required attributes: `id`, `synset`. Optional attributes: `subcat` (stored as
 * `subCat`) and `adjposition` (stored as `adjPosition`, validated with
 * `AdjPosition.parse` only when it is a non-empty string). `SenseRelation`
 * children are converted by SenseRelationNode.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Sense.parse` returns for the collected fields
 */
function SenseNode(node) {
  const adjPos = optAttr(node, "adjposition");
  const id = attr(node, "id");
  const synset = SynsetId.parse(attr(node, "synset"));
  const senseRelations = children(node, "SenseRelation", SenseRelationNode);
  const subCat = optAttr(node, "subcat");
  const adjPosition = adjPos ? AdjPosition.parse(adjPos) : void 0;

  // Strict comparison: attribute names are always strings.
  const toFieldName = (name) => {
    if (name === "subcat") return "subCat";
    if (name === "adjposition") return "adjPosition";
    return name;
  };

  return Sense.parse(
    extendWithRestAttr(
      node,
      { id, synset, senseRelations, subCat, adjPosition },
      toFieldName
    )
  );
}
|
|
610
|
-
/**
 * Converts a Form XML node into a validated object.
 *
 * @param node - parsed XML node exposing `attributes`; `writtenForm` is required
 * @returns whatever `Form.parse` returns for the collected fields
 */
function FormNode(node) {
  const writtenForm = attr(node, "writtenForm");
  const fields = extendWithRestAttr(node, { writtenForm }, (name) => name);
  return Form.parse(fields);
}
|
|
616
|
-
/**
 * Converts a LexicalEntry XML node into a validated object.
 *
 * Reads the required `id` attribute and converts the `Lemma`, `Sense` and
 * `Form` children with their respective node converters.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `LexicalEntry.parse` returns for the collected fields
 */
function LexicalEntryNode(node) {
  const id = attr(node, "id");
  const lemmas = children(node, "Lemma", LemmaNode);
  const senses = children(node, "Sense", SenseNode);
  const forms = children(node, "Form", FormNode);
  const fields = extendWithRestAttr(
    node,
    { id, lemmas, senses, forms },
    (name) => name
  );
  return LexicalEntry.parse(fields);
}
|
|
625
|
-
/**
 * Converts a Definition XML node into a validated object.
 *
 * The node's inner text is stored as `inner`; all attributes are carried over.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `Definition.parse` returns for the collected fields
 */
function DefinitionNode(node) {
  const fields = extendWithRestAttr(
    node,
    { inner: node.innerText },
    (name) => name
  );
  return Definition.parse(fields);
}
|
|
631
|
-
/**
 * Converts an Example XML node into a validated object.
 *
 * The node's inner text is stored as `inner` and the optional `dc:source`
 * attribute as `dcSource`; any other attribute keeps its XML name.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `Example.parse` returns for the collected fields
 */
function ExampleNode(node) {
  const inner = node.innerText;
  const dcSource = optAttr(node, "dc:source");
  // Strict comparison: attribute names are always strings.
  const toFieldName = (name) => (name === "dc:source" ? "dcSource" : name);
  return Example.parse(
    extendWithRestAttr(node, { inner, dcSource }, toFieldName)
  );
}
|
|
640
|
-
/**
 * Converts an ILIDefinition XML node into a validated object.
 *
 * The node's inner text is stored as `inner`; all attributes are carried over.
 *
 * @param node - parsed XML node exposing `attributes` and `innerText`
 * @returns whatever `ILIDefinition.parse` returns for the collected fields
 */
function ILIDefinitionNode(node) {
  const fields = extendWithRestAttr(
    node,
    { inner: node.innerText },
    (name) => name
  );
  return ILIDefinition.parse(fields);
}
|
|
646
|
-
/**
 * Converts a SynsetRelation XML node into a validated object.
 *
 * @param node - parsed XML node exposing `attributes`; `relType` and `target`
 *   are required
 * @returns whatever `SynsetRelation.parse` returns for the collected fields
 */
function SynsetRelationNode(node) {
  const relType = SynsetRelationRelType.parse(attr(node, "relType"));
  const target = attr(node, "target");
  const fields = extendWithRestAttr(node, { relType, target }, (name) => name);
  return SynsetRelation.parse(fields);
}
|
|
653
|
-
/**
 * Converts a syntactic-behaviour XML node into a validated object.
 *
 * @param node - parsed XML node exposing `attributes`; `id` and
 *   `subcategorizationFrame` are required
 * @returns whatever `SyntacticBehavior.parse` returns for the collected fields
 */
function SyntacticBehaviorNode(node) {
  const id = attr(node, "id");
  const subcategorizationFrame = attr(node, "subcategorizationFrame");
  const fields = extendWithRestAttr(
    node,
    { id, subcategorizationFrame },
    (name) => name
  );
  return SyntacticBehavior.parse(fields);
}
|
|
660
|
-
/**
 * Converts a Synset XML node into a validated object.
 *
 * Required attributes: `id`, `ili`, `lexfile`, `members` (split on single
 * spaces into an array) and `partOfSpeech`. The optional `dc:source` attribute
 * is stored as `dcSource`. `Definition`, `Example`, `ILIDefinition` and
 * `SynsetRelation` children are converted by their respective node converters.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Synset.parse` returns for the collected fields
 */
function SynsetNode(node) {
  const id = attr(node, "id");
  const ili = attr(node, "ili");
  const lexfile = attr(node, "lexfile");
  const members = attr(node, "members").split(" ");
  const dcSource = optAttr(node, "dc:source");
  const partOfSpeech = PartsOfSpeech.parse(attr(node, "partOfSpeech"));
  const definitions = children(node, "Definition", DefinitionNode);
  const examples = children(node, "Example", ExampleNode);
  const iliDefinitions = children(node, "ILIDefinition", ILIDefinitionNode);
  const synsetRelations = children(node, "SynsetRelation", SynsetRelationNode);

  // Strict comparison: attribute names are always strings.
  const toFieldName = (name) => (name === "dc:source" ? "dcSource" : name);

  return Synset.parse(
    extendWithRestAttr(
      node,
      {
        id,
        ili,
        lexfile,
        members,
        dcSource,
        partOfSpeech,
        definitions,
        examples,
        iliDefinitions,
        synsetRelations,
      },
      toFieldName
    )
  );
}
|
|
677
|
-
/**
 * Converts a Lexicon XML node into a validated object.
 *
 * Required attributes: `id`, `label`, `language`, `email`, `license`,
 * `version`, `url`; `citation` is optional. `LexicalEntry`, `Synset` and
 * `SyntacticBehaviour` children are converted by their respective node
 * converters.
 *
 * @param node - parsed XML node exposing `attributes` and `children`
 * @returns whatever `Lexicon.parse` returns for the collected fields
 */
function LexiconNode(node) {
  const id = attr(node, "id");
  const label = attr(node, "label");
  const language = attr(node, "language");
  const email = attr(node, "email");
  const license = attr(node, "license");
  const version = attr(node, "version");
  const citation = optAttr(node, "citation");
  const url = attr(node, "url");
  const lexicalEntries = children(node, "LexicalEntry", LexicalEntryNode);
  const synsets = children(node, "Synset", SynsetNode);
  // The child element name uses the "Behaviour" spelling.
  const syntacticBehaviors = children(
    node,
    "SyntacticBehaviour",
    SyntacticBehaviorNode
  );

  return Lexicon.parse(
    extendWithRestAttr(
      node,
      {
        id,
        label,
        language,
        email,
        license,
        version,
        citation,
        url,
        lexicalEntries,
        synsets,
        syntacticBehaviors,
      },
      (name) => name
    )
  );
}
|
|
696
|
-
/**
 * Decodes the five predefined XML entities: &amp; &lt; &gt; &apos; &quot;.
 *
 * Decoding is done in a single pass, so text produced by one replacement is
 * never scanned again: "&amp;lt;" becomes "&lt;", not "<". Anything that is not
 * one of the five entities is left untouched.
 *
 * @param s - text to decode, or undefined
 * @returns the decoded text, or undefined when `s` is undefined
 */
var decodeXmlEntities = (s) => {
  if (s === void 0) return void 0;
  return s.replace(/&(amp|lt|gt|apos|quot);/g, (entity, name) => {
    switch (name) {
      case "amp":
        return "&";
      case "lt":
        return "<";
      case "gt":
        return ">";
      case "apos":
        return "'";
      case "quot":
        return '"';
      default:
        return entity;
    }
  });
};
|
|
700
|
-
/**
 * Reads a required attribute from an XML node and entity-decodes it.
 *
 * @param node - parsed XML node exposing `attributes` and `type`
 * @param attrName - attribute name to look up
 * @returns the decoded attribute value
 * @throws Error when the attribute is not present on the node
 */
var attr = (node, attrName) => {
  const raw = node.attributes[attrName];
  if (raw === void 0) {
    throw new Error(`Missing required attribute "${attrName}" on node "${node.type}"`);
  }
  return decodeXmlEntities(raw);
};
|
|
707
|
-
/**
 * Reads an optional attribute from an XML node and entity-decodes it.
 *
 * @param node - parsed XML node exposing `attributes`
 * @param attrName - attribute name to look up
 * @returns the decoded value, or undefined when the attribute is absent
 */
var optAttr = (node, attrName) => {
  const raw = node.attributes[attrName];
  return raw === void 0 ? void 0 : decodeXmlEntities(raw);
};
|
|
710
|
-
/**
 * Collects the attributes of `node` that `obj` does not already hold.
 *
 * An attribute is skipped when `proxy(attributeName)` is an own key of `obj`.
 * Only own keys count, so an attribute that happens to share a name with an
 * inherited Object member (e.g. "constructor") is kept rather than dropped.
 * Kept attributes are stored under their original XML name, entity-decoded.
 *
 * @param node - parsed XML node exposing `attributes`
 * @param obj - object whose keys mark attributes that were already consumed
 * @param proxy - maps an XML attribute name to the corresponding key of `obj`
 * @returns a new object holding the remaining attributes
 */
var restAttrs = (node, obj, proxy) => {
  const result = {};
  for (const name of Object.keys(node.attributes)) {
    if (Object.prototype.hasOwnProperty.call(obj, proxy(name))) continue;
    const raw = node.attributes[name];
    result[name] = decodeXmlEntities(raw) ?? raw;
  }
  return result;
};
|
|
717
|
-
/**
 * Adds every not-yet-consumed attribute of `node` to `obj`.
 *
 * `obj` is modified in place and also returned.
 *
 * @param node - parsed XML node exposing `attributes`
 * @param obj - object to extend
 * @param proxy - maps an XML attribute name to the corresponding key of `obj`
 * @returns `obj` itself, now including the extra attributes
 */
var extendWithRestAttr = (node, obj, proxy) => {
  const extras = restAttrs(node, obj, proxy);
  return Object.assign(obj, extras);
};
|
|
720
|
-
/**
 * Converts the child nodes of a given type.
 *
 * @param node - parsed XML node exposing a `children` array
 * @param type - node type to keep; compared with strict equality so a value of
 *   another type is never coerced into a match
 * @param fn - converter, called with each matching child as its only argument
 * @returns the converted children, in document order
 */
var children = (node, type, fn) => {
  const matching = node.children.filter((child) => child.type === type);
  return matching.map((child) => fn(child));
};
|
|
723
|
-
|
|
724
858
|
// src/loader.ts
|
|
725
859
|
var BASE_VERSION = "2024";
|
|
726
860
|
function getFilename(version) {
|
|
@@ -786,7 +920,6 @@ async function findLatestVersion(onProgress, cacheDir) {
|
|
|
786
920
|
for (let year = baseYear + 1; year <= lastReleasableYear; year++) {
|
|
787
921
|
const version = year.toString();
|
|
788
922
|
if (await urlExists(getDownloadUrl(version))) {
|
|
789
|
-
continue;
|
|
790
923
|
} else {
|
|
791
924
|
return (year - 1).toString();
|
|
792
925
|
}
|
|
@@ -807,9 +940,13 @@ async function downloadWordNet(version, destPath) {
|
|
|
807
940
|
const url = getDownloadUrl(version);
|
|
808
941
|
const response = await fetch(url);
|
|
809
942
|
if (!response.ok || !response.body) {
|
|
810
|
-
throw new Error(
|
|
943
|
+
throw new Error(
|
|
944
|
+
`Failed to download WordNet ${version}: ${response.statusText}`
|
|
945
|
+
);
|
|
811
946
|
}
|
|
812
|
-
const decompressed = response.body.pipeThrough(
|
|
947
|
+
const decompressed = response.body.pipeThrough(
|
|
948
|
+
new DecompressionStream("gzip")
|
|
949
|
+
);
|
|
813
950
|
const arrayBuffer = await new Response(decompressed).arrayBuffer();
|
|
814
951
|
const dir = path.dirname(destPath);
|
|
815
952
|
if (!existsSync(dir)) {
|
|
@@ -819,8 +956,9 @@ async function downloadWordNet(version, destPath) {
|
|
|
819
956
|
}
|
|
820
957
|
function createParser(filePath) {
|
|
821
958
|
const resolvedPath = path.resolve(filePath);
|
|
822
|
-
const
|
|
823
|
-
|
|
959
|
+
const nodeStream = createReadStream(resolvedPath);
|
|
960
|
+
const webStream = Readable.toWeb(nodeStream);
|
|
961
|
+
return parse(webStream, {
|
|
824
962
|
ignoreDeclaration: false,
|
|
825
963
|
silent: false
|
|
826
964
|
});
|
|
@@ -1005,6 +1143,7 @@ export {
|
|
|
1005
1143
|
PartsOfSpeech,
|
|
1006
1144
|
PartsOfSpeech2 as PartsOfSpeechLabels,
|
|
1007
1145
|
Pronunciation,
|
|
1146
|
+
SCHEMA,
|
|
1008
1147
|
Sense,
|
|
1009
1148
|
SenseId,
|
|
1010
1149
|
SenseRelation,
|
|
@@ -1024,6 +1163,7 @@ export {
|
|
|
1024
1163
|
createParser,
|
|
1025
1164
|
decodeXmlEntities,
|
|
1026
1165
|
ensureWordNetCached,
|
|
1166
|
+
exportToSQLite,
|
|
1027
1167
|
fetchWordNet,
|
|
1028
1168
|
findLatestVersion,
|
|
1029
1169
|
findSenses,
|