synset 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -39
- package/dist/cli.cjs +534 -382
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +542 -383
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +301 -166
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +26 -7
- package/dist/index.d.ts +26 -7
- package/dist/index.js +307 -167
- package/dist/index.js.map +1 -1
- package/dist/schema.sql +22 -0
- package/package.json +10 -5
package/dist/index.cjs
CHANGED
|
@@ -46,6 +46,7 @@ __export(src_exports, {
|
|
|
46
46
|
PartsOfSpeech: () => PartsOfSpeech,
|
|
47
47
|
PartsOfSpeechLabels: () => PartsOfSpeech2,
|
|
48
48
|
Pronunciation: () => Pronunciation,
|
|
49
|
+
SCHEMA: () => SCHEMA,
|
|
49
50
|
Sense: () => Sense,
|
|
50
51
|
SenseId: () => SenseId,
|
|
51
52
|
SenseRelation: () => SenseRelation,
|
|
@@ -65,6 +66,7 @@ __export(src_exports, {
|
|
|
65
66
|
createParser: () => createParser,
|
|
66
67
|
decodeXmlEntities: () => decodeXmlEntities,
|
|
67
68
|
ensureWordNetCached: () => ensureWordNetCached,
|
|
69
|
+
exportToSQLite: () => exportToSQLite,
|
|
68
70
|
fetchWordNet: () => fetchWordNet,
|
|
69
71
|
findLatestVersion: () => findLatestVersion,
|
|
70
72
|
findSenses: () => findSenses,
|
|
@@ -88,6 +90,9 @@ __export(src_exports, {
|
|
|
88
90
|
});
|
|
89
91
|
module.exports = __toCommonJS(src_exports);
|
|
90
92
|
|
|
93
|
+
// src/export-sqlite.ts
|
|
94
|
+
var import_bun_sqlite = require("bun:sqlite");
|
|
95
|
+
|
|
91
96
|
// src/types.ts
|
|
92
97
|
var import_zod = require("zod");
|
|
93
98
|
var LexiconId = import_zod.z.string();
|
|
@@ -253,7 +258,290 @@ var Lexicon = import_zod.z.object({
|
|
|
253
258
|
synsets: import_zod.z.array(Synset).min(0),
|
|
254
259
|
syntacticBehaviors: import_zod.z.array(SyntacticBehavior).min(0)
|
|
255
260
|
});
|
|
256
|
-
var partsOfSpeechList = PartsOfSpeech.options.map(
|
|
261
|
+
var partsOfSpeechList = PartsOfSpeech.options.map(
|
|
262
|
+
(v) => v.value
|
|
263
|
+
);
|
|
264
|
+
|
|
265
|
+
// src/helpers.ts
|
|
266
|
+
function PronunciationNode(node) {
|
|
267
|
+
const obj = {
|
|
268
|
+
variety: optAttr(node, "variety"),
|
|
269
|
+
inner: node.innerText
|
|
270
|
+
};
|
|
271
|
+
return Pronunciation.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
272
|
+
}
|
|
273
|
+
function LemmaNode(node) {
|
|
274
|
+
const obj = {
|
|
275
|
+
writtenForm: attr(node, "writtenForm"),
|
|
276
|
+
partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
|
|
277
|
+
pronunciations: (
|
|
278
|
+
//
|
|
279
|
+
children(node, "Pronunciation", (v) => PronunciationNode(v))
|
|
280
|
+
)
|
|
281
|
+
};
|
|
282
|
+
return Lemma.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
283
|
+
}
|
|
284
|
+
function SenseRelationNode(node) {
|
|
285
|
+
const obj = {
|
|
286
|
+
relType: SenseRelationRelType.parse(attr(node, "relType")),
|
|
287
|
+
target: attr(node, "target"),
|
|
288
|
+
dcType: optAttr(node, "dc:type")
|
|
289
|
+
};
|
|
290
|
+
return SenseRelation.parse(
|
|
291
|
+
extendWithRestAttr(node, obj, (s) => s === "dc:type" ? "dcType" : s)
|
|
292
|
+
);
|
|
293
|
+
}
|
|
294
|
+
function SenseNode(node) {
|
|
295
|
+
const adjPos = optAttr(node, "adjposition");
|
|
296
|
+
const obj = {
|
|
297
|
+
id: attr(node, "id"),
|
|
298
|
+
synset: SynsetId.parse(attr(node, "synset")),
|
|
299
|
+
senseRelations: children(node, "SenseRelation", SenseRelationNode),
|
|
300
|
+
subCat: optAttr(node, "subcat"),
|
|
301
|
+
adjPosition: adjPos ? AdjPosition.parse(adjPos) : void 0
|
|
302
|
+
};
|
|
303
|
+
return Sense.parse(
|
|
304
|
+
extendWithRestAttr(
|
|
305
|
+
node,
|
|
306
|
+
obj,
|
|
307
|
+
(s) => s === "subcat" ? "subCat" : s === "adjposition" ? "adjPosition" : s
|
|
308
|
+
)
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
function FormNode(node) {
|
|
312
|
+
const obj = {
|
|
313
|
+
writtenForm: attr(node, "writtenForm")
|
|
314
|
+
};
|
|
315
|
+
return Form.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
316
|
+
}
|
|
317
|
+
function LexicalEntryNode(node) {
|
|
318
|
+
const obj = {
|
|
319
|
+
id: attr(node, "id"),
|
|
320
|
+
lemmas: children(node, "Lemma", LemmaNode),
|
|
321
|
+
senses: children(node, "Sense", SenseNode),
|
|
322
|
+
forms: children(node, "Form", FormNode)
|
|
323
|
+
};
|
|
324
|
+
return LexicalEntry.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
325
|
+
}
|
|
326
|
+
function DefinitionNode(node) {
|
|
327
|
+
const obj = {
|
|
328
|
+
inner: node.innerText
|
|
329
|
+
};
|
|
330
|
+
return Definition.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
331
|
+
}
|
|
332
|
+
function ExampleNode(node) {
|
|
333
|
+
const obj = {
|
|
334
|
+
inner: node.innerText,
|
|
335
|
+
dcSource: optAttr(node, "dc:source")
|
|
336
|
+
};
|
|
337
|
+
return Example.parse(
|
|
338
|
+
extendWithRestAttr(node, obj, (s) => s === "dc:source" ? "dcSource" : s)
|
|
339
|
+
);
|
|
340
|
+
}
|
|
341
|
+
function ILIDefinitionNode(node) {
|
|
342
|
+
const obj = {
|
|
343
|
+
inner: node.innerText
|
|
344
|
+
};
|
|
345
|
+
return ILIDefinition.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
346
|
+
}
|
|
347
|
+
function SynsetRelationNode(node) {
|
|
348
|
+
const obj = {
|
|
349
|
+
relType: SynsetRelationRelType.parse(attr(node, "relType")),
|
|
350
|
+
target: attr(node, "target")
|
|
351
|
+
};
|
|
352
|
+
return SynsetRelation.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
353
|
+
}
|
|
354
|
+
function SyntacticBehaviorNode(node) {
|
|
355
|
+
const obj = {
|
|
356
|
+
id: attr(node, "id"),
|
|
357
|
+
subcategorizationFrame: attr(node, "subcategorizationFrame")
|
|
358
|
+
};
|
|
359
|
+
return SyntacticBehavior.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
360
|
+
}
|
|
361
|
+
function SynsetNode(node) {
|
|
362
|
+
const obj = {
|
|
363
|
+
id: attr(node, "id"),
|
|
364
|
+
ili: attr(node, "ili"),
|
|
365
|
+
lexfile: attr(node, "lexfile"),
|
|
366
|
+
members: attr(node, "members").split(" "),
|
|
367
|
+
dcSource: optAttr(node, "dc:source"),
|
|
368
|
+
partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
|
|
369
|
+
definitions: children(node, "Definition", (v) => DefinitionNode(v)),
|
|
370
|
+
examples: children(node, "Example", (v) => ExampleNode(v)),
|
|
371
|
+
iliDefinitions: children(node, "ILIDefinition", ILIDefinitionNode),
|
|
372
|
+
synsetRelations: children(node, "SynsetRelation", SynsetRelationNode)
|
|
373
|
+
};
|
|
374
|
+
return Synset.parse(
|
|
375
|
+
extendWithRestAttr(node, obj, (s) => s === "dc:source" ? "dcSource" : s)
|
|
376
|
+
);
|
|
377
|
+
}
|
|
378
|
+
function LexiconNode(node) {
|
|
379
|
+
const obj = {
|
|
380
|
+
id: attr(node, "id"),
|
|
381
|
+
label: attr(node, "label"),
|
|
382
|
+
language: attr(node, "language"),
|
|
383
|
+
email: attr(node, "email"),
|
|
384
|
+
license: attr(node, "license"),
|
|
385
|
+
version: attr(node, "version"),
|
|
386
|
+
citation: optAttr(node, "citation"),
|
|
387
|
+
url: attr(node, "url"),
|
|
388
|
+
lexicalEntries: children(node, "LexicalEntry", LexicalEntryNode),
|
|
389
|
+
synsets: children(node, "Synset", SynsetNode),
|
|
390
|
+
syntacticBehaviors: (
|
|
391
|
+
//
|
|
392
|
+
children(node, "SyntacticBehaviour", SyntacticBehaviorNode)
|
|
393
|
+
)
|
|
394
|
+
};
|
|
395
|
+
return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
396
|
+
}
|
|
397
|
+
var decodeXmlEntities = (s) => {
|
|
398
|
+
if (s === void 0) return void 0;
|
|
399
|
+
return s.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/'/g, "'").replace(/"/g, '"');
|
|
400
|
+
};
|
|
401
|
+
var attr = (node, attrName) => {
|
|
402
|
+
const value = decodeXmlEntities(node.attributes[attrName]);
|
|
403
|
+
if (value === void 0) {
|
|
404
|
+
throw new Error(
|
|
405
|
+
`Missing required attribute "${attrName}" on node "${node.type}"`
|
|
406
|
+
);
|
|
407
|
+
}
|
|
408
|
+
return value;
|
|
409
|
+
};
|
|
410
|
+
var optAttr = (node, attrName) => {
|
|
411
|
+
return decodeXmlEntities(node.attributes[attrName]);
|
|
412
|
+
};
|
|
413
|
+
var restAttrs = (node, obj, proxy) => {
|
|
414
|
+
const result = {};
|
|
415
|
+
Object.keys(node.attributes).filter((a) => !(proxy(a) in obj)).forEach((k) => {
|
|
416
|
+
result[k] = decodeXmlEntities(node.attributes[k]) ?? node.attributes[k];
|
|
417
|
+
});
|
|
418
|
+
return result;
|
|
419
|
+
};
|
|
420
|
+
var extendWithRestAttr = (node, obj, proxy) => {
|
|
421
|
+
return Object.assign(obj, restAttrs(node, obj, proxy));
|
|
422
|
+
};
|
|
423
|
+
var children = (node, type, fn) => {
|
|
424
|
+
return node.children.filter((v) => v.type === type).map((v) => fn(v));
|
|
425
|
+
};
|
|
426
|
+
|
|
427
|
+
// src/export-sqlite.ts
|
|
428
|
+
var SCHEMA = `
|
|
429
|
+
CREATE TABLE IF NOT EXISTS words (
|
|
430
|
+
id INTEGER PRIMARY KEY,
|
|
431
|
+
word TEXT NOT NULL,
|
|
432
|
+
word_display TEXT NOT NULL
|
|
433
|
+
);
|
|
434
|
+
CREATE INDEX IF NOT EXISTS idx_words_word ON words(word);
|
|
435
|
+
|
|
436
|
+
CREATE TABLE IF NOT EXISTS synsets (
|
|
437
|
+
id TEXT PRIMARY KEY,
|
|
438
|
+
pos TEXT NOT NULL,
|
|
439
|
+
definition TEXT NOT NULL
|
|
440
|
+
);
|
|
441
|
+
|
|
442
|
+
CREATE TABLE IF NOT EXISTS word_synsets (
|
|
443
|
+
word_id INTEGER NOT NULL,
|
|
444
|
+
synset_id TEXT NOT NULL,
|
|
445
|
+
PRIMARY KEY (word_id, synset_id)
|
|
446
|
+
);
|
|
447
|
+
CREATE INDEX IF NOT EXISTS idx_ws_word ON word_synsets(word_id);
|
|
448
|
+
`;
|
|
449
|
+
function exportToSQLite(lexicon, outputPath, options = {}) {
|
|
450
|
+
const { onProgress } = options;
|
|
451
|
+
const db = new import_bun_sqlite.Database(outputPath, { create: true });
|
|
452
|
+
db.exec("PRAGMA journal_mode = OFF");
|
|
453
|
+
db.exec("PRAGMA synchronous = OFF");
|
|
454
|
+
db.exec(SCHEMA);
|
|
455
|
+
const wordToEntries = /* @__PURE__ */ new Map();
|
|
456
|
+
for (const entry of lexicon.lexicalEntries) {
|
|
457
|
+
const word = entry.lemmas[0]?.writtenForm;
|
|
458
|
+
if (word) {
|
|
459
|
+
const lower = word.toLowerCase();
|
|
460
|
+
const existing = wordToEntries.get(lower) || [];
|
|
461
|
+
existing.push(entry);
|
|
462
|
+
wordToEntries.set(lower, existing);
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
const synsetMap = /* @__PURE__ */ new Map();
|
|
466
|
+
for (const synset of lexicon.synsets) {
|
|
467
|
+
synsetMap.set(synset.id, synset);
|
|
468
|
+
}
|
|
469
|
+
const insertWord = db.prepare(
|
|
470
|
+
"INSERT INTO words (word, word_display) VALUES (?, ?)"
|
|
471
|
+
);
|
|
472
|
+
const wordIds = /* @__PURE__ */ new Map();
|
|
473
|
+
const words = Array.from(wordToEntries.keys()).sort();
|
|
474
|
+
const totalWords = words.length;
|
|
475
|
+
db.exec("BEGIN TRANSACTION");
|
|
476
|
+
let wordId = 0;
|
|
477
|
+
for (let i = 0; i < words.length; i++) {
|
|
478
|
+
const word = words[i];
|
|
479
|
+
const entries = wordToEntries.get(word);
|
|
480
|
+
if (!entries) continue;
|
|
481
|
+
const display = entries[0].lemmas[0]?.writtenForm || word;
|
|
482
|
+
insertWord.run(word, display);
|
|
483
|
+
wordId++;
|
|
484
|
+
wordIds.set(word, wordId);
|
|
485
|
+
if (onProgress && i % 1e4 === 0) {
|
|
486
|
+
onProgress({ phase: "words", current: i, total: totalWords });
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
db.exec("COMMIT");
|
|
490
|
+
const usedSynsetIds = /* @__PURE__ */ new Set();
|
|
491
|
+
for (const entries of wordToEntries.values()) {
|
|
492
|
+
for (const entry of entries) {
|
|
493
|
+
for (const sense of entry.senses) {
|
|
494
|
+
usedSynsetIds.add(sense.synset);
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
const insertSynset = db.prepare(
|
|
499
|
+
"INSERT OR IGNORE INTO synsets (id, pos, definition) VALUES (?, ?, ?)"
|
|
500
|
+
);
|
|
501
|
+
const synsetList = Array.from(usedSynsetIds);
|
|
502
|
+
const totalSynsets = synsetList.length;
|
|
503
|
+
db.exec("BEGIN TRANSACTION");
|
|
504
|
+
for (let i = 0; i < synsetList.length; i++) {
|
|
505
|
+
const synsetId = synsetList[i];
|
|
506
|
+
const synset = synsetMap.get(synsetId);
|
|
507
|
+
if (synset) {
|
|
508
|
+
const def = decodeXmlEntities(synset.definitions[0]?.inner) || "";
|
|
509
|
+
insertSynset.run(synsetId, synset.partOfSpeech, def);
|
|
510
|
+
}
|
|
511
|
+
if (onProgress && i % 1e4 === 0) {
|
|
512
|
+
onProgress({ phase: "synsets", current: i, total: totalSynsets });
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
db.exec("COMMIT");
|
|
516
|
+
const insertRelation = db.prepare(
|
|
517
|
+
"INSERT OR IGNORE INTO word_synsets (word_id, synset_id) VALUES (?, ?)"
|
|
518
|
+
);
|
|
519
|
+
let relationCount = 0;
|
|
520
|
+
const totalRelations = Array.from(wordToEntries.values()).reduce(
|
|
521
|
+
(sum, entries) => sum + entries.reduce((s, e) => s + e.senses.length, 0),
|
|
522
|
+
0
|
|
523
|
+
);
|
|
524
|
+
db.exec("BEGIN TRANSACTION");
|
|
525
|
+
for (const [word, entries] of wordToEntries) {
|
|
526
|
+
const wordId2 = wordIds.get(word);
|
|
527
|
+
if (!wordId2) continue;
|
|
528
|
+
for (const entry of entries) {
|
|
529
|
+
for (const sense of entry.senses) {
|
|
530
|
+
insertRelation.run(wordId2, sense.synset);
|
|
531
|
+
relationCount++;
|
|
532
|
+
if (onProgress && relationCount % 1e4 === 0) {
|
|
533
|
+
onProgress({
|
|
534
|
+
phase: "relations",
|
|
535
|
+
current: relationCount,
|
|
536
|
+
total: totalRelations
|
|
537
|
+
});
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
db.exec("COMMIT");
|
|
543
|
+
db.close();
|
|
544
|
+
}
|
|
257
545
|
|
|
258
546
|
// src/literals.ts
|
|
259
547
|
var PartsOfSpeech2 = {
|
|
@@ -408,6 +696,7 @@ var AdjPosition2 = {
|
|
|
408
696
|
// src/loader.ts
|
|
409
697
|
var import_node_fs = require("fs");
|
|
410
698
|
var import_node_path = __toESM(require("path"), 1);
|
|
699
|
+
var import_node_stream = require("stream");
|
|
411
700
|
|
|
412
701
|
// node_modules/@dbushell/xml-streamify/src/node.ts
|
|
413
702
|
var Node = class {
|
|
@@ -651,166 +940,6 @@ async function* parse(input, options) {
|
|
|
651
940
|
return document;
|
|
652
941
|
}
|
|
653
942
|
|
|
654
|
-
// src/helpers.ts
|
|
655
|
-
function PronunciationNode(node) {
|
|
656
|
-
const obj = {
|
|
657
|
-
variety: optAttr(node, "variety"),
|
|
658
|
-
inner: node.innerText
|
|
659
|
-
};
|
|
660
|
-
return Pronunciation.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
661
|
-
}
|
|
662
|
-
function LemmaNode(node) {
|
|
663
|
-
const obj = {
|
|
664
|
-
writtenForm: attr(node, "writtenForm"),
|
|
665
|
-
partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
|
|
666
|
-
pronunciations: (
|
|
667
|
-
//
|
|
668
|
-
children(node, "Pronunciation", (v) => PronunciationNode(v))
|
|
669
|
-
)
|
|
670
|
-
};
|
|
671
|
-
return Lemma.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
672
|
-
}
|
|
673
|
-
function SenseRelationNode(node) {
|
|
674
|
-
const obj = {
|
|
675
|
-
relType: SenseRelationRelType.parse(attr(node, "relType")),
|
|
676
|
-
target: attr(node, "target"),
|
|
677
|
-
dcType: optAttr(node, "dc:type")
|
|
678
|
-
};
|
|
679
|
-
return SenseRelation.parse(
|
|
680
|
-
extendWithRestAttr(node, obj, (s) => s == "dc:type" ? "dcType" : s)
|
|
681
|
-
);
|
|
682
|
-
}
|
|
683
|
-
function SenseNode(node) {
|
|
684
|
-
const adjPos = optAttr(node, "adjposition");
|
|
685
|
-
const obj = {
|
|
686
|
-
id: attr(node, "id"),
|
|
687
|
-
synset: SynsetId.parse(attr(node, "synset")),
|
|
688
|
-
senseRelations: children(node, "SenseRelation", SenseRelationNode),
|
|
689
|
-
subCat: optAttr(node, "subcat"),
|
|
690
|
-
adjPosition: adjPos ? AdjPosition.parse(adjPos) : void 0
|
|
691
|
-
};
|
|
692
|
-
return Sense.parse(
|
|
693
|
-
extendWithRestAttr(
|
|
694
|
-
node,
|
|
695
|
-
obj,
|
|
696
|
-
(s) => s == "subcat" ? "subCat" : s == "adjposition" ? "adjPosition" : s
|
|
697
|
-
)
|
|
698
|
-
);
|
|
699
|
-
}
|
|
700
|
-
function FormNode(node) {
|
|
701
|
-
const obj = {
|
|
702
|
-
writtenForm: attr(node, "writtenForm")
|
|
703
|
-
};
|
|
704
|
-
return Form.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
705
|
-
}
|
|
706
|
-
function LexicalEntryNode(node) {
|
|
707
|
-
const obj = {
|
|
708
|
-
id: attr(node, "id"),
|
|
709
|
-
lemmas: children(node, "Lemma", LemmaNode),
|
|
710
|
-
senses: children(node, "Sense", SenseNode),
|
|
711
|
-
forms: children(node, "Form", FormNode)
|
|
712
|
-
};
|
|
713
|
-
return LexicalEntry.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
714
|
-
}
|
|
715
|
-
function DefinitionNode(node) {
|
|
716
|
-
const obj = {
|
|
717
|
-
inner: node.innerText
|
|
718
|
-
};
|
|
719
|
-
return Definition.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
720
|
-
}
|
|
721
|
-
function ExampleNode(node) {
|
|
722
|
-
const obj = {
|
|
723
|
-
inner: node.innerText,
|
|
724
|
-
dcSource: optAttr(node, "dc:source")
|
|
725
|
-
};
|
|
726
|
-
return Example.parse(
|
|
727
|
-
extendWithRestAttr(node, obj, (s) => s == "dc:source" ? "dcSource" : s)
|
|
728
|
-
);
|
|
729
|
-
}
|
|
730
|
-
function ILIDefinitionNode(node) {
|
|
731
|
-
const obj = {
|
|
732
|
-
inner: node.innerText
|
|
733
|
-
};
|
|
734
|
-
return ILIDefinition.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
735
|
-
}
|
|
736
|
-
function SynsetRelationNode(node) {
|
|
737
|
-
const obj = {
|
|
738
|
-
relType: SynsetRelationRelType.parse(attr(node, "relType")),
|
|
739
|
-
target: attr(node, "target")
|
|
740
|
-
};
|
|
741
|
-
return SynsetRelation.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
742
|
-
}
|
|
743
|
-
function SyntacticBehaviorNode(node) {
|
|
744
|
-
const obj = {
|
|
745
|
-
id: attr(node, "id"),
|
|
746
|
-
subcategorizationFrame: attr(node, "subcategorizationFrame")
|
|
747
|
-
};
|
|
748
|
-
return SyntacticBehavior.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
749
|
-
}
|
|
750
|
-
function SynsetNode(node) {
|
|
751
|
-
const obj = {
|
|
752
|
-
id: attr(node, "id"),
|
|
753
|
-
ili: attr(node, "ili"),
|
|
754
|
-
lexfile: attr(node, "lexfile"),
|
|
755
|
-
members: attr(node, "members").split(" "),
|
|
756
|
-
dcSource: optAttr(node, "dc:source"),
|
|
757
|
-
partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
|
|
758
|
-
definitions: children(node, "Definition", (v) => DefinitionNode(v)),
|
|
759
|
-
examples: children(node, "Example", (v) => ExampleNode(v)),
|
|
760
|
-
iliDefinitions: children(node, "ILIDefinition", ILIDefinitionNode),
|
|
761
|
-
synsetRelations: children(node, "SynsetRelation", SynsetRelationNode)
|
|
762
|
-
};
|
|
763
|
-
return Synset.parse(
|
|
764
|
-
extendWithRestAttr(node, obj, (s) => s == "dc:source" ? "dcSource" : s)
|
|
765
|
-
);
|
|
766
|
-
}
|
|
767
|
-
function LexiconNode(node) {
|
|
768
|
-
const obj = {
|
|
769
|
-
id: attr(node, "id"),
|
|
770
|
-
label: attr(node, "label"),
|
|
771
|
-
language: attr(node, "language"),
|
|
772
|
-
email: attr(node, "email"),
|
|
773
|
-
license: attr(node, "license"),
|
|
774
|
-
version: attr(node, "version"),
|
|
775
|
-
citation: optAttr(node, "citation"),
|
|
776
|
-
url: attr(node, "url"),
|
|
777
|
-
lexicalEntries: children(node, "LexicalEntry", LexicalEntryNode),
|
|
778
|
-
synsets: children(node, "Synset", SynsetNode),
|
|
779
|
-
syntacticBehaviors: (
|
|
780
|
-
//
|
|
781
|
-
children(node, "SyntacticBehaviour", SyntacticBehaviorNode)
|
|
782
|
-
)
|
|
783
|
-
};
|
|
784
|
-
return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
785
|
-
}
|
|
786
|
-
var decodeXmlEntities = (s) => {
|
|
787
|
-
if (s === void 0) return void 0;
|
|
788
|
-
return s.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/'/g, "'").replace(/"/g, '"');
|
|
789
|
-
};
|
|
790
|
-
var attr = (node, attrName) => {
|
|
791
|
-
const value = decodeXmlEntities(node.attributes[attrName]);
|
|
792
|
-
if (value === void 0) {
|
|
793
|
-
throw new Error(`Missing required attribute "${attrName}" on node "${node.type}"`);
|
|
794
|
-
}
|
|
795
|
-
return value;
|
|
796
|
-
};
|
|
797
|
-
var optAttr = (node, attrName) => {
|
|
798
|
-
return decodeXmlEntities(node.attributes[attrName]);
|
|
799
|
-
};
|
|
800
|
-
var restAttrs = (node, obj, proxy) => {
|
|
801
|
-
const result = {};
|
|
802
|
-
Object.keys(node.attributes).filter((a) => !(proxy(a) in obj)).forEach((k) => {
|
|
803
|
-
result[k] = decodeXmlEntities(node.attributes[k]) ?? node.attributes[k];
|
|
804
|
-
});
|
|
805
|
-
return result;
|
|
806
|
-
};
|
|
807
|
-
var extendWithRestAttr = (node, obj, proxy) => {
|
|
808
|
-
return Object.assign(obj, restAttrs(node, obj, proxy));
|
|
809
|
-
};
|
|
810
|
-
var children = (node, type, fn) => {
|
|
811
|
-
return node.children.filter((v) => v.type == type).map((v) => fn(v));
|
|
812
|
-
};
|
|
813
|
-
|
|
814
943
|
// src/loader.ts
|
|
815
944
|
var BASE_VERSION = "2024";
|
|
816
945
|
function getFilename(version) {
|
|
@@ -876,7 +1005,6 @@ async function findLatestVersion(onProgress, cacheDir) {
|
|
|
876
1005
|
for (let year = baseYear + 1; year <= lastReleasableYear; year++) {
|
|
877
1006
|
const version = year.toString();
|
|
878
1007
|
if (await urlExists(getDownloadUrl(version))) {
|
|
879
|
-
continue;
|
|
880
1008
|
} else {
|
|
881
1009
|
return (year - 1).toString();
|
|
882
1010
|
}
|
|
@@ -897,9 +1025,13 @@ async function downloadWordNet(version, destPath) {
|
|
|
897
1025
|
const url = getDownloadUrl(version);
|
|
898
1026
|
const response = await fetch(url);
|
|
899
1027
|
if (!response.ok || !response.body) {
|
|
900
|
-
throw new Error(
|
|
1028
|
+
throw new Error(
|
|
1029
|
+
`Failed to download WordNet ${version}: ${response.statusText}`
|
|
1030
|
+
);
|
|
901
1031
|
}
|
|
902
|
-
const decompressed = response.body.pipeThrough(
|
|
1032
|
+
const decompressed = response.body.pipeThrough(
|
|
1033
|
+
new DecompressionStream("gzip")
|
|
1034
|
+
);
|
|
903
1035
|
const arrayBuffer = await new Response(decompressed).arrayBuffer();
|
|
904
1036
|
const dir = import_node_path.default.dirname(destPath);
|
|
905
1037
|
if (!(0, import_node_fs.existsSync)(dir)) {
|
|
@@ -909,8 +1041,9 @@ async function downloadWordNet(version, destPath) {
|
|
|
909
1041
|
}
|
|
910
1042
|
function createParser(filePath) {
|
|
911
1043
|
const resolvedPath = import_node_path.default.resolve(filePath);
|
|
912
|
-
const
|
|
913
|
-
|
|
1044
|
+
const nodeStream = (0, import_node_fs.createReadStream)(resolvedPath);
|
|
1045
|
+
const webStream = import_node_stream.Readable.toWeb(nodeStream);
|
|
1046
|
+
return parse(webStream, {
|
|
914
1047
|
ignoreDeclaration: false,
|
|
915
1048
|
silent: false
|
|
916
1049
|
});
|
|
@@ -1096,6 +1229,7 @@ function getSynsetWords(index, synset) {
|
|
|
1096
1229
|
PartsOfSpeech,
|
|
1097
1230
|
PartsOfSpeechLabels,
|
|
1098
1231
|
Pronunciation,
|
|
1232
|
+
SCHEMA,
|
|
1099
1233
|
Sense,
|
|
1100
1234
|
SenseId,
|
|
1101
1235
|
SenseRelation,
|
|
@@ -1115,6 +1249,7 @@ function getSynsetWords(index, synset) {
|
|
|
1115
1249
|
createParser,
|
|
1116
1250
|
decodeXmlEntities,
|
|
1117
1251
|
ensureWordNetCached,
|
|
1252
|
+
exportToSQLite,
|
|
1118
1253
|
fetchWordNet,
|
|
1119
1254
|
findLatestVersion,
|
|
1120
1255
|
findSenses,
|