synset 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -46,6 +46,7 @@ __export(src_exports, {
46
46
  PartsOfSpeech: () => PartsOfSpeech,
47
47
  PartsOfSpeechLabels: () => PartsOfSpeech2,
48
48
  Pronunciation: () => Pronunciation,
49
+ SCHEMA: () => SCHEMA,
49
50
  Sense: () => Sense,
50
51
  SenseId: () => SenseId,
51
52
  SenseRelation: () => SenseRelation,
@@ -65,6 +66,7 @@ __export(src_exports, {
65
66
  createParser: () => createParser,
66
67
  decodeXmlEntities: () => decodeXmlEntities,
67
68
  ensureWordNetCached: () => ensureWordNetCached,
69
+ exportToSQLite: () => exportToSQLite,
68
70
  fetchWordNet: () => fetchWordNet,
69
71
  findLatestVersion: () => findLatestVersion,
70
72
  findSenses: () => findSenses,
@@ -88,6 +90,9 @@ __export(src_exports, {
88
90
  });
89
91
  module.exports = __toCommonJS(src_exports);
90
92
 
93
+ // src/export-sqlite.ts
94
+ var import_bun_sqlite = require("bun:sqlite");
95
+
91
96
  // src/types.ts
92
97
  var import_zod = require("zod");
93
98
  var LexiconId = import_zod.z.string();
@@ -253,7 +258,290 @@ var Lexicon = import_zod.z.object({
253
258
  synsets: import_zod.z.array(Synset).min(0),
254
259
  syntacticBehaviors: import_zod.z.array(SyntacticBehavior).min(0)
255
260
  });
256
- var partsOfSpeechList = PartsOfSpeech.options.map((v) => v.value);
261
+ var partsOfSpeechList = PartsOfSpeech.options.map(
262
+ (v) => v.value
263
+ );
264
+
265
+ // src/helpers.ts
266
+ function PronunciationNode(node) {
267
+ const obj = {
268
+ variety: optAttr(node, "variety"),
269
+ inner: node.innerText
270
+ };
271
+ return Pronunciation.parse(extendWithRestAttr(node, obj, (s) => s));
272
+ }
273
+ function LemmaNode(node) {
274
+ const obj = {
275
+ writtenForm: attr(node, "writtenForm"),
276
+ partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
277
+ pronunciations: (
278
+ //
279
+ children(node, "Pronunciation", (v) => PronunciationNode(v))
280
+ )
281
+ };
282
+ return Lemma.parse(extendWithRestAttr(node, obj, (s) => s));
283
+ }
284
+ function SenseRelationNode(node) {
285
+ const obj = {
286
+ relType: SenseRelationRelType.parse(attr(node, "relType")),
287
+ target: attr(node, "target"),
288
+ dcType: optAttr(node, "dc:type")
289
+ };
290
+ return SenseRelation.parse(
291
+ extendWithRestAttr(node, obj, (s) => s === "dc:type" ? "dcType" : s)
292
+ );
293
+ }
294
+ function SenseNode(node) {
295
+ const adjPos = optAttr(node, "adjposition");
296
+ const obj = {
297
+ id: attr(node, "id"),
298
+ synset: SynsetId.parse(attr(node, "synset")),
299
+ senseRelations: children(node, "SenseRelation", SenseRelationNode),
300
+ subCat: optAttr(node, "subcat"),
301
+ adjPosition: adjPos ? AdjPosition.parse(adjPos) : void 0
302
+ };
303
+ return Sense.parse(
304
+ extendWithRestAttr(
305
+ node,
306
+ obj,
307
+ (s) => s === "subcat" ? "subCat" : s === "adjposition" ? "adjPosition" : s
308
+ )
309
+ );
310
+ }
311
+ function FormNode(node) {
312
+ const obj = {
313
+ writtenForm: attr(node, "writtenForm")
314
+ };
315
+ return Form.parse(extendWithRestAttr(node, obj, (s) => s));
316
+ }
317
+ function LexicalEntryNode(node) {
318
+ const obj = {
319
+ id: attr(node, "id"),
320
+ lemmas: children(node, "Lemma", LemmaNode),
321
+ senses: children(node, "Sense", SenseNode),
322
+ forms: children(node, "Form", FormNode)
323
+ };
324
+ return LexicalEntry.parse(extendWithRestAttr(node, obj, (s) => s));
325
+ }
326
+ function DefinitionNode(node) {
327
+ const obj = {
328
+ inner: node.innerText
329
+ };
330
+ return Definition.parse(extendWithRestAttr(node, obj, (s) => s));
331
+ }
332
+ function ExampleNode(node) {
333
+ const obj = {
334
+ inner: node.innerText,
335
+ dcSource: optAttr(node, "dc:source")
336
+ };
337
+ return Example.parse(
338
+ extendWithRestAttr(node, obj, (s) => s === "dc:source" ? "dcSource" : s)
339
+ );
340
+ }
341
+ function ILIDefinitionNode(node) {
342
+ const obj = {
343
+ inner: node.innerText
344
+ };
345
+ return ILIDefinition.parse(extendWithRestAttr(node, obj, (s) => s));
346
+ }
347
+ function SynsetRelationNode(node) {
348
+ const obj = {
349
+ relType: SynsetRelationRelType.parse(attr(node, "relType")),
350
+ target: attr(node, "target")
351
+ };
352
+ return SynsetRelation.parse(extendWithRestAttr(node, obj, (s) => s));
353
+ }
354
+ function SyntacticBehaviorNode(node) {
355
+ const obj = {
356
+ id: attr(node, "id"),
357
+ subcategorizationFrame: attr(node, "subcategorizationFrame")
358
+ };
359
+ return SyntacticBehavior.parse(extendWithRestAttr(node, obj, (s) => s));
360
+ }
361
+ function SynsetNode(node) {
362
+ const obj = {
363
+ id: attr(node, "id"),
364
+ ili: attr(node, "ili"),
365
+ lexfile: attr(node, "lexfile"),
366
+ members: attr(node, "members").split(" "),
367
+ dcSource: optAttr(node, "dc:source"),
368
+ partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
369
+ definitions: children(node, "Definition", (v) => DefinitionNode(v)),
370
+ examples: children(node, "Example", (v) => ExampleNode(v)),
371
+ iliDefinitions: children(node, "ILIDefinition", ILIDefinitionNode),
372
+ synsetRelations: children(node, "SynsetRelation", SynsetRelationNode)
373
+ };
374
+ return Synset.parse(
375
+ extendWithRestAttr(node, obj, (s) => s === "dc:source" ? "dcSource" : s)
376
+ );
377
+ }
378
+ function LexiconNode(node) {
379
+ const obj = {
380
+ id: attr(node, "id"),
381
+ label: attr(node, "label"),
382
+ language: attr(node, "language"),
383
+ email: attr(node, "email"),
384
+ license: attr(node, "license"),
385
+ version: attr(node, "version"),
386
+ citation: optAttr(node, "citation"),
387
+ url: attr(node, "url"),
388
+ lexicalEntries: children(node, "LexicalEntry", LexicalEntryNode),
389
+ synsets: children(node, "Synset", SynsetNode),
390
+ syntacticBehaviors: (
391
+ //
392
+ children(node, "SyntacticBehaviour", SyntacticBehaviorNode)
393
+ )
394
+ };
395
+ return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
396
+ }
397
+ var decodeXmlEntities = (s) => {
398
+ if (s === void 0) return void 0;
399
+ return s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&apos;/g, "'").replace(/&quot;/g, '"');
400
+ };
401
+ var attr = (node, attrName) => {
402
+ const value = decodeXmlEntities(node.attributes[attrName]);
403
+ if (value === void 0) {
404
+ throw new Error(
405
+ `Missing required attribute "${attrName}" on node "${node.type}"`
406
+ );
407
+ }
408
+ return value;
409
+ };
410
+ var optAttr = (node, attrName) => {
411
+ return decodeXmlEntities(node.attributes[attrName]);
412
+ };
413
+ var restAttrs = (node, obj, proxy) => {
414
+ const result = {};
415
+ Object.keys(node.attributes).filter((a) => !(proxy(a) in obj)).forEach((k) => {
416
+ result[k] = decodeXmlEntities(node.attributes[k]) ?? node.attributes[k];
417
+ });
418
+ return result;
419
+ };
420
+ var extendWithRestAttr = (node, obj, proxy) => {
421
+ return Object.assign(obj, restAttrs(node, obj, proxy));
422
+ };
423
+ var children = (node, type, fn) => {
424
+ return node.children.filter((v) => v.type === type).map((v) => fn(v));
425
+ };
426
+
427
+ // src/export-sqlite.ts
428
+ var SCHEMA = `
429
+ CREATE TABLE IF NOT EXISTS words (
430
+ id INTEGER PRIMARY KEY,
431
+ word TEXT NOT NULL,
432
+ word_display TEXT NOT NULL
433
+ );
434
+ CREATE INDEX IF NOT EXISTS idx_words_word ON words(word);
435
+
436
+ CREATE TABLE IF NOT EXISTS synsets (
437
+ id TEXT PRIMARY KEY,
438
+ pos TEXT NOT NULL,
439
+ definition TEXT NOT NULL
440
+ );
441
+
442
+ CREATE TABLE IF NOT EXISTS word_synsets (
443
+ word_id INTEGER NOT NULL,
444
+ synset_id TEXT NOT NULL,
445
+ PRIMARY KEY (word_id, synset_id)
446
+ );
447
+ CREATE INDEX IF NOT EXISTS idx_ws_word ON word_synsets(word_id);
448
+ `;
449
+ function exportToSQLite(lexicon, outputPath, options = {}) {
450
+ const { onProgress } = options;
451
+ const db = new import_bun_sqlite.Database(outputPath, { create: true });
452
+ db.exec("PRAGMA journal_mode = OFF");
453
+ db.exec("PRAGMA synchronous = OFF");
454
+ db.exec(SCHEMA);
455
+ const wordToEntries = /* @__PURE__ */ new Map();
456
+ for (const entry of lexicon.lexicalEntries) {
457
+ const word = entry.lemmas[0]?.writtenForm;
458
+ if (word) {
459
+ const lower = word.toLowerCase();
460
+ const existing = wordToEntries.get(lower) || [];
461
+ existing.push(entry);
462
+ wordToEntries.set(lower, existing);
463
+ }
464
+ }
465
+ const synsetMap = /* @__PURE__ */ new Map();
466
+ for (const synset of lexicon.synsets) {
467
+ synsetMap.set(synset.id, synset);
468
+ }
469
+ const insertWord = db.prepare(
470
+ "INSERT INTO words (word, word_display) VALUES (?, ?)"
471
+ );
472
+ const wordIds = /* @__PURE__ */ new Map();
473
+ const words = Array.from(wordToEntries.keys()).sort();
474
+ const totalWords = words.length;
475
+ db.exec("BEGIN TRANSACTION");
476
+ let wordId = 0;
477
+ for (let i = 0; i < words.length; i++) {
478
+ const word = words[i];
479
+ const entries = wordToEntries.get(word);
480
+ if (!entries) continue;
481
+ const display = entries[0].lemmas[0]?.writtenForm || word;
482
+ insertWord.run(word, display);
483
+ wordId++;
484
+ wordIds.set(word, wordId);
485
+ if (onProgress && i % 1e4 === 0) {
486
+ onProgress({ phase: "words", current: i, total: totalWords });
487
+ }
488
+ }
489
+ db.exec("COMMIT");
490
+ const usedSynsetIds = /* @__PURE__ */ new Set();
491
+ for (const entries of wordToEntries.values()) {
492
+ for (const entry of entries) {
493
+ for (const sense of entry.senses) {
494
+ usedSynsetIds.add(sense.synset);
495
+ }
496
+ }
497
+ }
498
+ const insertSynset = db.prepare(
499
+ "INSERT OR IGNORE INTO synsets (id, pos, definition) VALUES (?, ?, ?)"
500
+ );
501
+ const synsetList = Array.from(usedSynsetIds);
502
+ const totalSynsets = synsetList.length;
503
+ db.exec("BEGIN TRANSACTION");
504
+ for (let i = 0; i < synsetList.length; i++) {
505
+ const synsetId = synsetList[i];
506
+ const synset = synsetMap.get(synsetId);
507
+ if (synset) {
508
+ const def = decodeXmlEntities(synset.definitions[0]?.inner) || "";
509
+ insertSynset.run(synsetId, synset.partOfSpeech, def);
510
+ }
511
+ if (onProgress && i % 1e4 === 0) {
512
+ onProgress({ phase: "synsets", current: i, total: totalSynsets });
513
+ }
514
+ }
515
+ db.exec("COMMIT");
516
+ const insertRelation = db.prepare(
517
+ "INSERT OR IGNORE INTO word_synsets (word_id, synset_id) VALUES (?, ?)"
518
+ );
519
+ let relationCount = 0;
520
+ const totalRelations = Array.from(wordToEntries.values()).reduce(
521
+ (sum, entries) => sum + entries.reduce((s, e) => s + e.senses.length, 0),
522
+ 0
523
+ );
524
+ db.exec("BEGIN TRANSACTION");
525
+ for (const [word, entries] of wordToEntries) {
526
+ const wordId2 = wordIds.get(word);
527
+ if (!wordId2) continue;
528
+ for (const entry of entries) {
529
+ for (const sense of entry.senses) {
530
+ insertRelation.run(wordId2, sense.synset);
531
+ relationCount++;
532
+ if (onProgress && relationCount % 1e4 === 0) {
533
+ onProgress({
534
+ phase: "relations",
535
+ current: relationCount,
536
+ total: totalRelations
537
+ });
538
+ }
539
+ }
540
+ }
541
+ }
542
+ db.exec("COMMIT");
543
+ db.close();
544
+ }
257
545
 
258
546
  // src/literals.ts
259
547
  var PartsOfSpeech2 = {
@@ -408,6 +696,7 @@ var AdjPosition2 = {
408
696
  // src/loader.ts
409
697
  var import_node_fs = require("fs");
410
698
  var import_node_path = __toESM(require("path"), 1);
699
+ var import_node_stream = require("stream");
411
700
 
412
701
  // node_modules/@dbushell/xml-streamify/src/node.ts
413
702
  var Node = class {
@@ -651,166 +940,6 @@ async function* parse(input, options) {
651
940
  return document;
652
941
  }
653
942
 
654
- // src/helpers.ts
655
- function PronunciationNode(node) {
656
- const obj = {
657
- variety: optAttr(node, "variety"),
658
- inner: node.innerText
659
- };
660
- return Pronunciation.parse(extendWithRestAttr(node, obj, (s) => s));
661
- }
662
- function LemmaNode(node) {
663
- const obj = {
664
- writtenForm: attr(node, "writtenForm"),
665
- partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
666
- pronunciations: (
667
- //
668
- children(node, "Pronunciation", (v) => PronunciationNode(v))
669
- )
670
- };
671
- return Lemma.parse(extendWithRestAttr(node, obj, (s) => s));
672
- }
673
- function SenseRelationNode(node) {
674
- const obj = {
675
- relType: SenseRelationRelType.parse(attr(node, "relType")),
676
- target: attr(node, "target"),
677
- dcType: optAttr(node, "dc:type")
678
- };
679
- return SenseRelation.parse(
680
- extendWithRestAttr(node, obj, (s) => s == "dc:type" ? "dcType" : s)
681
- );
682
- }
683
- function SenseNode(node) {
684
- const adjPos = optAttr(node, "adjposition");
685
- const obj = {
686
- id: attr(node, "id"),
687
- synset: SynsetId.parse(attr(node, "synset")),
688
- senseRelations: children(node, "SenseRelation", SenseRelationNode),
689
- subCat: optAttr(node, "subcat"),
690
- adjPosition: adjPos ? AdjPosition.parse(adjPos) : void 0
691
- };
692
- return Sense.parse(
693
- extendWithRestAttr(
694
- node,
695
- obj,
696
- (s) => s == "subcat" ? "subCat" : s == "adjposition" ? "adjPosition" : s
697
- )
698
- );
699
- }
700
- function FormNode(node) {
701
- const obj = {
702
- writtenForm: attr(node, "writtenForm")
703
- };
704
- return Form.parse(extendWithRestAttr(node, obj, (s) => s));
705
- }
706
- function LexicalEntryNode(node) {
707
- const obj = {
708
- id: attr(node, "id"),
709
- lemmas: children(node, "Lemma", LemmaNode),
710
- senses: children(node, "Sense", SenseNode),
711
- forms: children(node, "Form", FormNode)
712
- };
713
- return LexicalEntry.parse(extendWithRestAttr(node, obj, (s) => s));
714
- }
715
- function DefinitionNode(node) {
716
- const obj = {
717
- inner: node.innerText
718
- };
719
- return Definition.parse(extendWithRestAttr(node, obj, (s) => s));
720
- }
721
- function ExampleNode(node) {
722
- const obj = {
723
- inner: node.innerText,
724
- dcSource: optAttr(node, "dc:source")
725
- };
726
- return Example.parse(
727
- extendWithRestAttr(node, obj, (s) => s == "dc:source" ? "dcSource" : s)
728
- );
729
- }
730
- function ILIDefinitionNode(node) {
731
- const obj = {
732
- inner: node.innerText
733
- };
734
- return ILIDefinition.parse(extendWithRestAttr(node, obj, (s) => s));
735
- }
736
- function SynsetRelationNode(node) {
737
- const obj = {
738
- relType: SynsetRelationRelType.parse(attr(node, "relType")),
739
- target: attr(node, "target")
740
- };
741
- return SynsetRelation.parse(extendWithRestAttr(node, obj, (s) => s));
742
- }
743
- function SyntacticBehaviorNode(node) {
744
- const obj = {
745
- id: attr(node, "id"),
746
- subcategorizationFrame: attr(node, "subcategorizationFrame")
747
- };
748
- return SyntacticBehavior.parse(extendWithRestAttr(node, obj, (s) => s));
749
- }
750
- function SynsetNode(node) {
751
- const obj = {
752
- id: attr(node, "id"),
753
- ili: attr(node, "ili"),
754
- lexfile: attr(node, "lexfile"),
755
- members: attr(node, "members").split(" "),
756
- dcSource: optAttr(node, "dc:source"),
757
- partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
758
- definitions: children(node, "Definition", (v) => DefinitionNode(v)),
759
- examples: children(node, "Example", (v) => ExampleNode(v)),
760
- iliDefinitions: children(node, "ILIDefinition", ILIDefinitionNode),
761
- synsetRelations: children(node, "SynsetRelation", SynsetRelationNode)
762
- };
763
- return Synset.parse(
764
- extendWithRestAttr(node, obj, (s) => s == "dc:source" ? "dcSource" : s)
765
- );
766
- }
767
- function LexiconNode(node) {
768
- const obj = {
769
- id: attr(node, "id"),
770
- label: attr(node, "label"),
771
- language: attr(node, "language"),
772
- email: attr(node, "email"),
773
- license: attr(node, "license"),
774
- version: attr(node, "version"),
775
- citation: optAttr(node, "citation"),
776
- url: attr(node, "url"),
777
- lexicalEntries: children(node, "LexicalEntry", LexicalEntryNode),
778
- synsets: children(node, "Synset", SynsetNode),
779
- syntacticBehaviors: (
780
- //
781
- children(node, "SyntacticBehaviour", SyntacticBehaviorNode)
782
- )
783
- };
784
- return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
785
- }
786
- var decodeXmlEntities = (s) => {
787
- if (s === void 0) return void 0;
788
- return s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&apos;/g, "'").replace(/&quot;/g, '"');
789
- };
790
- var attr = (node, attrName) => {
791
- const value = decodeXmlEntities(node.attributes[attrName]);
792
- if (value === void 0) {
793
- throw new Error(`Missing required attribute "${attrName}" on node "${node.type}"`);
794
- }
795
- return value;
796
- };
797
- var optAttr = (node, attrName) => {
798
- return decodeXmlEntities(node.attributes[attrName]);
799
- };
800
- var restAttrs = (node, obj, proxy) => {
801
- const result = {};
802
- Object.keys(node.attributes).filter((a) => !(proxy(a) in obj)).forEach((k) => {
803
- result[k] = decodeXmlEntities(node.attributes[k]) ?? node.attributes[k];
804
- });
805
- return result;
806
- };
807
- var extendWithRestAttr = (node, obj, proxy) => {
808
- return Object.assign(obj, restAttrs(node, obj, proxy));
809
- };
810
- var children = (node, type, fn) => {
811
- return node.children.filter((v) => v.type == type).map((v) => fn(v));
812
- };
813
-
814
943
  // src/loader.ts
815
944
  var BASE_VERSION = "2024";
816
945
  function getFilename(version) {
@@ -876,7 +1005,6 @@ async function findLatestVersion(onProgress, cacheDir) {
876
1005
  for (let year = baseYear + 1; year <= lastReleasableYear; year++) {
877
1006
  const version = year.toString();
878
1007
  if (await urlExists(getDownloadUrl(version))) {
879
- continue;
880
1008
  } else {
881
1009
  return (year - 1).toString();
882
1010
  }
@@ -897,9 +1025,13 @@ async function downloadWordNet(version, destPath) {
897
1025
  const url = getDownloadUrl(version);
898
1026
  const response = await fetch(url);
899
1027
  if (!response.ok || !response.body) {
900
- throw new Error(`Failed to download WordNet ${version}: ${response.statusText}`);
1028
+ throw new Error(
1029
+ `Failed to download WordNet ${version}: ${response.statusText}`
1030
+ );
901
1031
  }
902
- const decompressed = response.body.pipeThrough(new DecompressionStream("gzip"));
1032
+ const decompressed = response.body.pipeThrough(
1033
+ new DecompressionStream("gzip")
1034
+ );
903
1035
  const arrayBuffer = await new Response(decompressed).arrayBuffer();
904
1036
  const dir = import_node_path.default.dirname(destPath);
905
1037
  if (!(0, import_node_fs.existsSync)(dir)) {
@@ -909,8 +1041,9 @@ async function downloadWordNet(version, destPath) {
909
1041
  }
910
1042
  function createParser(filePath) {
911
1043
  const resolvedPath = import_node_path.default.resolve(filePath);
912
- const fileUrl = resolvedPath.startsWith("/") ? `file://${resolvedPath}` : `file:///${resolvedPath.replace(/\\/g, "/")}`;
913
- return parse(fileUrl, {
1044
+ const nodeStream = (0, import_node_fs.createReadStream)(resolvedPath);
1045
+ const webStream = import_node_stream.Readable.toWeb(nodeStream);
1046
+ return parse(webStream, {
914
1047
  ignoreDeclaration: false,
915
1048
  silent: false
916
1049
  });
@@ -1096,6 +1229,7 @@ function getSynsetWords(index, synset) {
1096
1229
  PartsOfSpeech,
1097
1230
  PartsOfSpeechLabels,
1098
1231
  Pronunciation,
1232
+ SCHEMA,
1099
1233
  Sense,
1100
1234
  SenseId,
1101
1235
  SenseRelation,
@@ -1115,6 +1249,7 @@ function getSynsetWords(index, synset) {
1115
1249
  createParser,
1116
1250
  decodeXmlEntities,
1117
1251
  ensureWordNetCached,
1252
+ exportToSQLite,
1118
1253
  fetchWordNet,
1119
1254
  findLatestVersion,
1120
1255
  findSenses,