synset 0.9.0 → 0.9.2

This diff shows the content of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
package/dist/index.js CHANGED
@@ -1,3 +1,6 @@
+ // src/export-sqlite.ts
+ import { Database } from "bun:sqlite";
+
  // src/types.ts
  import { z } from "zod";
  var LexiconId = z.string();
@@ -163,7 +166,290 @@ var Lexicon = z.object({
  synsets: z.array(Synset).min(0),
  syntacticBehaviors: z.array(SyntacticBehavior).min(0)
  });
- var partsOfSpeechList = PartsOfSpeech.options.map((v) => v.value);
+ var partsOfSpeechList = PartsOfSpeech.options.map(
+ (v) => v.value
+ );
+
+ // src/helpers.ts
+ function PronunciationNode(node) {
+ const obj = {
+ variety: optAttr(node, "variety"),
+ inner: node.innerText
+ };
+ return Pronunciation.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function LemmaNode(node) {
+ const obj = {
+ writtenForm: attr(node, "writtenForm"),
+ partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
+ pronunciations: (
+ //
+ children(node, "Pronunciation", (v) => PronunciationNode(v))
+ )
+ };
+ return Lemma.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function SenseRelationNode(node) {
+ const obj = {
+ relType: SenseRelationRelType.parse(attr(node, "relType")),
+ target: attr(node, "target"),
+ dcType: optAttr(node, "dc:type")
+ };
+ return SenseRelation.parse(
+ extendWithRestAttr(node, obj, (s) => s === "dc:type" ? "dcType" : s)
+ );
+ }
+ function SenseNode(node) {
+ const adjPos = optAttr(node, "adjposition");
+ const obj = {
+ id: attr(node, "id"),
+ synset: SynsetId.parse(attr(node, "synset")),
+ senseRelations: children(node, "SenseRelation", SenseRelationNode),
+ subCat: optAttr(node, "subcat"),
+ adjPosition: adjPos ? AdjPosition.parse(adjPos) : void 0
+ };
+ return Sense.parse(
+ extendWithRestAttr(
+ node,
+ obj,
+ (s) => s === "subcat" ? "subCat" : s === "adjposition" ? "adjPosition" : s
+ )
+ );
+ }
+ function FormNode(node) {
+ const obj = {
+ writtenForm: attr(node, "writtenForm")
+ };
+ return Form.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function LexicalEntryNode(node) {
+ const obj = {
+ id: attr(node, "id"),
+ lemmas: children(node, "Lemma", LemmaNode),
+ senses: children(node, "Sense", SenseNode),
+ forms: children(node, "Form", FormNode)
+ };
+ return LexicalEntry.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function DefinitionNode(node) {
+ const obj = {
+ inner: node.innerText
+ };
+ return Definition.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function ExampleNode(node) {
+ const obj = {
+ inner: node.innerText,
+ dcSource: optAttr(node, "dc:source")
+ };
+ return Example.parse(
+ extendWithRestAttr(node, obj, (s) => s === "dc:source" ? "dcSource" : s)
+ );
+ }
+ function ILIDefinitionNode(node) {
+ const obj = {
+ inner: node.innerText
+ };
+ return ILIDefinition.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function SynsetRelationNode(node) {
+ const obj = {
+ relType: SynsetRelationRelType.parse(attr(node, "relType")),
+ target: attr(node, "target")
+ };
+ return SynsetRelation.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function SyntacticBehaviorNode(node) {
+ const obj = {
+ id: attr(node, "id"),
+ subcategorizationFrame: attr(node, "subcategorizationFrame")
+ };
+ return SyntacticBehavior.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ function SynsetNode(node) {
+ const obj = {
+ id: attr(node, "id"),
+ ili: attr(node, "ili"),
+ lexfile: attr(node, "lexfile"),
+ members: attr(node, "members").split(" "),
+ dcSource: optAttr(node, "dc:source"),
+ partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
+ definitions: children(node, "Definition", (v) => DefinitionNode(v)),
+ examples: children(node, "Example", (v) => ExampleNode(v)),
+ iliDefinitions: children(node, "ILIDefinition", ILIDefinitionNode),
+ synsetRelations: children(node, "SynsetRelation", SynsetRelationNode)
+ };
+ return Synset.parse(
+ extendWithRestAttr(node, obj, (s) => s === "dc:source" ? "dcSource" : s)
+ );
+ }
+ function LexiconNode(node) {
+ const obj = {
+ id: attr(node, "id"),
+ label: attr(node, "label"),
+ language: attr(node, "language"),
+ email: attr(node, "email"),
+ license: attr(node, "license"),
+ version: attr(node, "version"),
+ citation: optAttr(node, "citation"),
+ url: attr(node, "url"),
+ lexicalEntries: children(node, "LexicalEntry", LexicalEntryNode),
+ synsets: children(node, "Synset", SynsetNode),
+ syntacticBehaviors: (
+ //
+ children(node, "SyntacticBehaviour", SyntacticBehaviorNode)
+ )
+ };
+ return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
+ }
+ var decodeXmlEntities = (s) => {
+ if (s === void 0) return void 0;
+ return s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&apos;/g, "'").replace(/&quot;/g, '"');
+ };
+ var attr = (node, attrName) => {
+ const value = decodeXmlEntities(node.attributes[attrName]);
+ if (value === void 0) {
+ throw new Error(
+ `Missing required attribute "${attrName}" on node "${node.type}"`
+ );
+ }
+ return value;
+ };
+ var optAttr = (node, attrName) => {
+ return decodeXmlEntities(node.attributes[attrName]);
+ };
+ var restAttrs = (node, obj, proxy) => {
+ const result = {};
+ Object.keys(node.attributes).filter((a) => !(proxy(a) in obj)).forEach((k) => {
+ result[k] = decodeXmlEntities(node.attributes[k]) ?? node.attributes[k];
+ });
+ return result;
+ };
+ var extendWithRestAttr = (node, obj, proxy) => {
+ return Object.assign(obj, restAttrs(node, obj, proxy));
+ };
+ var children = (node, type, fn) => {
+ return node.children.filter((v) => v.type === type).map((v) => fn(v));
+ };
+
+ // src/export-sqlite.ts
+ var SCHEMA = `
+ CREATE TABLE IF NOT EXISTS words (
+ id INTEGER PRIMARY KEY,
+ word TEXT NOT NULL,
+ word_display TEXT NOT NULL
+ );
+ CREATE INDEX IF NOT EXISTS idx_words_word ON words(word);
+
+ CREATE TABLE IF NOT EXISTS synsets (
+ id TEXT PRIMARY KEY,
+ pos TEXT NOT NULL,
+ definition TEXT NOT NULL
+ );
+
+ CREATE TABLE IF NOT EXISTS word_synsets (
+ word_id INTEGER NOT NULL,
+ synset_id TEXT NOT NULL,
+ PRIMARY KEY (word_id, synset_id)
+ );
+ CREATE INDEX IF NOT EXISTS idx_ws_word ON word_synsets(word_id);
+ `;
+ function exportToSQLite(lexicon, outputPath, options = {}) {
+ const { onProgress } = options;
+ const db = new Database(outputPath, { create: true });
+ db.exec("PRAGMA journal_mode = OFF");
+ db.exec("PRAGMA synchronous = OFF");
+ db.exec(SCHEMA);
+ const wordToEntries = /* @__PURE__ */ new Map();
+ for (const entry of lexicon.lexicalEntries) {
+ const word = entry.lemmas[0]?.writtenForm;
+ if (word) {
+ const lower = word.toLowerCase();
+ const existing = wordToEntries.get(lower) || [];
+ existing.push(entry);
+ wordToEntries.set(lower, existing);
+ }
+ }
+ const synsetMap = /* @__PURE__ */ new Map();
+ for (const synset of lexicon.synsets) {
+ synsetMap.set(synset.id, synset);
+ }
+ const insertWord = db.prepare(
+ "INSERT INTO words (word, word_display) VALUES (?, ?)"
+ );
+ const wordIds = /* @__PURE__ */ new Map();
+ const words = Array.from(wordToEntries.keys()).sort();
+ const totalWords = words.length;
+ db.exec("BEGIN TRANSACTION");
+ let wordId = 0;
+ for (let i = 0; i < words.length; i++) {
+ const word = words[i];
+ const entries = wordToEntries.get(word);
+ if (!entries) continue;
+ const display = entries[0].lemmas[0]?.writtenForm || word;
+ insertWord.run(word, display);
+ wordId++;
+ wordIds.set(word, wordId);
+ if (onProgress && i % 1e4 === 0) {
+ onProgress({ phase: "words", current: i, total: totalWords });
+ }
+ }
+ db.exec("COMMIT");
+ const usedSynsetIds = /* @__PURE__ */ new Set();
+ for (const entries of wordToEntries.values()) {
+ for (const entry of entries) {
+ for (const sense of entry.senses) {
+ usedSynsetIds.add(sense.synset);
+ }
+ }
+ }
+ const insertSynset = db.prepare(
+ "INSERT OR IGNORE INTO synsets (id, pos, definition) VALUES (?, ?, ?)"
+ );
+ const synsetList = Array.from(usedSynsetIds);
+ const totalSynsets = synsetList.length;
+ db.exec("BEGIN TRANSACTION");
+ for (let i = 0; i < synsetList.length; i++) {
+ const synsetId = synsetList[i];
+ const synset = synsetMap.get(synsetId);
+ if (synset) {
+ const def = decodeXmlEntities(synset.definitions[0]?.inner) || "";
+ insertSynset.run(synsetId, synset.partOfSpeech, def);
+ }
+ if (onProgress && i % 1e4 === 0) {
+ onProgress({ phase: "synsets", current: i, total: totalSynsets });
+ }
+ }
+ db.exec("COMMIT");
+ const insertRelation = db.prepare(
+ "INSERT OR IGNORE INTO word_synsets (word_id, synset_id) VALUES (?, ?)"
+ );
+ let relationCount = 0;
+ const totalRelations = Array.from(wordToEntries.values()).reduce(
+ (sum, entries) => sum + entries.reduce((s, e) => s + e.senses.length, 0),
+ 0
+ );
+ db.exec("BEGIN TRANSACTION");
+ for (const [word, entries] of wordToEntries) {
+ const wordId2 = wordIds.get(word);
+ if (!wordId2) continue;
+ for (const entry of entries) {
+ for (const sense of entry.senses) {
+ insertRelation.run(wordId2, sense.synset);
+ relationCount++;
+ if (onProgress && relationCount % 1e4 === 0) {
+ onProgress({
+ phase: "relations",
+ current: relationCount,
+ total: totalRelations
+ });
+ }
+ }
+ }
+ }
+ db.exec("COMMIT");
+ db.close();
+ }

  // src/literals.ts
  var PartsOfSpeech2 = {
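
The hunk above contains the headline change in 0.9.x: a new src/export-sqlite.ts module whose exportToSQLite(lexicon, outputPath, options) walks a parsed lexicon and bulk-loads it into the three tables declared in SCHEMA (words, synsets, word_synsets) using bun:sqlite, reporting progress every 10,000 rows. A minimal usage sketch follows; it assumes the Bun runtime, a lexicon object already produced by the package's loader APIs, and an illustrative database path and query, none of which are prescribed by the package itself.

import { Database } from "bun:sqlite";
import { exportToSQLite } from "synset";

// `lexicon` is assumed to come from this package's loader/parser APIs.
export function exportAndQuery(lexicon: any, dbPath = "wordnet.sqlite") {
  exportToSQLite(lexicon, dbPath, {
    onProgress: (p: { phase: string; current: number; total: number }) =>
      console.log(`${p.phase}: ${p.current}/${p.total}`),
  });

  // The tables created by SCHEMA can then be queried directly, e.g. to list
  // the synsets recorded for a (lower-cased) word:
  const db = new Database(dbPath);
  return db
    .query(
      "SELECT s.id, s.pos, s.definition FROM words w " +
        "JOIN word_synsets ws ON ws.word_id = w.id " +
        "JOIN synsets s ON s.id = ws.synset_id WHERE w.word = ?"
    )
    .all("bank");
}

The implementation turns off the journal and synchronous writes and wraps each phase in a single transaction, trading durability for bulk-load speed, which is also why onProgress only fires every 10,000 rows.
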
@@ -316,8 +602,16 @@ var AdjPosition2 = {
  };

  // src/loader.ts
- import { existsSync, statSync, writeFileSync, mkdirSync, readdirSync } from "fs";
+ import {
+ createReadStream,
+ existsSync,
+ mkdirSync,
+ readdirSync,
+ statSync,
+ writeFileSync
+ } from "fs";
  import path from "path";
+ import { Readable } from "stream";

  // node_modules/@dbushell/xml-streamify/src/node.ts
  var Node = class {
@@ -561,166 +855,6 @@ async function* parse(input, options) {
  return document;
  }

- // src/helpers.ts
- function PronunciationNode(node) {
- const obj = {
- variety: optAttr(node, "variety"),
- inner: node.innerText
- };
- return Pronunciation.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function LemmaNode(node) {
- const obj = {
- writtenForm: attr(node, "writtenForm"),
- partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
- pronunciations: (
- //
- children(node, "Pronunciation", (v) => PronunciationNode(v))
- )
- };
- return Lemma.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function SenseRelationNode(node) {
- const obj = {
- relType: SenseRelationRelType.parse(attr(node, "relType")),
- target: attr(node, "target"),
- dcType: optAttr(node, "dc:type")
- };
- return SenseRelation.parse(
- extendWithRestAttr(node, obj, (s) => s == "dc:type" ? "dcType" : s)
- );
- }
- function SenseNode(node) {
- const adjPos = optAttr(node, "adjposition");
- const obj = {
- id: attr(node, "id"),
- synset: SynsetId.parse(attr(node, "synset")),
- senseRelations: children(node, "SenseRelation", SenseRelationNode),
- subCat: optAttr(node, "subcat"),
- adjPosition: adjPos ? AdjPosition.parse(adjPos) : void 0
- };
- return Sense.parse(
- extendWithRestAttr(
- node,
- obj,
- (s) => s == "subcat" ? "subCat" : s == "adjposition" ? "adjPosition" : s
- )
- );
- }
- function FormNode(node) {
- const obj = {
- writtenForm: attr(node, "writtenForm")
- };
- return Form.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function LexicalEntryNode(node) {
- const obj = {
- id: attr(node, "id"),
- lemmas: children(node, "Lemma", LemmaNode),
- senses: children(node, "Sense", SenseNode),
- forms: children(node, "Form", FormNode)
- };
- return LexicalEntry.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function DefinitionNode(node) {
- const obj = {
- inner: node.innerText
- };
- return Definition.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function ExampleNode(node) {
- const obj = {
- inner: node.innerText,
- dcSource: optAttr(node, "dc:source")
- };
- return Example.parse(
- extendWithRestAttr(node, obj, (s) => s == "dc:source" ? "dcSource" : s)
- );
- }
- function ILIDefinitionNode(node) {
- const obj = {
- inner: node.innerText
- };
- return ILIDefinition.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function SynsetRelationNode(node) {
- const obj = {
- relType: SynsetRelationRelType.parse(attr(node, "relType")),
- target: attr(node, "target")
- };
- return SynsetRelation.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function SyntacticBehaviorNode(node) {
- const obj = {
- id: attr(node, "id"),
- subcategorizationFrame: attr(node, "subcategorizationFrame")
- };
- return SyntacticBehavior.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- function SynsetNode(node) {
- const obj = {
- id: attr(node, "id"),
- ili: attr(node, "ili"),
- lexfile: attr(node, "lexfile"),
- members: attr(node, "members").split(" "),
- dcSource: optAttr(node, "dc:source"),
- partOfSpeech: PartsOfSpeech.parse(attr(node, "partOfSpeech")),
- definitions: children(node, "Definition", (v) => DefinitionNode(v)),
- examples: children(node, "Example", (v) => ExampleNode(v)),
- iliDefinitions: children(node, "ILIDefinition", ILIDefinitionNode),
- synsetRelations: children(node, "SynsetRelation", SynsetRelationNode)
- };
- return Synset.parse(
- extendWithRestAttr(node, obj, (s) => s == "dc:source" ? "dcSource" : s)
- );
- }
- function LexiconNode(node) {
- const obj = {
- id: attr(node, "id"),
- label: attr(node, "label"),
- language: attr(node, "language"),
- email: attr(node, "email"),
- license: attr(node, "license"),
- version: attr(node, "version"),
- citation: optAttr(node, "citation"),
- url: attr(node, "url"),
- lexicalEntries: children(node, "LexicalEntry", LexicalEntryNode),
- synsets: children(node, "Synset", SynsetNode),
- syntacticBehaviors: (
- //
- children(node, "SyntacticBehaviour", SyntacticBehaviorNode)
- )
- };
- return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
- }
- var decodeXmlEntities = (s) => {
- if (s === void 0) return void 0;
- return s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&apos;/g, "'").replace(/&quot;/g, '"');
- };
- var attr = (node, attrName) => {
- const value = decodeXmlEntities(node.attributes[attrName]);
- if (value === void 0) {
- throw new Error(`Missing required attribute "${attrName}" on node "${node.type}"`);
- }
- return value;
- };
- var optAttr = (node, attrName) => {
- return decodeXmlEntities(node.attributes[attrName]);
- };
- var restAttrs = (node, obj, proxy) => {
- const result = {};
- Object.keys(node.attributes).filter((a) => !(proxy(a) in obj)).forEach((k) => {
- result[k] = decodeXmlEntities(node.attributes[k]) ?? node.attributes[k];
- });
- return result;
- };
- var extendWithRestAttr = (node, obj, proxy) => {
- return Object.assign(obj, restAttrs(node, obj, proxy));
- };
- var children = (node, type, fn) => {
- return node.children.filter((v) => v.type == type).map((v) => fn(v));
- };
-
  // src/loader.ts
  var BASE_VERSION = "2024";
  function getFilename(version) {
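
The block removed in this hunk is the src/helpers.ts module that 0.9.x re-emits near the top of the bundle (with loose == comparisons tightened to ===); no helper is actually dropped. One of them, decodeXmlEntities, stays on the public export list, and its behavior is fully visible above: it rewrites only the five predefined XML entities and passes undefined through. A small illustration, with expected values shown as comments:

import { decodeXmlEntities } from "synset";

// Only the five predefined XML entities are rewritten; anything else,
// including numeric character references, passes through untouched.
decodeXmlEntities("Tom &amp; Jerry");              // "Tom & Jerry"
decodeXmlEntities("&lt;a href=&quot;x&quot;&gt;"); // '<a href="x">'
decodeXmlEntities("&#8217;");                      // "&#8217;" (unchanged)
decodeXmlEntities(undefined);                      // undefined
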
@@ -786,7 +920,6 @@ async function findLatestVersion(onProgress, cacheDir) {
  for (let year = baseYear + 1; year <= lastReleasableYear; year++) {
  const version = year.toString();
  if (await urlExists(getDownloadUrl(version))) {
- continue;
  } else {
  return (year - 1).toString();
  }
@@ -807,9 +940,13 @@ async function downloadWordNet(version, destPath) {
  const url = getDownloadUrl(version);
  const response = await fetch(url);
  if (!response.ok || !response.body) {
- throw new Error(`Failed to download WordNet ${version}: ${response.statusText}`);
+ throw new Error(
+ `Failed to download WordNet ${version}: ${response.statusText}`
+ );
  }
- const decompressed = response.body.pipeThrough(new DecompressionStream("gzip"));
+ const decompressed = response.body.pipeThrough(
+ new DecompressionStream("gzip")
+ );
  const arrayBuffer = await new Response(decompressed).arrayBuffer();
  const dir = path.dirname(destPath);
  if (!existsSync(dir)) {
@@ -819,8 +956,9 @@ async function downloadWordNet(version, destPath) {
  }
  function createParser(filePath) {
  const resolvedPath = path.resolve(filePath);
- const fileUrl = resolvedPath.startsWith("/") ? `file://${resolvedPath}` : `file:///${resolvedPath.replace(/\\/g, "/")}`;
- return parse(fileUrl, {
+ const nodeStream = createReadStream(resolvedPath);
+ const webStream = Readable.toWeb(nodeStream);
+ return parse(webStream, {
  ignoreDeclaration: false,
  silent: false
  });
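
The second loader change: createParser no longer builds a file:// URL (including the Windows backslash handling removed above) but opens the file with createReadStream and passes the parser a web ReadableStream via Readable.toWeb. The return value is still the bundled xml-streamify async generator, so consumption does not change. A caller sketch, assuming the generator yields element nodes as they are parsed (the shape the *Node helpers earlier in the bundle rely on); the path is illustrative:

import { createParser } from "synset";

const parser = createParser("./cache/english-wordnet-2024.xml");
for await (const node of parser) {
  // Each yielded node exposes .type, .attributes, .children and .innerText,
  // the same shape the helper functions above consume.
  if (node.type === "LexicalEntry") {
    // inspect or transform the entry here
  }
}
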
@@ -1005,6 +1143,7 @@ export {
  PartsOfSpeech,
  PartsOfSpeech2 as PartsOfSpeechLabels,
  Pronunciation,
+ SCHEMA,
  Sense,
  SenseId,
  SenseRelation,
@@ -1024,6 +1163,7 @@ export {
  createParser,
  decodeXmlEntities,
  ensureWordNetCached,
+ exportToSQLite,
  fetchWordNet,
  findLatestVersion,
  findSenses,