synset 0.9.4 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -1
- package/dist/cli.cjs +128 -15
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +128 -15
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +128 -15
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +128 -15
- package/dist/index.js.map +1 -1
- package/dist/schema.sql +19 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -88,15 +88,44 @@ import { exportToSQLite } from 'synset'
|
|
|
88
88
|
// Export to SQLite
|
|
89
89
|
exportToSQLite(lexicon, 'dictionary.db', {
|
|
90
90
|
onProgress: ({ phase, current, total }) => {
|
|
91
|
+
// phases: words, synsets, word_synsets, synset_relations, sense_relations
|
|
91
92
|
console.log(`${phase}: ${current}/${total}`)
|
|
92
93
|
}
|
|
93
94
|
})
|
|
94
95
|
```
|
|
95
96
|
|
|
96
|
-
Schema
|
|
97
|
+
Schema is available as:
|
|
97
98
|
- `import { SCHEMA } from 'synset'` - SQL string constant
|
|
98
99
|
- `synset/schema.sql` - standalone file via package exports
|
|
99
100
|
|
|
101
|
+
Tables:
|
|
102
|
+
- `words` - unique words with display form
|
|
103
|
+
- `synsets` - definitions with part of speech
|
|
104
|
+
- `word_synsets` - word → synset mappings
|
|
105
|
+
- `synset_relations` - hypernym, hyponym, meronym, etc. links between synsets
|
|
106
|
+
- `sense_relations` - antonym, derivation, pertainym, etc. links between word senses
|
|
107
|
+
|
|
108
|
+
Example queries:
|
|
109
|
+
```sql
|
|
110
|
+
-- Hypernyms via synset relations (dog → canine, domestic animal)
|
|
111
|
+
SELECT w2.word_display, s2.definition
|
|
112
|
+
FROM words w
|
|
113
|
+
JOIN word_synsets ws ON w.id = ws.word_id
|
|
114
|
+
JOIN synset_relations sr ON ws.synset_id = sr.source_id
|
|
115
|
+
JOIN synsets s2 ON sr.target_id = s2.id
|
|
116
|
+
JOIN word_synsets ws2 ON s2.id = ws2.synset_id
|
|
117
|
+
JOIN words w2 ON ws2.word_id = w2.id
|
|
118
|
+
WHERE w.word = 'dog' AND sr.rel_type = 'hypernym';
|
|
119
|
+
|
|
120
|
+
-- Antonyms via sense relations (happy → unhappy)
|
|
121
|
+
SELECT w2.word_display, s2.definition
|
|
122
|
+
FROM words w
|
|
123
|
+
JOIN sense_relations sr ON w.id = sr.source_word_id
|
|
124
|
+
JOIN words w2 ON sr.target_word_id = w2.id
|
|
125
|
+
JOIN synsets s2 ON sr.target_synset_id = s2.id
|
|
126
|
+
WHERE w.word = 'happy' AND sr.rel_type = 'antonym';
|
|
127
|
+
```
|
|
128
|
+
|
|
100
129
|
## Runtime
|
|
101
130
|
|
|
102
131
|
- **Bun**: Full support
|
package/dist/cli.cjs
CHANGED
|
@@ -26,6 +26,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
26
26
|
// src/export-sqlite.ts
|
|
27
27
|
var import_libsql = __toESM(require("libsql"), 1);
|
|
28
28
|
|
|
29
|
+
// src/helpers.ts
|
|
30
|
+
var import_entities = require("entities");
|
|
31
|
+
|
|
29
32
|
// src/types.ts
|
|
30
33
|
var import_zod = require("zod");
|
|
31
34
|
var LexiconId = import_zod.z.string();
|
|
@@ -327,10 +330,7 @@ function LexiconNode(node) {
|
|
|
327
330
|
};
|
|
328
331
|
return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
329
332
|
}
|
|
330
|
-
var decodeXmlEntities = (s) => {
|
|
331
|
-
if (s === void 0) return void 0;
|
|
332
|
-
return s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&apos;/g, "'").replace(/&quot;/g, '"');
|
|
333
|
-
};
|
|
333
|
+
var decodeXmlEntities = (s) => s === void 0 ? void 0 : (0, import_entities.decodeXML)(s);
|
|
334
334
|
var attr = (node, attrName) => {
|
|
335
335
|
const value = decodeXmlEntities(node.attributes[attrName]);
|
|
336
336
|
if (value === void 0) {
|
|
@@ -378,6 +378,25 @@ CREATE TABLE IF NOT EXISTS word_synsets (
|
|
|
378
378
|
PRIMARY KEY (word_id, synset_id)
|
|
379
379
|
);
|
|
380
380
|
CREATE INDEX IF NOT EXISTS idx_ws_word ON word_synsets(word_id);
|
|
381
|
+
|
|
382
|
+
CREATE TABLE IF NOT EXISTS synset_relations (
|
|
383
|
+
source_id TEXT NOT NULL,
|
|
384
|
+
target_id TEXT NOT NULL,
|
|
385
|
+
rel_type TEXT NOT NULL,
|
|
386
|
+
PRIMARY KEY (source_id, target_id, rel_type)
|
|
387
|
+
);
|
|
388
|
+
CREATE INDEX IF NOT EXISTS idx_sr_source ON synset_relations(source_id);
|
|
389
|
+
CREATE INDEX IF NOT EXISTS idx_sr_target ON synset_relations(target_id);
|
|
390
|
+
|
|
391
|
+
CREATE TABLE IF NOT EXISTS sense_relations (
|
|
392
|
+
source_word_id INTEGER NOT NULL,
|
|
393
|
+
source_synset_id TEXT NOT NULL,
|
|
394
|
+
target_word_id INTEGER NOT NULL,
|
|
395
|
+
target_synset_id TEXT NOT NULL,
|
|
396
|
+
rel_type TEXT NOT NULL,
|
|
397
|
+
PRIMARY KEY (source_word_id, source_synset_id, target_word_id, target_synset_id, rel_type)
|
|
398
|
+
);
|
|
399
|
+
CREATE INDEX IF NOT EXISTS idx_sense_rel_source ON sense_relations(source_word_id, source_synset_id);
|
|
381
400
|
`;
|
|
382
401
|
function exportToSQLite(lexicon, outputPath, options = {}) {
|
|
383
402
|
const { onProgress } = options;
|
|
@@ -446,33 +465,127 @@ function exportToSQLite(lexicon, outputPath, options = {}) {
|
|
|
446
465
|
}
|
|
447
466
|
}
|
|
448
467
|
db.exec("COMMIT");
|
|
449
|
-
const
|
|
468
|
+
const insertWordSynset = db.prepare(
|
|
450
469
|
"INSERT OR IGNORE INTO word_synsets (word_id, synset_id) VALUES (?, ?)"
|
|
451
470
|
);
|
|
452
|
-
let
|
|
453
|
-
const
|
|
471
|
+
let wsCount = 0;
|
|
472
|
+
const totalWordSynsets = Array.from(wordToEntries.values()).reduce(
|
|
454
473
|
(sum, entries) => sum + entries.reduce((s, e) => s + e.senses.length, 0),
|
|
455
474
|
0
|
|
456
475
|
);
|
|
457
476
|
db.exec("BEGIN TRANSACTION");
|
|
458
477
|
for (const [word, entries] of wordToEntries) {
|
|
459
|
-
const
|
|
460
|
-
if (!
|
|
478
|
+
const wId = wordIds.get(word);
|
|
479
|
+
if (!wId) continue;
|
|
461
480
|
for (const entry of entries) {
|
|
462
481
|
for (const sense of entry.senses) {
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
if (onProgress &&
|
|
482
|
+
insertWordSynset.run(wId, sense.synset);
|
|
483
|
+
wsCount++;
|
|
484
|
+
if (onProgress && wsCount % 1e4 === 0) {
|
|
466
485
|
onProgress({
|
|
467
|
-
phase: "
|
|
468
|
-
current:
|
|
469
|
-
total:
|
|
486
|
+
phase: "word_synsets",
|
|
487
|
+
current: wsCount,
|
|
488
|
+
total: totalWordSynsets
|
|
470
489
|
});
|
|
471
490
|
}
|
|
472
491
|
}
|
|
473
492
|
}
|
|
474
493
|
}
|
|
475
494
|
db.exec("COMMIT");
|
|
495
|
+
const insertSynsetRelation = db.prepare(
|
|
496
|
+
"INSERT OR IGNORE INTO synset_relations (source_id, target_id, rel_type) VALUES (?, ?, ?)"
|
|
497
|
+
);
|
|
498
|
+
let totalSynsetRelations = 0;
|
|
499
|
+
for (const synsetId of usedSynsetIds) {
|
|
500
|
+
const synset = synsetMap.get(synsetId);
|
|
501
|
+
if (synset) {
|
|
502
|
+
for (const rel of synset.synsetRelations) {
|
|
503
|
+
if (usedSynsetIds.has(rel.target)) {
|
|
504
|
+
totalSynsetRelations++;
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
db.exec("BEGIN TRANSACTION");
|
|
510
|
+
let srCount = 0;
|
|
511
|
+
for (const synsetId of usedSynsetIds) {
|
|
512
|
+
const synset = synsetMap.get(synsetId);
|
|
513
|
+
if (!synset) continue;
|
|
514
|
+
for (const rel of synset.synsetRelations) {
|
|
515
|
+
if (usedSynsetIds.has(rel.target)) {
|
|
516
|
+
insertSynsetRelation.run(synsetId, rel.target, rel.relType);
|
|
517
|
+
srCount++;
|
|
518
|
+
if (onProgress && srCount % 1e4 === 0) {
|
|
519
|
+
onProgress({
|
|
520
|
+
phase: "synset_relations",
|
|
521
|
+
current: srCount,
|
|
522
|
+
total: totalSynsetRelations
|
|
523
|
+
});
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
db.exec("COMMIT");
|
|
529
|
+
const senseToWordSynset = /* @__PURE__ */ new Map();
|
|
530
|
+
for (const [word, entries] of wordToEntries) {
|
|
531
|
+
const wId = wordIds.get(word);
|
|
532
|
+
if (!wId) continue;
|
|
533
|
+
for (const entry of entries) {
|
|
534
|
+
for (const sense of entry.senses) {
|
|
535
|
+
senseToWordSynset.set(sense.id, {
|
|
536
|
+
wordId: wId,
|
|
537
|
+
synsetId: sense.synset
|
|
538
|
+
});
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
const insertSenseRelation = db.prepare(
|
|
543
|
+
"INSERT OR IGNORE INTO sense_relations (source_word_id, source_synset_id, target_word_id, target_synset_id, rel_type) VALUES (?, ?, ?, ?, ?)"
|
|
544
|
+
);
|
|
545
|
+
let totalSenseRelations = 0;
|
|
546
|
+
for (const entries of wordToEntries.values()) {
|
|
547
|
+
for (const entry of entries) {
|
|
548
|
+
for (const sense of entry.senses) {
|
|
549
|
+
for (const rel of sense.senseRelations) {
|
|
550
|
+
if (senseToWordSynset.has(rel.target)) {
|
|
551
|
+
totalSenseRelations++;
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
db.exec("BEGIN TRANSACTION");
|
|
558
|
+
let senseRelCount = 0;
|
|
559
|
+
for (const [word, entries] of wordToEntries) {
|
|
560
|
+
const sourceWordId = wordIds.get(word);
|
|
561
|
+
if (!sourceWordId) continue;
|
|
562
|
+
for (const entry of entries) {
|
|
563
|
+
for (const sense of entry.senses) {
|
|
564
|
+
const sourceSynsetId = sense.synset;
|
|
565
|
+
for (const rel of sense.senseRelations) {
|
|
566
|
+
const target = senseToWordSynset.get(rel.target);
|
|
567
|
+
if (target) {
|
|
568
|
+
insertSenseRelation.run(
|
|
569
|
+
sourceWordId,
|
|
570
|
+
sourceSynsetId,
|
|
571
|
+
target.wordId,
|
|
572
|
+
target.synsetId,
|
|
573
|
+
rel.relType
|
|
574
|
+
);
|
|
575
|
+
senseRelCount++;
|
|
576
|
+
if (onProgress && senseRelCount % 1e4 === 0) {
|
|
577
|
+
onProgress({
|
|
578
|
+
phase: "sense_relations",
|
|
579
|
+
current: senseRelCount,
|
|
580
|
+
total: totalSenseRelations
|
|
581
|
+
});
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
db.exec("COMMIT");
|
|
476
589
|
db.close();
|
|
477
590
|
}
|
|
478
591
|
|