synset 0.9.5 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -3
- package/dist/cli.cjs +97 -14
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +92 -9
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +94 -14
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -2
- package/dist/index.d.ts +4 -2
- package/dist/index.js +89 -9
- package/dist/index.js.map +1 -1
- package/dist/schema.sql +10 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -88,7 +88,7 @@ import { exportToSQLite } from 'synset'
|
|
|
88
88
|
// Export to SQLite
|
|
89
89
|
exportToSQLite(lexicon, 'dictionary.db', {
|
|
90
90
|
onProgress: ({ phase, current, total }) => {
|
|
91
|
-
// phases: words, synsets, word_synsets, synset_relations
|
|
91
|
+
// phases: words, synsets, word_synsets, synset_relations, sense_relations
|
|
92
92
|
console.log(`${phase}: ${current}/${total}`)
|
|
93
93
|
}
|
|
94
94
|
})
|
|
@@ -103,9 +103,11 @@ Tables:
|
|
|
103
103
|
- `synsets` - definitions with part of speech
|
|
104
104
|
- `word_synsets` - word → synset mappings
|
|
105
105
|
- `synset_relations` - hypernym, hyponym, meronym, etc. links between synsets
|
|
106
|
+
- `sense_relations` - antonym, derivation, pertainym, etc. links between word senses
|
|
106
107
|
|
|
107
|
-
Example
|
|
108
|
+
Example queries:
|
|
108
109
|
```sql
|
|
110
|
+
-- Hypernyms via synset relations (dog → canine, domestic animal)
|
|
109
111
|
SELECT w2.word_display, s2.definition
|
|
110
112
|
FROM words w
|
|
111
113
|
JOIN word_synsets ws ON w.id = ws.word_id
|
|
@@ -114,7 +116,14 @@ JOIN synsets s2 ON sr.target_id = s2.id
|
|
|
114
116
|
JOIN word_synsets ws2 ON s2.id = ws2.synset_id
|
|
115
117
|
JOIN words w2 ON ws2.word_id = w2.id
|
|
116
118
|
WHERE w.word = 'dog' AND sr.rel_type = 'hypernym';
|
|
117
|
-
|
|
119
|
+
|
|
120
|
+
-- Antonyms via sense relations (happy → unhappy)
|
|
121
|
+
SELECT w2.word_display, s2.definition
|
|
122
|
+
FROM words w
|
|
123
|
+
JOIN sense_relations sr ON w.id = sr.source_word_id
|
|
124
|
+
JOIN words w2 ON sr.target_word_id = w2.id
|
|
125
|
+
JOIN synsets s2 ON sr.target_synset_id = s2.id
|
|
126
|
+
WHERE w.word = 'happy' AND sr.rel_type = 'antonym';
|
|
118
127
|
```
|
|
119
128
|
|
|
120
129
|
## Runtime
|
package/dist/cli.cjs
CHANGED
|
@@ -24,8 +24,12 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
24
24
|
));
|
|
25
25
|
|
|
26
26
|
// src/export-sqlite.ts
|
|
27
|
+
var import_node_fs = require("fs");
|
|
27
28
|
var import_libsql = __toESM(require("libsql"), 1);
|
|
28
29
|
|
|
30
|
+
// src/helpers.ts
|
|
31
|
+
var import_entities = require("entities");
|
|
32
|
+
|
|
29
33
|
// src/types.ts
|
|
30
34
|
var import_zod = require("zod");
|
|
31
35
|
var LexiconId = import_zod.z.string();
|
|
@@ -327,10 +331,7 @@ function LexiconNode(node) {
|
|
|
327
331
|
};
|
|
328
332
|
return Lexicon.parse(extendWithRestAttr(node, obj, (s) => s));
|
|
329
333
|
}
|
|
330
|
-
var decodeXmlEntities = (s) => {
|
|
331
|
-
if (s === void 0) return void 0;
|
|
332
|
-
return s.replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&apos;/g, "'").replace(/&quot;/g, '"');
|
|
333
|
-
};
|
|
334
|
+
var decodeXmlEntities = (s) => s === void 0 ? void 0 : (0, import_entities.decodeXML)(s);
|
|
334
335
|
var attr = (node, attrName) => {
|
|
335
336
|
const value = decodeXmlEntities(node.attributes[attrName]);
|
|
336
337
|
if (value === void 0) {
|
|
@@ -387,9 +388,28 @@ CREATE TABLE IF NOT EXISTS synset_relations (
|
|
|
387
388
|
);
|
|
388
389
|
CREATE INDEX IF NOT EXISTS idx_sr_source ON synset_relations(source_id);
|
|
389
390
|
CREATE INDEX IF NOT EXISTS idx_sr_target ON synset_relations(target_id);
|
|
391
|
+
|
|
392
|
+
CREATE TABLE IF NOT EXISTS sense_relations (
|
|
393
|
+
source_word_id INTEGER NOT NULL,
|
|
394
|
+
source_synset_id TEXT NOT NULL,
|
|
395
|
+
target_word_id INTEGER NOT NULL,
|
|
396
|
+
target_synset_id TEXT NOT NULL,
|
|
397
|
+
rel_type TEXT NOT NULL,
|
|
398
|
+
PRIMARY KEY (source_word_id, source_synset_id, target_word_id, target_synset_id, rel_type)
|
|
399
|
+
);
|
|
400
|
+
CREATE INDEX IF NOT EXISTS idx_sense_rel_source ON sense_relations(source_word_id, source_synset_id);
|
|
390
401
|
`;
|
|
391
402
|
function exportToSQLite(lexicon, outputPath, options = {}) {
|
|
392
|
-
const { onProgress } = options;
|
|
403
|
+
const { onProgress, overwrite } = options;
|
|
404
|
+
if ((0, import_node_fs.existsSync)(outputPath)) {
|
|
405
|
+
if (overwrite) {
|
|
406
|
+
(0, import_node_fs.unlinkSync)(outputPath);
|
|
407
|
+
} else {
|
|
408
|
+
throw new Error(
|
|
409
|
+
`File already exists: ${outputPath}. Use --overwrite to replace it.`
|
|
410
|
+
);
|
|
411
|
+
}
|
|
412
|
+
}
|
|
393
413
|
const db = new import_libsql.default(outputPath);
|
|
394
414
|
db.exec("PRAGMA journal_mode = OFF");
|
|
395
415
|
db.exec("PRAGMA synchronous = OFF");
|
|
@@ -516,6 +536,66 @@ function exportToSQLite(lexicon, outputPath, options = {}) {
|
|
|
516
536
|
}
|
|
517
537
|
}
|
|
518
538
|
db.exec("COMMIT");
|
|
539
|
+
const senseToWordSynset = /* @__PURE__ */ new Map();
|
|
540
|
+
for (const [word, entries] of wordToEntries) {
|
|
541
|
+
const wId = wordIds.get(word);
|
|
542
|
+
if (!wId) continue;
|
|
543
|
+
for (const entry of entries) {
|
|
544
|
+
for (const sense of entry.senses) {
|
|
545
|
+
senseToWordSynset.set(sense.id, {
|
|
546
|
+
wordId: wId,
|
|
547
|
+
synsetId: sense.synset
|
|
548
|
+
});
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
const insertSenseRelation = db.prepare(
|
|
553
|
+
"INSERT OR IGNORE INTO sense_relations (source_word_id, source_synset_id, target_word_id, target_synset_id, rel_type) VALUES (?, ?, ?, ?, ?)"
|
|
554
|
+
);
|
|
555
|
+
let totalSenseRelations = 0;
|
|
556
|
+
for (const entries of wordToEntries.values()) {
|
|
557
|
+
for (const entry of entries) {
|
|
558
|
+
for (const sense of entry.senses) {
|
|
559
|
+
for (const rel of sense.senseRelations) {
|
|
560
|
+
if (senseToWordSynset.has(rel.target)) {
|
|
561
|
+
totalSenseRelations++;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
db.exec("BEGIN TRANSACTION");
|
|
568
|
+
let senseRelCount = 0;
|
|
569
|
+
for (const [word, entries] of wordToEntries) {
|
|
570
|
+
const sourceWordId = wordIds.get(word);
|
|
571
|
+
if (!sourceWordId) continue;
|
|
572
|
+
for (const entry of entries) {
|
|
573
|
+
for (const sense of entry.senses) {
|
|
574
|
+
const sourceSynsetId = sense.synset;
|
|
575
|
+
for (const rel of sense.senseRelations) {
|
|
576
|
+
const target = senseToWordSynset.get(rel.target);
|
|
577
|
+
if (target) {
|
|
578
|
+
insertSenseRelation.run(
|
|
579
|
+
sourceWordId,
|
|
580
|
+
sourceSynsetId,
|
|
581
|
+
target.wordId,
|
|
582
|
+
target.synsetId,
|
|
583
|
+
rel.relType
|
|
584
|
+
);
|
|
585
|
+
senseRelCount++;
|
|
586
|
+
if (onProgress && senseRelCount % 1e4 === 0) {
|
|
587
|
+
onProgress({
|
|
588
|
+
phase: "sense_relations",
|
|
589
|
+
current: senseRelCount,
|
|
590
|
+
total: totalSenseRelations
|
|
591
|
+
});
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
}
|
|
598
|
+
db.exec("COMMIT");
|
|
519
599
|
db.close();
|
|
520
600
|
}
|
|
521
601
|
|
|
@@ -631,7 +711,7 @@ var SynsetRelationRelType2 = {
|
|
|
631
711
|
};
|
|
632
712
|
|
|
633
713
|
// src/loader.ts
|
|
634
|
-
var import_node_fs = require("fs");
|
|
714
|
+
var import_node_fs2 = require("fs");
|
|
635
715
|
var import_node_path = __toESM(require("path"), 1);
|
|
636
716
|
var import_node_stream = require("stream");
|
|
637
717
|
|
|
@@ -890,8 +970,8 @@ function getDefaultCacheDir() {
|
|
|
890
970
|
return import_node_path.default.join(homeDir, ".cache", "synset");
|
|
891
971
|
}
|
|
892
972
|
function fileExists(filePath) {
|
|
893
|
-
if ((0, import_node_fs.existsSync)(filePath)) {
|
|
894
|
-
const stat = (0, import_node_fs.statSync)(filePath);
|
|
973
|
+
if ((0, import_node_fs2.existsSync)(filePath)) {
|
|
974
|
+
const stat = (0, import_node_fs2.statSync)(filePath);
|
|
895
975
|
return stat.isFile();
|
|
896
976
|
}
|
|
897
977
|
return false;
|
|
@@ -909,8 +989,8 @@ function extractVersionFromFilename(filename) {
|
|
|
909
989
|
return match ? parseInt(match[1], 10) : null;
|
|
910
990
|
}
|
|
911
991
|
function findCachedVersion(cacheDir) {
|
|
912
|
-
if (!(0, import_node_fs.existsSync)(cacheDir)) return null;
|
|
913
|
-
const files = (0, import_node_fs.readdirSync)(cacheDir);
|
|
992
|
+
if (!(0, import_node_fs2.existsSync)(cacheDir)) return null;
|
|
993
|
+
const files = (0, import_node_fs2.readdirSync)(cacheDir);
|
|
914
994
|
const wordnetFiles = files.map((f) => ({ file: f, year: extractVersionFromFilename(f) })).filter((x) => x.year !== null).sort((a, b) => b.year - a.year);
|
|
915
995
|
return wordnetFiles.length > 0 ? wordnetFiles[0].year.toString() : null;
|
|
916
996
|
}
|
|
@@ -971,14 +1051,14 @@ async function downloadWordNet(version, destPath) {
|
|
|
971
1051
|
);
|
|
972
1052
|
const arrayBuffer = await new Response(decompressed).arrayBuffer();
|
|
973
1053
|
const dir = import_node_path.default.dirname(destPath);
|
|
974
|
-
if (!(0, import_node_fs.existsSync)(dir)) {
|
|
975
|
-
(0, import_node_fs.mkdirSync)(dir, { recursive: true });
|
|
1054
|
+
if (!(0, import_node_fs2.existsSync)(dir)) {
|
|
1055
|
+
(0, import_node_fs2.mkdirSync)(dir, { recursive: true });
|
|
976
1056
|
}
|
|
977
|
-
(0, import_node_fs.writeFileSync)(destPath, Buffer.from(arrayBuffer));
|
|
1057
|
+
(0, import_node_fs2.writeFileSync)(destPath, Buffer.from(arrayBuffer));
|
|
978
1058
|
}
|
|
979
1059
|
function createParser(filePath) {
|
|
980
1060
|
const resolvedPath = import_node_path.default.resolve(filePath);
|
|
981
|
-
const nodeStream = (0, import_node_fs.createReadStream)(resolvedPath);
|
|
1061
|
+
const nodeStream = (0, import_node_fs2.createReadStream)(resolvedPath);
|
|
982
1062
|
const webStream = import_node_stream.Readable.toWeb(nodeStream);
|
|
983
1063
|
return parse(webStream, {
|
|
984
1064
|
ignoreDeclaration: false,
|
|
@@ -1153,6 +1233,7 @@ Commands:
|
|
|
1153
1233
|
|
|
1154
1234
|
Options:
|
|
1155
1235
|
--file <path> Use a local WordNet XML file instead of cache
|
|
1236
|
+
--overwrite Overwrite existing file (for export-sqlite)
|
|
1156
1237
|
--help, -h Show this help message
|
|
1157
1238
|
|
|
1158
1239
|
Examples:
|
|
@@ -1188,10 +1269,12 @@ async function main() {
|
|
|
1188
1269
|
console.error("Error: Missing output path for export-sqlite");
|
|
1189
1270
|
process.exit(1);
|
|
1190
1271
|
}
|
|
1272
|
+
const overwrite = args.includes("--overwrite");
|
|
1191
1273
|
console.log("Loading WordNet data...");
|
|
1192
1274
|
const lexicon2 = filePath ? await loadWordNet(filePath) : (await fetchWordNet({ onProgress: console.log })).lexicon;
|
|
1193
1275
|
console.log(`Exporting to ${outputPath}...`);
|
|
1194
1276
|
exportToSQLite(lexicon2, outputPath, {
|
|
1277
|
+
overwrite,
|
|
1195
1278
|
onProgress: ({ phase, current, total }) => {
|
|
1196
1279
|
process.stdout.write(`\r${phase}: ${current}/${total}`);
|
|
1197
1280
|
}
|