glossarist 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +189 -0
- package/package.json +69 -0
- package/src/concept-reader.d.ts +13 -0
- package/src/concept-reader.js +94 -0
- package/src/errors.d.ts +11 -0
- package/src/errors.js +42 -0
- package/src/gcr-reader.d.ts +77 -0
- package/src/gcr-reader.js +292 -0
- package/src/index.d.ts +4 -0
- package/src/index.js +3 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Glossarist Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# glossarist-js
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/glossarist/glossarist-js/actions/workflows/ci.yml/badge.svg)](https://github.com/glossarist/glossarist-js/actions/workflows/ci.yml)
|
|
4
|
+
[![npm version](https://img.shields.io/npm/v/glossarist.svg)](https://www.npmjs.com/package/glossarist)
|
|
5
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
JavaScript library for reading [Glossarist](https://github.com/glossarist) GCR packages (ZIP archives) and v2 glossarist concept data (YAML files). Works in Node.js and browsers.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install glossarist
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Requires Node.js 18+.
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
### Reading a GCR package
|
|
20
|
+
|
|
21
|
+
```js
|
|
22
|
+
import { loadGcr } from 'glossarist';
|
|
23
|
+
import fs from 'fs';
|
|
24
|
+
|
|
25
|
+
const buf = fs.readFileSync('my-dataset.gcr');
|
|
26
|
+
const pkg = await loadGcr(buf);
|
|
27
|
+
|
|
28
|
+
// Metadata
|
|
29
|
+
const meta = await pkg.metadata();
|
|
30
|
+
console.log(meta.shortname, meta.version, meta.concept_count);
|
|
31
|
+
|
|
32
|
+
// List concept IDs
|
|
33
|
+
const ids = await pkg.conceptIds();
|
|
34
|
+
|
|
35
|
+
// Read a specific concept
|
|
36
|
+
const concept = await pkg.concept('3.1.1.1');
|
|
37
|
+
console.log(concept.termid);
|
|
38
|
+
console.log(concept.localizations.eng.terms[0].designation);
|
|
39
|
+
|
|
40
|
+
// Iterate all concepts (streaming)
|
|
41
|
+
await pkg.eachConcept((concept) => {
|
|
42
|
+
console.log(concept.termid);
|
|
43
|
+
});
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`loadGcr` accepts `Buffer`, `ArrayBuffer`, `Uint8Array`, `Blob`, or a base64-encoded string.
|
|
47
|
+
|
|
48
|
+
### Reading concept YAML files from a directory
|
|
49
|
+
|
|
50
|
+
```js
|
|
51
|
+
import { readConcepts, readConcept, listConceptIds } from 'glossarist';
|
|
52
|
+
|
|
53
|
+
// Read all concepts
|
|
54
|
+
const concepts = readConcepts('./geolexica-v2/');
|
|
55
|
+
console.log(`Loaded ${concepts.length} concepts`);
|
|
56
|
+
|
|
57
|
+
// Read a single concept by ID
|
|
58
|
+
const concept = readConcept('./geolexica-v2/', '3.1.1.1');
|
|
59
|
+
|
|
60
|
+
// List IDs with optional prefix filter
|
|
61
|
+
const ids = listConceptIds('./geolexica-v2/', '3.1.');
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Browser usage
|
|
65
|
+
|
|
66
|
+
The GCR reader works in browsers via jszip. The concept directory reader requires Node.js `fs`.
|
|
67
|
+
|
|
68
|
+
```html
|
|
69
|
+
<script type="module">
|
|
70
|
+
import { loadGcr } from 'glossarist/gcr';
|
|
71
|
+
|
|
72
|
+
const response = await fetch('/datasets/isotc204.gcr');
|
|
73
|
+
const buf = await response.arrayBuffer();
|
|
74
|
+
const pkg = await loadGcr(buf);
|
|
75
|
+
const meta = await pkg.metadata();
|
|
76
|
+
</script>
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Concept format
|
|
80
|
+
|
|
81
|
+
Glossarist-js normalizes both storage formats into a consistent structure:
|
|
82
|
+
|
|
83
|
+
```js
|
|
84
|
+
{
|
|
85
|
+
termid: '3.1.1.1', // concept identifier
|
|
86
|
+
term: 'entity', // primary term (canonical format only)
|
|
87
|
+
localizations: {
|
|
88
|
+
eng: {
|
|
89
|
+
terms: [{ type: 'expression', designation: 'entity', normative_status: 'preferred' }],
|
|
90
|
+
definition: [{ content: 'concrete or abstract thing...' }],
|
|
91
|
+
notes: [],
|
|
92
|
+
examples: [],
|
|
93
|
+
sources: [{ type: 'authoritative', origin: { ref: 'ISO/TS 14812:2022' } }],
|
|
94
|
+
entry_status: 'valid',
|
|
95
|
+
},
|
|
96
|
+
fra: { ... },
|
|
97
|
+
},
|
|
98
|
+
raw: { ... }, // original parsed YAML
|
|
99
|
+
}
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Language codes are discovered dynamically from the YAML keys — any ISO 639-3 code works without code changes.
|
|
103
|
+
|
|
104
|
+
### Supported formats
|
|
105
|
+
|
|
106
|
+
| Format | Structure | Used by |
|
|
107
|
+
|--------|-----------|---------|
|
|
108
|
+
| **Canonical** | Single YAML document with `termid` and language keys (`eng:`, `fra:`) | IEV (iec-electropedia) |
|
|
109
|
+
| **Managed concept** | Multi-document YAML: first doc has `data.identifier` + `data.localized_concepts`, subsequent docs have `data.language_code` | isotc204, isotc211, osgeo |
|
|
110
|
+
|
|
111
|
+
## Error handling
|
|
112
|
+
|
|
113
|
+
All public functions validate inputs and throw descriptive errors with context:
|
|
114
|
+
|
|
115
|
+
```js
|
|
116
|
+
import { InvalidInputError, YamlParseError } from 'glossarist';
|
|
117
|
+
|
|
118
|
+
try {
|
|
119
|
+
await pkg.concept('3.1.1.1');
|
|
120
|
+
} catch (err) {
|
|
121
|
+
if (err instanceof YamlParseError) {
|
|
122
|
+
// err.message: "Failed to parse YAML for 3.1.1.1: ..."
|
|
123
|
+
// err.cause: the original YAML parse error
|
|
124
|
+
} else if (err instanceof InvalidInputError) {
|
|
125
|
+
// Invalid input (null, empty string, wrong type)
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Errors include the concept ID or filename in their message, making it easy to locate failures in large datasets.
|
|
131
|
+
|
|
132
|
+
- **`GlossaristError`** — base class for all library errors
|
|
133
|
+
- **`InvalidInputError`** — null, undefined, empty, or wrong-type arguments
|
|
134
|
+
- **`YamlParseError`** — malformed YAML with `cause` chaining the original error
|
|
135
|
+
|
|
136
|
+
## TypeScript
|
|
137
|
+
|
|
138
|
+
TypeScript declarations are included. No `@types/` package needed.
|
|
139
|
+
|
|
140
|
+
```ts
|
|
141
|
+
import { loadGcr, readConcepts, type Concept, type GcrMetadata } from 'glossarist';
|
|
142
|
+
|
|
143
|
+
const pkg = await loadGcr(buffer);
|
|
144
|
+
const meta: GcrMetadata | null = await pkg.metadata();
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## API
|
|
148
|
+
|
|
149
|
+
### GCR Package (`glossarist/gcr`)
|
|
150
|
+
|
|
151
|
+
- `loadGcr(input)` — Load a GCR ZIP from Buffer/ArrayBuffer/Uint8Array/Blob/base64 string. Returns `GcrPackage`.
|
|
152
|
+
- `GcrPackage#metadata()` — Parse `metadata.yaml`.
|
|
153
|
+
- `GcrPackage#register()` — Parse optional `register.yaml`.
|
|
154
|
+
- `GcrPackage#conceptIds()` — Array of concept IDs (natural-sorted).
|
|
155
|
+
- `GcrPackage#concept(id)` — Read and normalize a single concept.
|
|
156
|
+
- `GcrPackage#eachConcept(callback)` — Stream all concepts.
|
|
157
|
+
- `GcrPackage#allConcepts()` — Load all concepts into an array.
|
|
158
|
+
- `parseConceptYaml(raw, context?)` — Parse raw YAML string into normalized concept object. `context` is an optional concept ID or filename for error messages.
|
|
159
|
+
- `naturalSort(a, b)` — Natural sort comparator for concept IDs.
|
|
160
|
+
|
|
161
|
+
### Concept Directory Reader (`glossarist/concept`)
|
|
162
|
+
|
|
163
|
+
Node.js only (uses `fs`).
|
|
164
|
+
|
|
165
|
+
- `readConcepts(dir)` — Read all concept YAML files from a directory.
|
|
166
|
+
- `readConcept(dir, id)` — Read a single concept by ID.
|
|
167
|
+
- `listConceptIds(dir, prefix?)` — List concept IDs, optionally filtered by prefix.
|
|
168
|
+
- `readRegister(dir)` — Read `register.yaml` if present.
|
|
169
|
+
|
|
170
|
+
### Errors
|
|
171
|
+
|
|
172
|
+
- `GlossaristError` — base error class
|
|
173
|
+
- `InvalidInputError` — bad input arguments
|
|
174
|
+
- `YamlParseError` — YAML parse failures (has `cause`, includes concept context)
|
|
175
|
+
|
|
176
|
+
## Development
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
npm install
|
|
180
|
+
npm test # regenerate fixtures + run all tests
|
|
181
|
+
npm run lint # lint src/ and test/
|
|
182
|
+
npm run test:coverage # run with coverage report
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
See [CONTRIBUTING.md](./CONTRIBUTING.md) for full guidelines.
|
|
186
|
+
|
|
187
|
+
## License
|
|
188
|
+
|
|
189
|
+
[MIT](./LICENSE)
|
package/package.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "glossarist",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "JavaScript library for reading Glossarist GCR packages and v2 concept data",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "src/index.js",
|
|
7
|
+
"types": "src/index.d.ts",
|
|
8
|
+
"sideEffects": false,
|
|
9
|
+
"files": [
|
|
10
|
+
"src"
|
|
11
|
+
],
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./src/index.d.ts",
|
|
15
|
+
"import": "./src/index.js",
|
|
16
|
+
"default": "./src/index.js"
|
|
17
|
+
},
|
|
18
|
+
"./gcr": {
|
|
19
|
+
"types": "./src/gcr-reader.d.ts",
|
|
20
|
+
"import": "./src/gcr-reader.js",
|
|
21
|
+
"default": "./src/gcr-reader.js"
|
|
22
|
+
},
|
|
23
|
+
"./concept": {
|
|
24
|
+
"types": "./src/concept-reader.d.ts",
|
|
25
|
+
"import": "./src/concept-reader.js",
|
|
26
|
+
"default": "./src/concept-reader.js"
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"scripts": {
|
|
30
|
+
"lint": "eslint src/ test/",
|
|
31
|
+
"pretest": "node test/fixtures/build-fixtures.js",
|
|
32
|
+
"test": "node --test test/*.test.js",
|
|
33
|
+
"test:verbose": "node --test --test-reporter spec test/*.test.js",
|
|
34
|
+
"test:coverage": "node --test --experimental-test-coverage test/*.test.js",
|
|
35
|
+
"prepublishOnly": "npm test"
|
|
36
|
+
},
|
|
37
|
+
"keywords": [
|
|
38
|
+
"glossarist",
|
|
39
|
+
"gcr",
|
|
40
|
+
"terminology",
|
|
41
|
+
"glossary",
|
|
42
|
+
"iso",
|
|
43
|
+
"geolexica"
|
|
44
|
+
],
|
|
45
|
+
"license": "MIT",
|
|
46
|
+
"repository": {
|
|
47
|
+
"type": "git",
|
|
48
|
+
"url": "git+https://github.com/glossarist/glossarist-js.git"
|
|
49
|
+
},
|
|
50
|
+
"homepage": "https://github.com/glossarist/glossarist-js#readme",
|
|
51
|
+
"bugs": {
|
|
52
|
+
"url": "https://github.com/glossarist/glossarist-js/issues"
|
|
53
|
+
},
|
|
54
|
+
"engines": {
|
|
55
|
+
"node": ">=18"
|
|
56
|
+
},
|
|
57
|
+
"publishConfig": {
|
|
58
|
+
"access": "public"
|
|
59
|
+
},
|
|
60
|
+
"dependencies": {
|
|
61
|
+
"js-yaml": "^4.1.0",
|
|
62
|
+
"jszip": "^3.10.1"
|
|
63
|
+
},
|
|
64
|
+
"devDependencies": {
|
|
65
|
+
"@eslint/js": "^10.0.1",
|
|
66
|
+
"eslint": "^10.3.0",
|
|
67
|
+
"globals": "^17.6.0"
|
|
68
|
+
}
|
|
69
|
+
}
|
package/src/concept-reader.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Concept } from './gcr-reader';
|
|
2
|
+
|
|
3
|
+
/**
 * Read all concept YAML files from a directory.
 *
 * @param dir - path to the directory containing concept YAML files
 * @returns the concepts that were read
 */
export function readConcepts(dir: string): Array<Concept>;

/**
 * Read a single concept by ID from a directory.
 *
 * @param dir - path to the directory containing concept YAML files
 * @param id - concept identifier
 * @returns the concept, or `null`
 */
export function readConcept(dir: string, id: string): null | Concept;

/**
 * List all concept IDs in a directory, optionally filtered by prefix.
 *
 * @param dir - path to the directory
 * @param prefix - optional prefix filter
 * @returns the matching concept IDs
 */
export function listConceptIds(dir: string, prefix?: string): Array<string>;

/**
 * Read register.yaml from a dataset directory (if present).
 *
 * @param dir - path to the dataset directory
 * @returns the parsed register, or `null`
 */
export function readRegister(dir: string): null | Record<string, unknown>;
|
package/src/concept-reader.js
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import yaml from 'js-yaml';
|
|
4
|
+
import { parseConceptYaml, naturalSort } from './gcr-reader.js';
|
|
5
|
+
import { InvalidInputError } from './errors.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Guard used by the public readers: accept only a non-blank string as the
 * directory argument.
 *
 * @private
 * @param {unknown} dir - value supplied by the caller as the directory path
 * @param {string} fnName - name of the public function, used in the error message
 * @throws {InvalidInputError} if dir is not a string or is blank
 */
function assertDir(dir, fnName) {
  const isUsablePath = typeof dir === 'string' && dir.trim() !== '';
  if (isUsablePath) return;
  throw new InvalidInputError(`${fnName} requires a directory path`, 'non-empty string');
}
|
|
12
|
+
|
|
13
|
+
/**
 * Load every v2 glossarist concept stored as a `.yaml` file in a directory.
 *
 * `register.yaml` is never treated as a concept. Files are processed in
 * natural-sort order of their names, and a parsed result is kept only when it
 * has a truthy `termid`.
 *
 * @param {string} dir - path to directory containing concept YAML files
 * @returns {import('./gcr-reader.js').Concept[]}
 * @throws {InvalidInputError} if dir is missing or empty
 *
 * @example
 * const concepts = readConcepts('./geolexica-v2/');
 * console.log(concepts[0].localizations.eng.terms[0].designation);
 */
export function readConcepts(dir) {
  assertDir(dir, 'readConcepts');

  const isConceptFile = (name) => name !== 'register.yaml' && name.endsWith('.yaml');
  const fileNames = fs.readdirSync(dir).filter(isConceptFile);
  fileNames.sort(naturalSort);

  return fileNames
    .map((name) => parseConceptYaml(fs.readFileSync(path.join(dir, name), 'utf8'), name))
    .filter((parsed) => parsed && parsed.termid);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Load one concept from `<dir>/<id>.yaml`.
 *
 * @param {string} dir - path to directory containing concept YAML files
 * @param {string} id - concept identifier (filename without .yaml)
 * @returns {import('./gcr-reader.js').Concept | null} the parsed concept, or
 *   null when no file with that name exists
 * @throws {InvalidInputError} if dir or id is missing or empty
 *
 * @example
 * const concept = readConcept('./geolexica-v2/', '3.1.1.1');
 * if (concept) console.log(concept.termid);
 */
export function readConcept(dir, id) {
  assertDir(dir, 'readConcept');

  const hasId = typeof id === 'string' && id.trim() !== '';
  if (!hasId) {
    throw new InvalidInputError('readConcept requires a concept ID', 'non-empty string');
  }

  const fileName = `${id}.yaml`;
  const location = path.join(dir, fileName);
  return fs.existsSync(location)
    ? parseConceptYaml(fs.readFileSync(location, 'utf8'), fileName)
    : null;
}
|
|
61
|
+
|
|
62
|
+
/**
 * Collect the concept IDs available in a directory.
 *
 * An ID is the name of a `.yaml` file with the extension removed;
 * `register.yaml` is excluded. When `prefix` is truthy only file names that
 * start with it are kept. The result is natural-sorted.
 *
 * @param {string} dir - path to directory
 * @param {string} [prefix] - optional prefix filter
 * @returns {string[]}
 * @throws {InvalidInputError} if dir is missing or empty
 *
 * @example
 * const ids = listConceptIds('./geolexica-v2/', '3.1.'); // ['3.1.1.1', '3.1.1.2', ...]
 */
export function listConceptIds(dir, prefix) {
  assertDir(dir, 'listConceptIds');

  const suffix = '.yaml';
  const ids = [];
  for (const name of fs.readdirSync(dir)) {
    if (!name.endsWith(suffix) || name === 'register.yaml') continue;
    if (prefix && !name.startsWith(prefix)) continue;
    ids.push(name.slice(0, -suffix.length));
  }
  return ids.sort(naturalSort);
}
|
|
82
|
+
|
|
83
|
+
/**
 * Load `<dir>/register.yaml` when the dataset directory has one.
 *
 * @param {string} dir - path to directory
 * @returns {Record<string, unknown> | null} the parsed YAML, or null when the
 *   file does not exist
 * @throws {InvalidInputError} if dir is missing or empty
 */
export function readRegister(dir) {
  assertDir(dir, 'readRegister');
  const registerPath = path.join(dir, 'register.yaml');
  return fs.existsSync(registerPath)
    ? yaml.load(fs.readFileSync(registerPath, 'utf8'))
    : null;
}
|
package/src/errors.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Base class for the errors declared in this module. */
export class GlossaristError extends Error {
  /**
   * @param message - human-readable description
   * @param options - optional bag carrying the underlying `cause`
   */
  constructor(message: string, options?: { cause?: Error });
}

/** Error for invalid input arguments. */
export class InvalidInputError extends GlossaristError {
  /**
   * @param what - description of the invalid input
   * @param expected - optional description of what was expected
   */
  constructor(what: string, expected?: string);
}

/** Error for YAML content that could not be parsed. */
export class YamlParseError extends GlossaristError {
  /**
   * @param context - what was being parsed
   * @param cause - the underlying error
   */
  constructor(context: string, cause: Error);
}
|
package/src/errors.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
 * Root of the glossarist error hierarchy; the other error classes in this
 * module extend it.
 */
export class GlossaristError extends Error {
  /** Error name reported on instances of this class. */
  name = 'GlossaristError';

  /**
   * @param {string} message
   * @param {{ cause?: Error }} [options] - forwarded unchanged to `Error`
   */
  constructor(message, options) {
    super(message, options);
  }
}
|
|
14
|
+
|
|
15
|
+
/**
 * Raised when a function is handed input it cannot work with (null,
 * undefined, wrong type).
 */
export class InvalidInputError extends GlossaristError {
  /**
   * @param {string} what - description of the invalid input
   * @param {string} [expected] - description of what was expected; when
   *   given it is appended to the message as "(expected ...)"
   */
  constructor(what, expected) {
    let message = what;
    if (expected) {
      message = `${what} (expected ${expected})`;
    }
    super(message);
    this.name = 'InvalidInputError';
  }
}
|
|
29
|
+
|
|
30
|
+
/**
 * Raised when YAML content cannot be parsed. The original error is kept as
 * `cause` and its message is embedded in this error's message.
 */
export class YamlParseError extends GlossaristError {
  /**
   * @param {string} context - what was being parsed (e.g. concept ID)
   * @param {Error} cause - the original YAML parse error
   */
  constructor(context, cause) {
    const message = `Failed to parse YAML for ${context}: ${cause.message}`;
    super(message, { cause });
    this.name = 'YamlParseError';
  }
}
|
package/src/gcr-reader.d.ts
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/** One term designation of a concept. */
export interface Term {
  /** Kind of designation, e.g. 'expression', 'symbol', 'abbreviation'. */
  type: string;
  /** The term text. */
  designation: string;
  /** e.g. 'preferred', 'admitted'. */
  normative_status?: string;
}

/** One block of definition content. */
export interface Definition {
  /** The definition text. */
  content: string;
}

/** Reference to a bibliographic source. */
export interface Source {
  /** e.g. 'authoritative', 'adapted'. */
  type: string;
  /** Reference to the source standard. */
  origin?: { ref: string };
}

/** Concept data localized for one language. */
export interface Localization {
  /** Term designations. */
  terms: Array<Term>;
  /** Definition content. */
  definition?: Array<Definition>;
  /** Editorial notes. */
  notes?: Array<{ content: string }>;
  /** Usage examples. */
  examples?: Array<{ content: string }>;
  /** Bibliographic sources. */
  sources?: Array<Source>;
  /** e.g. 'valid', 'draft'. */
  entry_status?: string;
  /** e.g. 'preferred', 'admitted'. */
  normative_status?: string;
}

/** A glossarist concept in normalized form. */
export interface Concept {
  /** Concept identifier (e.g. '3.1.1.1', '551-12-39'). */
  termid: string;
  /** Primary term (canonical format only). */
  term: string | null;
  /** Localized data keyed by language code. */
  localizations: Record<string, Localization>;
  /** Original parsed YAML. */
  raw: Record<string, unknown>;
}

/** Metadata of a GCR package, as read from metadata.yaml. */
export interface GcrMetadata {
  /** Dataset short name. */
  shortname: string;
  /** Dataset version. */
  version?: string;
  /** Dataset title. */
  title?: string;
  /** Number of concepts. */
  concept_count?: number;
  /** Supported language codes. */
  languages?: Array<string>;
  /** Schema version. */
  schema_version?: string;
  /** Glossarist tool version. */
  glossarist_version?: string;
  /** Creation timestamp. */
  created_at?: string;
  /** Dataset statistics. */
  statistics?: Record<string, unknown>;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Load a GCR package from a ZIP archive.
 *
 * @param input - the archive as a Buffer, ArrayBuffer, Uint8Array, Blob, or
 *   base64 string
 * @returns a promise for the loaded package
 */
export function loadGcr(input: Buffer | ArrayBuffer | Uint8Array | Blob | string): Promise<GcrPackage>;

/** A loaded GCR package (ZIP archive of glossarist concept data). */
export class GcrPackage {
  /** Read and parse metadata.yaml. */
  metadata(): Promise<null | GcrMetadata>;

  /** Read and parse the optional register.yaml. */
  register(): Promise<null | Record<string, unknown>>;

  /** List all concept IDs, naturally sorted. */
  conceptIds(): Promise<Array<string>>;

  /**
   * Read and normalize a single concept by ID.
   *
   * @param id - concept identifier
   */
  concept(id: string): Promise<null | Concept>;

  /**
   * Iterate all concepts via callback.
   *
   * @param callback - receives each concept and an index; may be async
   */
  eachConcept(callback: (concept: Concept, index: number) => void | Promise<void>): Promise<void>;

  /** Load all concepts into an array. */
  allConcepts(): Promise<Array<Concept>>;
}

/**
 * Parse raw concept YAML (canonical or managed format) into a normalized Concept.
 *
 * @param raw - raw YAML string
 * @param context - optional label for the content being parsed
 */
export function parseConceptYaml(raw: string, context?: string): Concept;

/**
 * Natural sort comparator for concept IDs like "3.1.1.1", "551-12-39".
 *
 * @param a - first ID
 * @param b - second ID
 */
export function naturalSort(a: string, b: string): number;
|
package/src/gcr-reader.js
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
import JSZip from 'jszip';
|
|
2
|
+
import yaml from 'js-yaml';
|
|
3
|
+
import { InvalidInputError, YamlParseError } from './errors.js';
|
|
4
|
+
|
|
5
|
+
// Top-level keys of a canonical concept document that normalizeCanonical
// never treats as a localization entry.
const STRUCTURAL_KEYS = new Set(['termid', 'term']);
|
|
6
|
+
|
|
7
|
+
// loadGcr passes the base64 option to JSZip only when a string input matches
// this pattern: at least 100 base64-alphabet characters followed by at most
// two '=' characters. Whitespace or line breaks anywhere make it not match.
const BASE64_RE = /^[A-Za-z0-9+/]{100,}={0,2}$/;
|
|
8
|
+
|
|
9
|
+
// Used by naturalSort to split an ID into alternating runs of digits and
// non-digits.
const NATURAL_SORT_RE = /(\d+|\D+)/g;
|
|
10
|
+
// Used by naturalSort to test whether a chunk consists only of digits.
const DIGIT_RE = /^\d+$/;
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* @typedef {Object} Term
|
|
14
|
+
* @property {string} type - e.g. 'expression', 'symbol', 'abbreviation'
|
|
15
|
+
* @property {string} designation - the term text
|
|
16
|
+
* @property {string} [normative_status] - e.g. 'preferred', 'admitted'
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* @typedef {Object} Definition
|
|
21
|
+
* @property {string} content - the definition text
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* @typedef {Object} Source
|
|
26
|
+
* @property {string} type - e.g. 'authoritative', 'adapted'
|
|
27
|
+
* @property {{ ref: string }} [origin] - reference to the source standard
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* @typedef {Object} Localization
|
|
32
|
+
* @property {Term[]} terms - term designations
|
|
33
|
+
* @property {Definition[]} [definition] - definition content
|
|
34
|
+
* @property {{ content: string }[]} [notes] - editorial notes
|
|
35
|
+
* @property {{ content: string }[]} [examples] - usage examples
|
|
36
|
+
* @property {Source[]} [sources] - bibliographic sources
|
|
37
|
+
* @property {string} [entry_status] - e.g. 'valid', 'draft'
|
|
38
|
+
* @property {string} [normative_status] - e.g. 'preferred', 'admitted'
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* @typedef {Object} Concept
|
|
43
|
+
* @property {string} termid - concept identifier (e.g. '3.1.1.1', '551-12-39')
|
|
44
|
+
* @property {string|null} term - primary term (canonical format only)
|
|
45
|
+
* @property {Record<string, Localization>} localizations - keyed by ISO 639-3 language code
|
|
46
|
+
* @property {Record<string, unknown>} raw - original parsed YAML
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
/**
 * Open a GCR package (a ZIP archive) and wrap it in a {@link GcrPackage}.
 *
 * The input is handed to `JSZip.loadAsync`. A string input that matches
 * `BASE64_RE` is loaded with the `base64` option; any other input is passed
 * through without options.
 *
 * @param {Buffer | ArrayBuffer | Uint8Array | Blob | string} input
 * @returns {Promise<GcrPackage>}
 * @throws {InvalidInputError} if input is null or undefined
 *
 * @example
 * import { loadGcr } from 'glossarist';
 * import fs from 'fs';
 * const pkg = await loadGcr(fs.readFileSync('dataset.gcr'));
 * const meta = await pkg.metadata();
 */
export async function loadGcr(input) {
  if (input === null || input === undefined) {
    throw new InvalidInputError('loadGcr requires a Buffer, ArrayBuffer, Uint8Array, Blob, or base64 string', 'non-null input');
  }

  let loadOptions;
  if (typeof input === 'string' && BASE64_RE.test(input)) {
    loadOptions = { base64: true };
  }

  const archive = await JSZip.loadAsync(input, loadOptions);
  return new GcrPackage(archive);
}
|
|
73
|
+
|
|
74
|
+
/**
 * A loaded GCR package: a thin reader over a JSZip archive of glossarist
 * concept data.
 */
export class GcrPackage {
  /** @param {JSZip} zip - the opened archive */
  constructor(zip) {
    this._zip = zip;
  }

  /**
   * Read `metadata.yaml` from the package and parse it.
   * @returns {Promise<GcrMetadata | null>} null when the entry is absent or empty
   */
  async metadata() {
    const text = await this._readText('metadata.yaml');
    if (!text) return null;
    return yaml.load(text);
  }

  /**
   * Read the optional `register.yaml` from the package and parse it.
   * @returns {Promise<Record<string, unknown> | null>} null when the entry is absent or empty
   */
  async register() {
    const text = await this._readText('register.yaml');
    if (!text) return null;
    return yaml.load(text);
  }

  /**
   * List the IDs of all concepts in the package, naturally sorted. An ID is
   * the path of a non-directory `concepts/*.yaml` entry with the `concepts/`
   * prefix and `.yaml` suffix removed.
   * @returns {Promise<string[]>}
   */
  async conceptIds() {
    const prefix = 'concepts/';
    const suffix = '.yaml';
    const found = [];
    this._zip.forEach((entryPath, entry) => {
      const isConceptFile = !entry.dir && entryPath.startsWith(prefix) && entryPath.endsWith(suffix);
      if (isConceptFile) {
        found.push(entryPath.slice(prefix.length, -suffix.length));
      }
    });
    found.sort(naturalSort);
    return found;
  }

  /**
   * Read `concepts/<id>.yaml` and normalize it.
   * @param {string} id - concept identifier
   * @returns {Promise<Concept | null>} null when the package has no such entry
   */
  async concept(id) {
    const text = await this._readText(`concepts/${id}.yaml`);
    return text === null ? null : parseConceptYaml(text, id);
  }

  /**
   * Visit the concepts one at a time, in `conceptIds()` order, awaiting the
   * callback before reading the next concept. The index passed to the
   * callback is the concept's position in the ID list.
   * @param {(concept: Concept, index: number) => void | Promise<void>} callback
   * @returns {Promise<void>}
   */
  async eachConcept(callback) {
    const ids = await this.conceptIds();
    for (const [index, id] of ids.entries()) {
      const current = await this.concept(id);
      if (current) {
        await callback(current, index);
      }
    }
  }

  /**
   * Read every concept into a single array. Beware memory for large packages.
   * @returns {Promise<Concept[]>}
   */
  async allConcepts() {
    const ids = await this.conceptIds();
    const collected = [];
    for (const id of ids) {
      const current = await this.concept(id);
      if (current) {
        collected.push(current);
      }
    }
    return collected;
  }

  /**
   * Read one archive entry as text.
   * @private
   * @param {string} filePath - entry path inside the archive
   * @returns {Promise<string | null>} null when the archive has no such entry
   */
  async _readText(filePath) {
    const entry = this._zip.file(filePath);
    return entry ? entry.async('text') : null;
  }
}
|
|
160
|
+
|
|
161
|
+
// --- Concept YAML parsing ---
|
|
162
|
+
|
|
163
|
+
/**
 * Turn concept YAML (canonical or managed format) into a normalized object.
 *
 * Canonical format (single doc):
 *   { termid: "3.1.1.1", eng: { terms: [...], definition: [...] }, ... }
 *
 * Managed concept format (multi-doc):
 *   doc 0: { data: { identifier: "3.1.1.1", localized_concepts: { eng: "uuid" } }, id: "uuid" }
 *   doc 1+: { data: { language_code: "eng", terms: [...], ... }, id: "uuid" }
 *
 * Format detection looks only at the first document: a lone document with a
 * `termid` is canonical; otherwise a first document with `data.identifier`
 * is managed; anything else that is not empty is handled as canonical.
 *
 * @param {string} raw - raw YAML string
 * @param {string} [context] - concept ID or filename for error messages
 * @returns {Concept}
 * @throws {InvalidInputError} if raw is null, undefined, not a string, or blank
 * @throws {YamlParseError} if the YAML content is malformed or contains no document
 *
 * @example
 * const concept = parseConceptYaml('termid: "001"\neng:\n  terms:\n    - designation: test', '001');
 * console.log(concept.localizations.eng.terms[0].designation); // "test"
 */
export function parseConceptYaml(raw, context) {
  const label = context ?? 'concept';
  const complaint = `parseConceptYaml requires a non-empty YAML string (${label})`;

  if (raw === null || raw === undefined) {
    throw new InvalidInputError(complaint, 'non-null string');
  }
  const isBlank = typeof raw === 'string' && raw.trim() === '';
  if (typeof raw !== 'string' || isBlank) {
    throw new InvalidInputError(complaint, 'non-empty string');
  }

  let docs;
  try {
    docs = yaml.loadAll(raw, null, { schema: yaml.DEFAULT_SCHEMA });
  } catch (err) {
    throw new YamlParseError(label, err);
  }

  const first = docs[0];
  const isLoneCanonical = docs.length === 1 && first?.termid !== undefined;

  // A lone document carrying `termid` wins over the managed check.
  if (!isLoneCanonical && first?.data?.identifier !== undefined) {
    return normalizeManaged(docs);
  }

  if (first === null || first === undefined) {
    throw new YamlParseError(label, new Error('YAML document is empty'));
  }

  return normalizeCanonical(first);
}
|
|
214
|
+
|
|
215
|
+
/**
 * Normalize a canonical-format document into a Concept.
 *
 * Every top-level key that is not in `STRUCTURAL_KEYS` and whose value is a
 * non-null object becomes an entry of `localizations`.
 *
 * @private
 * @param {Record<string, any>} doc - first document of the parsed YAML
 * @returns {Concept} `termid` is the empty string when the document has no
 *   (or a null) `termid`, so callers can detect the missing identifier with a
 *   plain truthiness check
 */
function normalizeCanonical(doc) {
  // Object.fromEntries defines own properties, so an own key named
  // `__proto__` in the document cannot replace the prototype of the result
  // the way a plain `localizations[key] = value` assignment would.
  const localizations = Object.fromEntries(
    Object.entries(doc).filter(
      ([key, value]) => !STRUCTURAL_KEYS.has(key) && typeof value === 'object' && value !== null,
    ),
  );

  return {
    // String(undefined) / String(null) would produce the truthy strings
    // "undefined" / "null" and make a missing identifier look present.
    termid: doc.termid == null ? '' : String(doc.termid),
    term: doc.term || null,
    localizations,
    raw: doc,
  };
}
|
|
230
|
+
|
|
231
|
+
/**
 * Normalize a managed-concept document list into a Concept.
 *
 * The first document supplies the identifier and becomes `raw`. Each later
 * document that has a truthy `data.language_code` contributes one
 * localization: a shallow copy of its `data` without the `language_code`
 * key, stored under that language code. Other documents are ignored.
 *
 * @private
 * @param {Record<string, any>[]} docs - all documents of the parsed YAML
 * @returns {Concept}
 */
function normalizeManaged(docs) {
  const [managed, ...localizedDocs] = docs;
  const termid = String(managed.data.identifier);

  const localizations = {};
  for (const entry of localizedDocs) {
    const data = entry?.data;
    if (!data?.language_code) continue;
    // Rest destructuring copies the data without mutating the parsed document.
    const { language_code: lang, ...rest } = data;
    localizations[lang] = rest;
  }

  return { termid, term: null, localizations, raw: managed };
}
|
|
252
|
+
|
|
253
|
+
// --- Helpers ---
|
|
254
|
+
|
|
255
|
+
/**
 * Compare two all-digit strings by numeric value without converting them to
 * Number, so arbitrarily long digit runs are compared exactly.
 *
 * @private
 * @param {string} x - string of ASCII digits
 * @param {string} y - string of ASCII digits
 * @returns {number} negative, zero or positive
 */
function compareDigitRuns(x, y) {
  const sx = x.replace(/^0+(?=\d)/, '');
  const sy = y.replace(/^0+(?=\d)/, '');
  // Without leading zeros, a longer run is the larger number; equal-length
  // runs of ASCII digits order the same lexicographically and numerically.
  if (sx.length !== sy.length) return sx.length - sy.length;
  if (sx === sy) return 0;
  return sx < sy ? -1 : 1;
}

/**
 * Natural sort comparator for concept IDs like "3.1.1.1", "551-12-39".
 *
 * Both IDs are split into runs of digits and non-digits. Runs are compared
 * pairwise: two digit runs by numeric value, anything else with
 * `localeCompare`. IDs that are equivalent under that comparison but are
 * different strings (for example "3.01" and "3.1") are ordered by plain
 * string comparison, so the result is 0 only for identical strings and a
 * sort does not depend on the input order.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} negative if a sorts first, positive if b sorts first, 0 if equal
 *
 * @example
 * ['3.1.10', '3.1.2', '3.1.1'].sort(naturalSort); // ['3.1.1', '3.1.2', '3.1.10']
 */
export function naturalSort(a, b) {
  const pa = a.match(NATURAL_SORT_RE) || [];
  const pb = b.match(NATURAL_SORT_RE) || [];
  const len = Math.max(pa.length, pb.length);

  for (let i = 0; i < len; i++) {
    // A missing chunk (one ID ran out first) compares as the empty string.
    const na = pa[i] || '';
    const nb = pb[i] || '';
    const cmp = DIGIT_RE.test(na) && DIGIT_RE.test(nb)
      ? compareDigitRuns(na, nb)
      : na.localeCompare(nb);
    if (cmp !== 0) return cmp;
  }

  // Deterministic tie-break for distinct strings that compared equal above.
  if (a === b) return 0;
  return a < b ? -1 : 1;
}
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* @typedef {Object} GcrMetadata
|
|
283
|
+
* @property {string} shortname - dataset short name
|
|
284
|
+
* @property {string} [version] - dataset version
|
|
285
|
+
* @property {string} [title] - dataset title
|
|
286
|
+
* @property {number} [concept_count] - number of concepts
|
|
287
|
+
* @property {string[]} [languages] - supported language codes
|
|
288
|
+
* @property {string} [schema_version] - schema version
|
|
289
|
+
* @property {string} [glossarist_version] - glossarist tool version
|
|
290
|
+
* @property {string} [created_at] - ISO 8601 creation timestamp
|
|
291
|
+
* @property {Record<string, unknown>} [statistics] - dataset statistics
|
|
292
|
+
*/
|
package/src/index.d.ts
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { loadGcr, GcrPackage, parseConceptYaml, naturalSort } from './gcr-reader';
|
|
2
|
+
export type { Concept, Localization, Term, Definition, Source, GcrMetadata } from './gcr-reader';
|
|
3
|
+
export { readConcepts, readConcept, listConceptIds, readRegister } from './concept-reader';
|
|
4
|
+
export { GlossaristError, InvalidInputError, YamlParseError } from './errors';
|
package/src/index.js
ADDED