glossarist 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Glossarist Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,189 @@
1
+ # glossarist-js
2
+
3
+ [![CI](https://github.com/glossarist/glossarist-js/actions/workflows/ci.yml/badge.svg)](https://github.com/glossarist/glossarist-js/actions/workflows/ci.yml)
4
+ [![npm version](https://img.shields.io/npm/v/glossarist.svg)](https://www.npmjs.com/package/glossarist)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ JavaScript library for reading [Glossarist](https://github.com/glossarist) GCR packages (ZIP archives) and v2 glossarist concept data (YAML files). Works in Node.js and browsers.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ npm install glossarist
13
+ ```
14
+
15
+ Requires Node.js 18+.
16
+
17
+ ## Usage
18
+
19
+ ### Reading a GCR package
20
+
21
+ ```js
22
+ import { loadGcr } from 'glossarist';
23
+ import fs from 'fs';
24
+
25
+ const buf = fs.readFileSync('my-dataset.gcr');
26
+ const pkg = await loadGcr(buf);
27
+
28
+ // Metadata
29
+ const meta = await pkg.metadata();
30
+ console.log(meta.shortname, meta.version, meta.concept_count);
31
+
32
+ // List concept IDs
33
+ const ids = await pkg.conceptIds();
34
+
35
+ // Read a specific concept
36
+ const concept = await pkg.concept('3.1.1.1');
37
+ console.log(concept.termid);
38
+ console.log(concept.localizations.eng.terms[0].designation);
39
+
40
+ // Iterate all concepts (streaming)
41
+ await pkg.eachConcept((concept) => {
42
+ console.log(concept.termid);
43
+ });
44
+ ```
45
+
46
+ `loadGcr` accepts `Buffer`, `ArrayBuffer`, `Uint8Array`, `Blob`, or a base64-encoded string.
47
+
48
+ ### Reading concept YAML files from a directory
49
+
50
+ ```js
51
+ import { readConcepts, readConcept, listConceptIds } from 'glossarist';
52
+
53
+ // Read all concepts
54
+ const concepts = readConcepts('./geolexica-v2/');
55
+ console.log(`Loaded ${concepts.length} concepts`);
56
+
57
+ // Read a single concept by ID
58
+ const concept = readConcept('./geolexica-v2/', '3.1.1.1');
59
+
60
+ // List IDs with optional prefix filter
61
+ const ids = listConceptIds('./geolexica-v2/', '3.1.');
62
+ ```
63
+
64
+ ### Browser usage
65
+
66
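+ The GCR reader works in browsers via jszip. Import it from `glossarist/gcr` rather than the package root: the root entry point also re-exports the concept directory reader, which requires Node.js `fs`. The bare `glossarist/gcr` specifier must be resolved by a bundler or an import map.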
67
+
68
+ ```html
69
+ <script type="module">
70
+ import { loadGcr } from 'glossarist/gcr';
71
+
72
+ const response = await fetch('/datasets/isotc204.gcr');
73
+ const buf = await response.arrayBuffer();
74
+ const pkg = await loadGcr(buf);
75
+ const meta = await pkg.metadata();
76
+ </script>
77
+ ```
78
+
79
+ ## Concept format
80
+
81
+ Glossarist-js normalizes both storage formats into a consistent structure:
82
+
83
+ ```js
84
+ {
85
+ termid: '3.1.1.1', // concept identifier
86
+ term: 'entity', // primary term (canonical format only)
87
+ localizations: {
88
+ eng: {
89
+ terms: [{ type: 'expression', designation: 'entity', normative_status: 'preferred' }],
90
+ definition: [{ content: 'concrete or abstract thing...' }],
91
+ notes: [],
92
+ examples: [],
93
+ sources: [{ type: 'authoritative', origin: { ref: 'ISO/TS 14812:2022' } }],
94
+ entry_status: 'valid',
95
+ },
96
+ fra: { ... },
97
+ },
98
+ raw: { ... }, // original parsed YAML
99
+ }
100
+ ```
101
+
102
+ Language codes are discovered dynamically from the YAML keys — any ISO 639-3 code works without code changes.
103
+
104
+ ### Supported formats
105
+
106
+ | Format | Structure | Used by |
107
+ |--------|-----------|---------|
108
+ | **Canonical** | Single YAML document with `termid` and language keys (`eng:`, `fra:`) | IEV (iec-electropedia) |
109
+ | **Managed concept** | Multi-document YAML: first doc has `data.identifier` + `data.localized_concepts`, subsequent docs have `data.language_code` | isotc204, isotc211, osgeo |
110
+
111
+ ## Error handling
112
+
113
+ All public functions validate inputs and throw descriptive errors with context:
114
+
115
+ ```js
116
+ import { InvalidInputError, YamlParseError } from 'glossarist';
117
+
118
+ try {
119
+ await pkg.concept('3.1.1.1');
120
+ } catch (err) {
121
+ if (err instanceof YamlParseError) {
122
+ // err.message: "Failed to parse YAML for 3.1.1.1: ..."
123
+ // err.cause: the original YAML parse error
124
+ } else if (err instanceof InvalidInputError) {
125
+ // Invalid input (null, empty string, wrong type)
126
+ }
127
+ }
128
+ ```
129
+
130
+ Errors include the concept ID or filename in their message, making it easy to locate failures in large datasets.
131
+
132
+ - **`GlossaristError`** — base class for all library errors
133
+ - **`InvalidInputError`** — null, undefined, empty, or wrong-type arguments
134
+ - **`YamlParseError`** — malformed YAML with `cause` chaining the original error
135
+
136
+ ## TypeScript
137
+
138
+ TypeScript declarations are included. No `@types/` package needed.
139
+
140
+ ```ts
141
+ import { loadGcr, readConcepts, type Concept, type GcrMetadata } from 'glossarist';
142
+
143
+ const pkg = await loadGcr(buffer);
144
+ const meta: GcrMetadata | null = await pkg.metadata();
145
+ ```
146
+
147
+ ## API
148
+
149
+ ### GCR Package (`glossarist/gcr`)
150
+
151
+ - `loadGcr(input)` — Load a GCR ZIP from Buffer/ArrayBuffer/Uint8Array/Blob/base64 string. Returns `GcrPackage`.
152
+ - `GcrPackage#metadata()` — Parse `metadata.yaml`.
153
+ - `GcrPackage#register()` — Parse optional `register.yaml`.
154
+ - `GcrPackage#conceptIds()` — Array of concept IDs (natural-sorted).
155
+ - `GcrPackage#concept(id)` — Read and normalize a single concept.
156
+ - `GcrPackage#eachConcept(callback)` — Stream all concepts.
157
+ - `GcrPackage#allConcepts()` — Load all concepts into an array.
158
+ - `parseConceptYaml(raw, context?)` — Parse raw YAML string into normalized concept object. `context` is an optional concept ID or filename for error messages.
159
+ - `naturalSort(a, b)` — Natural sort comparator for concept IDs.
160
+
161
+ ### Concept Directory Reader (`glossarist/concept`)
162
+
163
+ Node.js only (uses `fs`).
164
+
165
+ - `readConcepts(dir)` — Read all concept YAML files from a directory.
166
+ - `readConcept(dir, id)` — Read a single concept by ID.
167
+ - `listConceptIds(dir, prefix?)` — List concept IDs, optionally filtered by prefix.
168
+ - `readRegister(dir)` — Read `register.yaml` if present.
169
+
170
+ ### Errors
171
+
172
+ - `GlossaristError` — base error class
173
+ - `InvalidInputError` — bad input arguments
174
+ - `YamlParseError` — YAML parse failures (has `cause`, includes concept context)
175
+
176
+ ## Development
177
+
178
+ ```bash
179
+ npm install
180
+ npm test # regenerate fixtures + run all tests
181
+ npm run lint # lint src/ and test/
182
+ npm run test:coverage # run with coverage report
183
+ ```
184
+
185
+ See [CONTRIBUTING.md](./CONTRIBUTING.md) for full guidelines.
186
+
187
+ ## License
188
+
189
+ [MIT](./LICENSE)
package/package.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "name": "glossarist",
3
+ "version": "0.1.0",
4
+ "description": "JavaScript library for reading Glossarist GCR packages and v2 concept data",
5
+ "type": "module",
6
+ "main": "src/index.js",
7
+ "types": "src/index.d.ts",
8
+ "sideEffects": false,
9
+ "files": [
10
+ "src"
11
+ ],
12
+ "exports": {
13
+ ".": {
14
+ "types": "./src/index.d.ts",
15
+ "import": "./src/index.js",
16
+ "default": "./src/index.js"
17
+ },
18
+ "./gcr": {
19
+ "types": "./src/gcr-reader.d.ts",
20
+ "import": "./src/gcr-reader.js",
21
+ "default": "./src/gcr-reader.js"
22
+ },
23
+ "./concept": {
24
+ "types": "./src/concept-reader.d.ts",
25
+ "import": "./src/concept-reader.js",
26
+ "default": "./src/concept-reader.js"
27
+ }
28
+ },
29
+ "scripts": {
30
+ "lint": "eslint src/ test/",
31
+ "pretest": "node test/fixtures/build-fixtures.js",
32
+ "test": "node --test test/*.test.js",
33
+ "test:verbose": "node --test --test-reporter spec test/*.test.js",
34
+ "test:coverage": "node --test --experimental-test-coverage test/*.test.js",
35
+ "prepublishOnly": "npm test"
36
+ },
37
+ "keywords": [
38
+ "glossarist",
39
+ "gcr",
40
+ "terminology",
41
+ "glossary",
42
+ "iso",
43
+ "geolexica"
44
+ ],
45
+ "license": "MIT",
46
+ "repository": {
47
+ "type": "git",
48
+ "url": "git+https://github.com/glossarist/glossarist-js.git"
49
+ },
50
+ "homepage": "https://github.com/glossarist/glossarist-js#readme",
51
+ "bugs": {
52
+ "url": "https://github.com/glossarist/glossarist-js/issues"
53
+ },
54
+ "engines": {
55
+ "node": ">=18"
56
+ },
57
+ "publishConfig": {
58
+ "access": "public"
59
+ },
60
+ "dependencies": {
61
+ "js-yaml": "^4.1.0",
62
+ "jszip": "^3.10.1"
63
+ },
64
+ "devDependencies": {
65
+ "@eslint/js": "^10.0.1",
66
+ "eslint": "^10.3.0",
67
+ "globals": "^17.6.0"
68
+ }
69
+ }
@@ -0,0 +1,13 @@
1
+ import type { Concept } from './gcr-reader';
2
+
3
+ /** Read all concept YAML files from a directory. */
4
+ export function readConcepts(dir: string): Concept[];
5
+
6
+ /** Read a single concept by ID from a directory. */
7
+ export function readConcept(dir: string, id: string): Concept | null;
8
+
9
+ /** List all concept IDs in a directory, optionally filtered by prefix. */
10
+ export function listConceptIds(dir: string, prefix?: string): string[];
11
+
12
+ /** Read register.yaml from a dataset directory (if present). */
13
+ export function readRegister(dir: string): Record<string, unknown> | null;
@@ -0,0 +1,94 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import yaml from 'js-yaml';
4
+ import { parseConceptYaml, naturalSort } from './gcr-reader.js';
5
+ import { InvalidInputError } from './errors.js';
6
+
7
/**
 * Shared guard for the directory readers: accepts only a string that still has
 * content after trimming.
 *
 * @param {unknown} dir - value supplied by the caller as the directory path
 * @param {string} fnName - name of the calling function, used in the error text
 * @throws {InvalidInputError} if dir is not a string or is blank
 */
function assertDir(dir, fnName) {
  const isUsablePath = typeof dir === 'string' && dir.trim() !== '';
  if (isUsablePath) return;
  throw new InvalidInputError(`${fnName} requires a directory path`, 'non-empty string');
}
12
+
13
/**
 * Load every concept stored as a `.yaml` file directly inside a directory.
 *
 * `register.yaml` is skipped. Files are read and parsed one after another in
 * natural-sort order of their names, and any parse result without a truthy
 * `termid` is left out of the returned list.
 *
 * @param {string} dir - path to directory containing concept YAML files
 * @returns {import('./gcr-reader.js').Concept[]}
 * @throws {InvalidInputError} if dir is missing or empty
 *
 * @example
 * const concepts = readConcepts('./geolexica-v2/');
 * console.log(concepts[0].localizations.eng.terms[0].designation);
 */
export function readConcepts(dir) {
  assertDir(dir, 'readConcepts');

  const isConceptFile = (name) => name.endsWith('.yaml') && name !== 'register.yaml';
  const fileNames = fs.readdirSync(dir).filter(isConceptFile);
  fileNames.sort(naturalSort);

  return fileNames
    // The file name doubles as the context label for parse errors.
    .map((name) => parseConceptYaml(fs.readFileSync(path.join(dir, name), 'utf8'), name))
    .filter((parsed) => parsed && parsed.termid);
}
39
+
40
/**
 * Read one concept from `<dir>/<id>.yaml`.
 *
 * NOTE(review): `id` is joined into the path as-is, so an id containing path
 * separators or `..` resolves outside `dir`. Confirm callers never pass
 * untrusted ids.
 *
 * @param {string} dir - path to directory containing concept YAML files
 * @param {string} id - concept identifier (filename without .yaml)
 * @returns {import('./gcr-reader.js').Concept | null} the parsed concept, or
 *   null when no such file exists
 * @throws {InvalidInputError} if dir or id is missing or empty
 *
 * @example
 * const concept = readConcept('./geolexica-v2/', '3.1.1.1');
 * if (concept) console.log(concept.termid);
 */
export function readConcept(dir, id) {
  assertDir(dir, 'readConcept');

  const hasId = typeof id === 'string' && id.trim() !== '';
  if (!hasId) {
    throw new InvalidInputError('readConcept requires a concept ID', 'non-empty string');
  }

  const fileName = `${id}.yaml`;
  const location = path.join(dir, fileName);
  return fs.existsSync(location)
    ? parseConceptYaml(fs.readFileSync(location, 'utf8'), fileName)
    : null;
}
61
+
62
/**
 * List all concept IDs in a directory, optionally filtered by prefix.
 *
 * An ID is the name of a `.yaml` file with the extension removed;
 * `register.yaml` is never listed. The result is natural-sorted.
 *
 * The prefix is compared against the ID itself, not the file name. Matching the
 * file name would let the `.yaml` extension take part in the comparison, so a
 * prefix such as `'3.1.1.1.'` would wrongly return the ID `'3.1.1.1'`.
 *
 * @param {string} dir - path to directory
 * @param {string} [prefix] - optional prefix filter; a falsy value disables filtering
 * @returns {string[]}
 * @throws {InvalidInputError} if dir is missing or empty
 *
 * @example
 * const ids = listConceptIds('./geolexica-v2/', '3.1.'); // ['3.1.1.1', '3.1.1.2', ...]
 */
export function listConceptIds(dir, prefix) {
  assertDir(dir, 'listConceptIds');

  const ids = fs.readdirSync(dir)
    .filter((name) => name.endsWith('.yaml') && name !== 'register.yaml')
    .map((name) => name.slice(0, -'.yaml'.length));

  const matching = prefix ? ids.filter((id) => id.startsWith(prefix)) : ids;
  return matching.sort(naturalSort);
}
82
+
83
/**
 * Read register.yaml from a dataset directory (if present).
 *
 * Returns null both when the file is absent and when it parses to nothing
 * (for example an empty file), so callers only ever see an object or null.
 *
 * @param {string} dir - path to directory
 * @returns {Record<string, unknown> | null}
 * @throws {InvalidInputError} if dir is missing or empty
 */
export function readRegister(dir) {
  assertDir(dir, 'readRegister');
  const registerPath = path.join(dir, 'register.yaml');
  if (!fs.existsSync(registerPath)) return null;
  // js-yaml yields undefined for input with no document; normalize it to null
  // to honor the declared return type.
  return yaml.load(fs.readFileSync(registerPath, 'utf8')) ?? null;
}
@@ -0,0 +1,11 @@
1
+ export class GlossaristError extends Error {
2
+ constructor(message: string, options?: { cause?: Error });
3
+ }
4
+
5
+ export class InvalidInputError extends GlossaristError {
6
+ constructor(what: string, expected?: string);
7
+ }
8
+
9
+ export class YamlParseError extends GlossaristError {
10
+ constructor(context: string, cause: Error);
11
+ }
package/src/errors.js ADDED
@@ -0,0 +1,42 @@
1
/**
 * Base error for all glossarist errors.
 */
export class GlossaristError extends Error {
  /**
   * @param {string} message
   * @param {{ cause?: Error }} [options] - forwarded to the Error constructor
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GlossaristError';
  }
}

/**
 * Thrown when a function receives invalid input (null, undefined, wrong type).
 */
export class InvalidInputError extends GlossaristError {
  /**
   * @param {string} what - description of the invalid input
   * @param {string} [expected] - description of what was expected; when given it
   *   is appended to the message as "(expected ...)"
   */
  constructor(what, expected) {
    const msg = expected ? `${what} (expected ${expected})` : what;
    super(msg);
    this.name = 'InvalidInputError';
  }
}

/**
 * Thrown when YAML content cannot be parsed.
 */
export class YamlParseError extends GlossaristError {
  /**
   * @param {string} context - what was being parsed (e.g. concept ID)
   * @param {Error} cause - the original YAML parse error
   */
  constructor(context, cause) {
    // A catch clause can receive any thrown value, not only Error instances.
    // Derive the detail text defensively so that building this error never
    // throws itself or prints "undefined" for a thrown string.
    const detail = typeof cause?.message === 'string' ? cause.message : String(cause);
    super(`Failed to parse YAML for ${context}: ${detail}`, { cause });
    this.name = 'YamlParseError';
  }
}
@@ -0,0 +1,77 @@
1
+ /** A single term designation. */
2
+ export interface Term {
3
+ type: string;
4
+ designation: string;
5
+ normative_status?: string;
6
+ }
7
+
8
+ /** A definition content block. */
9
+ export interface Definition {
10
+ content: string;
11
+ }
12
+
13
+ /** A bibliographic source reference. */
14
+ export interface Source {
15
+ type: string;
16
+ origin?: { ref: string };
17
+ }
18
+
19
+ /** Localized concept data for a single language. */
20
+ export interface Localization {
21
+ terms: Term[];
22
+ definition?: Definition[];
23
+ notes?: { content: string }[];
24
+ examples?: { content: string }[];
25
+ sources?: Source[];
26
+ entry_status?: string;
27
+ normative_status?: string;
28
+ }
29
+
30
+ /** A normalized glossarist concept. */
31
+ export interface Concept {
32
+ termid: string;
33
+ term: string | null;
34
+ localizations: Record<string, Localization>;
35
+ raw: Record<string, unknown>;
36
+ }
37
+
38
+ /** GCR package metadata from metadata.yaml. */
39
+ export interface GcrMetadata {
40
+ shortname: string;
41
+ version?: string;
42
+ title?: string;
43
+ concept_count?: number;
44
+ languages?: string[];
45
+ schema_version?: string;
46
+ glossarist_version?: string;
47
+ created_at?: string;
48
+ statistics?: Record<string, unknown>;
49
+ }
50
+
51
+ /**
52
+ * Load a GCR package from a ZIP archive.
53
+ * Accepts Buffer, ArrayBuffer, Uint8Array, Blob, or base64 string.
54
+ */
55
+ export function loadGcr(input: Buffer | ArrayBuffer | Uint8Array | Blob | string): Promise<GcrPackage>;
56
+
57
+ /** A loaded GCR package (ZIP archive of glossarist concept data). */
58
+ export class GcrPackage {
59
+ /** Read and parse metadata.yaml. */
60
+ metadata(): Promise<GcrMetadata | null>;
61
+ /** Read and parse optional register.yaml. */
62
+ register(): Promise<Record<string, unknown> | null>;
63
+ /** List all concept IDs, naturally sorted. */
64
+ conceptIds(): Promise<string[]>;
65
+ /** Read and normalize a single concept by ID. */
66
+ concept(id: string): Promise<Concept | null>;
67
+ /** Iterate all concepts via callback. */
68
+ eachConcept(callback: (concept: Concept, index: number) => void | Promise<void>): Promise<void>;
69
+ /** Load all concepts into an array. */
70
+ allConcepts(): Promise<Concept[]>;
71
+ }
72
+
73
+ /** Parse raw concept YAML (canonical or managed format) into a normalized Concept. */
74
+ export function parseConceptYaml(raw: string, context?: string): Concept;
75
+
76
+ /** Natural sort comparator for concept IDs like "3.1.1.1", "551-12-39". */
77
+ export function naturalSort(a: string, b: string): number;
@@ -0,0 +1,292 @@
1
+ import JSZip from 'jszip';
2
+ import yaml from 'js-yaml';
3
+ import { InvalidInputError, YamlParseError } from './errors.js';
4
+
5
+ const STRUCTURAL_KEYS = new Set(['termid', 'term']);
6
+
7
+ const BASE64_RE = /^[A-Za-z0-9+/]{100,}={0,2}$/;
8
+
9
+ const NATURAL_SORT_RE = /(\d+|\D+)/g;
10
+ const DIGIT_RE = /^\d+$/;
11
+
12
+ /**
13
+ * @typedef {Object} Term
14
+ * @property {string} type - e.g. 'expression', 'symbol', 'abbreviation'
15
+ * @property {string} designation - the term text
16
+ * @property {string} [normative_status] - e.g. 'preferred', 'admitted'
17
+ */
18
+
19
+ /**
20
+ * @typedef {Object} Definition
21
+ * @property {string} content - the definition text
22
+ */
23
+
24
+ /**
25
+ * @typedef {Object} Source
26
+ * @property {string} type - e.g. 'authoritative', 'adapted'
27
+ * @property {{ ref: string }} [origin] - reference to the source standard
28
+ */
29
+
30
+ /**
31
+ * @typedef {Object} Localization
32
+ * @property {Term[]} terms - term designations
33
+ * @property {Definition[]} [definition] - definition content
34
+ * @property {{ content: string }[]} [notes] - editorial notes
35
+ * @property {{ content: string }[]} [examples] - usage examples
36
+ * @property {Source[]} [sources] - bibliographic sources
37
+ * @property {string} [entry_status] - e.g. 'valid', 'draft'
38
+ * @property {string} [normative_status] - e.g. 'preferred', 'admitted'
39
+ */
40
+
41
+ /**
42
+ * @typedef {Object} Concept
43
+ * @property {string} termid - concept identifier (e.g. '3.1.1.1', '551-12-39')
44
+ * @property {string|null} term - primary term (canonical format only)
45
+ * @property {Record<string, Localization>} localizations - keyed by ISO 639-3 language code
46
+ * @property {Record<string, unknown>} raw - original parsed YAML
47
+ */
48
+
49
/**
 * Open a GCR package (a ZIP archive) and wrap it in a GcrPackage.
 *
 * The input is handed to JSZip. A string is flagged as base64 only when it
 * matches BASE64_RE; any other input is passed through without options.
 *
 * @param {Buffer | ArrayBuffer | Uint8Array | Blob | string} input
 * @returns {Promise<GcrPackage>}
 * @throws {InvalidInputError} if input is null or undefined
 *
 * @example
 * import { loadGcr } from 'glossarist';
 * import fs from 'fs';
 * const pkg = await loadGcr(fs.readFileSync('dataset.gcr'));
 * const meta = await pkg.metadata();
 */
export async function loadGcr(input) {
  if (input == null) {
    throw new InvalidInputError('loadGcr requires a Buffer, ArrayBuffer, Uint8Array, Blob, or base64 string', 'non-null input');
  }

  let loadOptions;
  if (typeof input === 'string' && BASE64_RE.test(input)) {
    loadOptions = { base64: true };
  }

  const archive = await JSZip.loadAsync(input, loadOptions);
  return new GcrPackage(archive);
}
73
+
74
/**
 * A loaded GCR package: a thin reader over a JSZip archive holding
 * `metadata.yaml`, an optional `register.yaml`, and `concepts/<id>.yaml` files.
 */
export class GcrPackage {
  /** @param {JSZip} zip - archive already opened by JSZip */
  constructor(zip) {
    this._zip = zip;
  }

  /**
   * Parse `metadata.yaml`.
   * @returns {Promise<GcrMetadata | null>} null when the entry is absent or empty
   */
  async metadata() {
    const text = await this._readText('metadata.yaml');
    if (!text) return null;
    return yaml.load(text);
  }

  /**
   * Parse the optional `register.yaml`.
   * @returns {Promise<Record<string, unknown> | null>} null when the entry is absent or empty
   */
  async register() {
    const text = await this._readText('register.yaml');
    if (!text) return null;
    return yaml.load(text);
  }

  /**
   * Collect the IDs of all concept entries: the part of each
   * `concepts/<id>.yaml` path between the folder prefix and the extension.
   * @returns {Promise<string[]>} IDs in natural-sort order
   */
  async conceptIds() {
    const folder = 'concepts/';
    const extension = '.yaml';
    const found = [];
    this._zip.forEach((entryPath, entry) => {
      if (entry.dir) return;
      if (!entryPath.startsWith(folder) || !entryPath.endsWith(extension)) return;
      found.push(entryPath.slice(folder.length, -extension.length));
    });
    found.sort(naturalSort);
    return found;
  }

  /**
   * Read and normalize a single concept by ID.
   * @param {string} id - concept identifier
   * @returns {Promise<Concept | null>} null when the archive has no such entry
   */
  async concept(id) {
    const text = await this._readText(`concepts/${id}.yaml`);
    return text === null ? null : parseConceptYaml(text, id);
  }

  /**
   * Visit concepts one at a time, awaiting the callback before reading the
   * next entry. The index passed along is the position in the sorted ID list.
   * @param {(concept: Concept, index: number) => void | Promise<void>} callback
   * @returns {Promise<void>}
   */
  async eachConcept(callback) {
    const ids = await this.conceptIds();
    for (const [index, id] of ids.entries()) {
      const current = await this.concept(id);
      if (current) await callback(current, index);
    }
  }

  /**
   * Load all concepts into an array. Beware memory for large packages.
   * @returns {Promise<Concept[]>}
   */
  async allConcepts() {
    const loaded = [];
    for (const id of await this.conceptIds()) {
      const current = await this.concept(id);
      if (current) loaded.push(current);
    }
    return loaded;
  }

  /**
   * Read one archive entry as text.
   * @private
   * @param {string} filePath
   * @returns {Promise<string | null>} null when the entry does not exist
   */
  async _readText(filePath) {
    const entry = this._zip.file(filePath);
    return entry ? entry.async('text') : null;
  }
}
160
+
161
+ // --- Concept YAML parsing ---
162
+
163
/**
 * Turn raw concept YAML into a normalized Concept.
 *
 * Two layouts are recognized:
 *
 * - Canonical (single document):
 *   { termid: "3.1.1.1", eng: { terms: [...], definition: [...] }, ... }
 * - Managed concept (multi-document):
 *   doc 0:  { data: { identifier: "3.1.1.1", localized_concepts: { eng: "uuid" } }, id: "uuid" }
 *   doc 1+: { data: { language_code: "eng", terms: [...], ... }, id: "uuid" }
 *
 * A lone document carrying `termid` is treated as canonical. Otherwise a first
 * document carrying `data.identifier` selects the managed layout, and anything
 * else non-empty falls back to canonical handling of the first document.
 *
 * @param {string} raw - raw YAML string
 * @param {string} [context] - concept ID or filename for error messages
 * @returns {Concept}
 * @throws {InvalidInputError} if raw is null, undefined, not a string, or blank
 * @throws {YamlParseError} if the YAML is malformed or its first document is empty
 *
 * @example
 * const concept = parseConceptYaml('termid: "001"\neng:\n  terms:\n    - designation: test', '001');
 * console.log(concept.localizations.eng.terms[0].designation); // "test"
 */
export function parseConceptYaml(raw, context) {
  const label = context ?? 'concept';
  const complaint = `parseConceptYaml requires a non-empty YAML string (${label})`;

  if (raw == null) {
    throw new InvalidInputError(complaint, 'non-null string');
  }
  if (typeof raw !== 'string' || raw.trim() === '') {
    throw new InvalidInputError(complaint, 'non-empty string');
  }

  let docs;
  try {
    docs = yaml.loadAll(raw, null, { schema: yaml.DEFAULT_SCHEMA });
  } catch (err) {
    throw new YamlParseError(label, err);
  }

  const [first] = docs;
  const isLoneCanonical = docs.length === 1 && first?.termid !== undefined;
  if (!isLoneCanonical) {
    if (first?.data?.identifier !== undefined) {
      return normalizeManaged(docs);
    }
    if (first == null) {
      throw new YamlParseError(label, new Error('YAML document is empty'));
    }
  }
  return normalizeCanonical(first);
}
214
+
215
/**
 * Normalize a canonical-format document into a Concept.
 *
 * Every top-level key outside STRUCTURAL_KEYS whose value is a non-null object
 * is taken as a language localization.
 *
 * A missing or null `termid` becomes the empty string. Stringifying it blindly
 * would produce the truthy literal "undefined" or "null", which callers testing
 * `concept.termid` could not tell apart from a real identifier.
 *
 * @private
 * @param {Record<string, any>} doc - first parsed YAML document
 * @returns {Concept}
 */
function normalizeCanonical(doc) {
  const localizations = {};
  for (const [key, value] of Object.entries(doc)) {
    if (STRUCTURAL_KEYS.has(key)) continue;
    if (typeof value === 'object' && value !== null) {
      localizations[key] = value;
    }
  }
  return {
    termid: doc.termid == null ? '' : String(doc.termid),
    term: doc.term || null,
    localizations,
    raw: doc,
  };
}
230
+
231
/**
 * Normalize a managed-concept document list into a Concept.
 *
 * The first document supplies the identifier and is kept as `raw`. Each later
 * document with a truthy `data.language_code` contributes one localization,
 * keyed by that code and holding a shallow copy of its `data` without the
 * `language_code` field. Documents lacking it are skipped.
 *
 * @private
 * @param {Record<string, any>[]} docs - all parsed YAML documents, managed concept first
 * @returns {Concept}
 */
function normalizeManaged(docs) {
  const [managed, ...localizedDocs] = docs;
  const termid = String(managed.data.identifier);

  const localizations = {};
  for (const localized of localizedDocs) {
    const data = localized?.data;
    if (!data?.language_code) continue;
    // Rest destructuring copies the fields, so the parsed document stays intact.
    const { language_code: lang, ...fields } = data;
    localizations[lang] = fields;
  }

  return { termid, term: null, localizations, raw: managed };
}
252
+
253
+ // --- Helpers ---
254
+
255
/**
 * Comparator that orders concept IDs such as "3.1.1.1" or "551-12-39" the way
 * a person would read them.
 *
 * Both strings are split into alternating digit and non-digit runs, which are
 * compared pairwise. Two digit runs compare by numeric value; any other pair
 * compares with `localeCompare`. A run missing on one side counts as an empty
 * string.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} negative, zero, or positive, as Array#sort expects
 *
 * @example
 * ['3.1.10', '3.1.2', '3.1.1'].sort(naturalSort); // ['3.1.1', '3.1.2', '3.1.10']
 */
export function naturalSort(a, b) {
  const leftRuns = a.match(NATURAL_SORT_RE) || [];
  const rightRuns = b.match(NATURAL_SORT_RE) || [];
  const span = Math.max(leftRuns.length, rightRuns.length);

  for (let i = 0; i < span; i += 1) {
    const left = leftRuns[i] || '';
    const right = rightRuns[i] || '';
    const bothNumeric = DIGIT_RE.test(left) && DIGIT_RE.test(right);
    const order = bothNumeric
      ? parseInt(left, 10) - parseInt(right, 10)
      : left.localeCompare(right);
    if (order !== 0) return order;
  }
  return 0;
}
280
+
281
+ /**
282
+ * @typedef {Object} GcrMetadata
283
+ * @property {string} shortname - dataset short name
284
+ * @property {string} [version] - dataset version
285
+ * @property {string} [title] - dataset title
286
+ * @property {number} [concept_count] - number of concepts
287
+ * @property {string[]} [languages] - supported language codes
288
+ * @property {string} [schema_version] - schema version
289
+ * @property {string} [glossarist_version] - glossarist tool version
290
+ * @property {string} [created_at] - ISO 8601 creation timestamp
291
+ * @property {Record<string, unknown>} [statistics] - dataset statistics
292
+ */
package/src/index.d.ts ADDED
@@ -0,0 +1,4 @@
1
+ export { loadGcr, GcrPackage, parseConceptYaml, naturalSort } from './gcr-reader';
2
+ export type { Concept, Localization, Term, Definition, Source, GcrMetadata } from './gcr-reader';
3
+ export { readConcepts, readConcept, listConceptIds, readRegister } from './concept-reader';
4
+ export { GlossaristError, InvalidInputError, YamlParseError } from './errors';
package/src/index.js ADDED
@@ -0,0 +1,3 @@
1
+ export { loadGcr, GcrPackage, parseConceptYaml, naturalSort } from './gcr-reader.js';
2
+ export { readConcepts, readConcept, listConceptIds, readRegister } from './concept-reader.js';
3
+ export { GlossaristError, InvalidInputError, YamlParseError } from './errors.js';