@glossarist/concept-browser 0.7.43 → 0.7.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/cli/index.mjs +12 -13
  2. package/package.json +3 -2
  3. package/scripts/__tests__/fetch-datasets.test.mjs +105 -0
  4. package/scripts/fetch-datasets.mjs +53 -51
  5. package/scripts/generate-data.mjs +41 -19
  6. package/scripts/lib/build/image-assets.mjs +190 -0
  7. package/scripts/lib/build/non-verbal-consumer.mjs +221 -0
  8. package/scripts/lib/local-path-safety.mjs +68 -0
  9. package/src/__tests__/bibliography-adapter.test.ts +79 -0
  10. package/src/__tests__/content-renderer-nvr-mentions.test.ts +57 -0
  11. package/src/__tests__/locale.test.ts +46 -0
  12. package/src/__tests__/model-bridge-entity-refs.test.ts +114 -0
  13. package/src/__tests__/non-verbal-anchor.test.ts +33 -0
  14. package/src/__tests__/non-verbal-cross-ref.test.ts +146 -0
  15. package/src/__tests__/non-verbal-highlight.test.ts +56 -0
  16. package/src/__tests__/non-verbal-kind.test.ts +77 -0
  17. package/src/__tests__/non-verbal-list.test.ts +67 -0
  18. package/src/__tests__/non-verbal-rep-display.test.ts +85 -0
  19. package/src/__tests__/non-verbal-scroll-guard.test.ts +116 -0
  20. package/src/__tests__/use-concept-entities.test.ts +76 -0
  21. package/src/adapters/bibliography-adapter.ts +49 -0
  22. package/src/adapters/factory.ts +14 -0
  23. package/src/adapters/model-bridge.ts +51 -0
  24. package/src/adapters/non-verbal/figure-bridge.ts +101 -0
  25. package/src/adapters/non-verbal/formula-bridge.ts +48 -0
  26. package/src/adapters/non-verbal/index.ts +55 -0
  27. package/src/adapters/non-verbal/kind.ts +46 -0
  28. package/src/adapters/non-verbal/prefix.ts +67 -0
  29. package/src/adapters/non-verbal/source-bridge.ts +81 -0
  30. package/src/adapters/non-verbal/table-bridge.ts +98 -0
  31. package/src/adapters/non-verbal/types.ts +133 -0
  32. package/src/adapters/non-verbal-resolver.ts +101 -0
  33. package/src/components/ConceptDetail.vue +17 -4
  34. package/src/components/LanguageDetail.vue +0 -3
  35. package/src/components/NonVerbalRepDisplay.vue +82 -24
  36. package/src/components/figure/FigureDisplay.vue +132 -0
  37. package/src/components/figure/FigureImages.vue +111 -0
  38. package/src/components/figure/figure-image-pick.ts +56 -0
  39. package/src/components/figure/figure-layout.ts +26 -0
  40. package/src/components/formula/FormulaDisplay.vue +90 -0
  41. package/src/components/formula/FormulaExpression.vue +70 -0
  42. package/src/components/non-verbal/NonVerbalCaption.vue +104 -0
  43. package/src/components/non-verbal/NonVerbalFallback.vue +69 -0
  44. package/src/components/non-verbal/NonVerbalList.vue +118 -0
  45. package/src/components/non-verbal/NonVerbalSources.vue +61 -0
  46. package/src/components/table/TableDisplay.vue +99 -0
  47. package/src/components/table/TableMarkup.vue +63 -0
  48. package/src/components/table/TableStructured.vue +66 -0
  49. package/src/composables/use-concept-entities.ts +70 -0
  50. package/src/composables/use-non-verbal-cross-ref.ts +79 -0
  51. package/src/composables/use-non-verbal-entity.ts +58 -0
  52. package/src/composables/use-reduced-motion.ts +26 -0
  53. package/src/composables/use-render-options.ts +30 -33
  54. package/src/router/index.ts +3 -0
  55. package/src/router/non-verbal-scroll-guard.ts +56 -0
  56. package/src/style.css +17 -0
  57. package/src/utils/content-renderer.ts +76 -64
  58. package/src/utils/locale.ts +92 -0
  59. package/src/utils/non-verbal-anchor.ts +51 -0
  60. package/src/utils/non-verbal-highlight.ts +27 -0
package/cli/index.mjs CHANGED
@@ -20,7 +20,7 @@
20
20
  */
21
21
 
22
22
  import { loadSiteConfig } from '../scripts/load-site-config.mjs';
23
- import { execSync } from 'child_process';
23
+ import { existsSync } from 'fs';
24
24
  import { resolve, dirname } from 'path';
25
25
  import { fileURLToPath } from 'url';
26
26
 
@@ -166,22 +166,21 @@ Environment:
166
166
  }
167
167
  }
168
168
 
169
- // Run vite build using the package's vite.config.ts
169
+ // Run vite build using the package's vite.config.ts via programmatic API
170
170
  console.log(`\n=== BUILD SPA ===\n`);
171
171
  const viteConfig = resolve(pkgRoot, 'vite.config.ts');
172
- const viteBin = [resolve(pkgRoot, 'node_modules', '.bin', 'vite'), 'vite'].find(p => {
173
- try { execSync(`${p} --version`, { stdio: 'pipe' }); return true; } catch { return false; }
174
- });
175
- execSync(`${viteBin} build --config ${viteConfig}`, {
176
- stdio: 'inherit',
177
- env: { ...process.env },
172
+ const { build: viteBuild } = await import('vite');
173
+ await viteBuild({
174
+ configFile: viteConfig,
175
+ root: pkgRoot,
176
+ mode: 'production',
178
177
  });
179
178
 
180
- // Run postbuild (404 page)
181
- try {
182
- const postbuild = resolve(pkgRoot, 'scripts', 'generate-404.js');
183
- execSync(`node ${postbuild}`, { stdio: 'inherit' });
184
- } catch {}
179
+ // Run postbuild (404 page) via dynamic import
180
+ const postbuild = resolve(pkgRoot, 'scripts', 'generate-404.js');
181
+ if (existsSync(postbuild)) {
182
+ await import(`file://${postbuild}`);
183
+ }
185
184
 
186
185
  return;
187
186
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glossarist/concept-browser",
3
- "version": "0.7.43",
3
+ "version": "0.7.45",
4
4
  "description": "Vue SPA for browsing Glossarist terminology datasets with cross-reference resolution, graph visualization, and multi-language support",
5
5
  "type": "module",
6
6
  "bin": {
@@ -25,8 +25,9 @@
25
25
  "autoprefixer": "^10.4.21",
26
26
  "d3": "^7.9.0",
27
27
  "favicons": "^7.2.0",
28
- "glossarist": "^0.3.7",
28
+ "glossarist": "^0.4.0",
29
29
  "js-yaml": "^4.1.0",
30
+ "jszip": "^3.10.1",
30
31
  "pinia": "^2.3.1",
31
32
  "postcss": "^8.5.3",
32
33
  "sharp": "^0.34.5",
@@ -0,0 +1,105 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import os from 'os';
5
+ import { assertLocalPathSafe } from '../lib/local-path-safety.mjs';
6
+
7
/**
 * Create a fresh temporary directory tree for one test case.
 *
 * Layout under a unique `cb-fetch-*` directory in the OS temp dir:
 *   .datasets/                   (empty)
 *   source-data/concepts/a.yaml  (one small concept file)
 *
 * @returns {{ root: string, datasetsDir: string, sourceDir: string }}
 */
function makeTmpTree() {
  const prefix = path.join(os.tmpdir(), 'cb-fetch-');
  const root = fs.mkdtempSync(prefix);
  const tree = {
    root,
    datasetsDir: path.join(root, '.datasets'),
    sourceDir: path.join(root, 'source-data'),
  };

  const conceptsDir = path.join(tree.sourceDir, 'concepts');
  fs.mkdirSync(tree.datasetsDir);
  fs.mkdirSync(conceptsDir, { recursive: true });
  fs.writeFileSync(path.join(conceptsDir, 'a.yaml'), 'termid: 1\n');

  return tree;
}
16
+
17
describe('assertLocalPathSafe', () => {
  let tree;

  // Every case checks dataset id 'foo' against the per-test temp tree.
  const check = (localPath) =>
    assertLocalPathSafe('foo', localPath, {
      root: tree.root,
      datasetsDir: tree.datasetsDir,
    });

  // Creates .datasets/foo (plus optional sub-path) and returns .datasets/foo.
  const stageFoo = (...sub) => {
    const staged = path.join(tree.datasetsDir, 'foo');
    fs.mkdirSync(path.join(staged, ...sub), { recursive: true });
    return staged;
  };

  beforeEach(() => {
    tree = makeTmpTree();
  });

  afterEach(() => {
    fs.rmSync(tree.root, { recursive: true, force: true });
  });

  it('returns resolved path for a safe external location', () => {
    const resolved = check(tree.sourceDir);
    // The function returns the realpath (symlinks resolved). On macOS the
    // temp dir resolves /var → /private/var, so the expectation must be a
    // realpath too rather than a plain path.resolve.
    const expected = fs.realpathSync(path.resolve(tree.root, tree.sourceDir));
    expect(resolved).toBe(expected);
  });

  it('throws when localPath does not exist', () => {
    const missing = path.join(tree.root, 'nope');
    expect(() => check(missing)).toThrow(/does not exist/);
  });

  it('throws when localPath equals .datasets/<id>', () => {
    const staged = stageFoo();
    expect(() => check(staged)).toThrow(/same physical location/);
  });

  it('throws when localPath is nested inside .datasets/<id>', () => {
    const staged = stageFoo('subdir');
    expect(() => check(path.join(staged, 'subdir'))).toThrow(/nested inside/);
  });

  it('throws when localPath contains .datasets/<id> (parent-of-staging hazard)', () => {
    // localPath is the root itself while datasetsDir is root/.datasets, so
    // staging operations on .datasets/<id> would touch files inside localPath.
    expect(() => check(tree.root)).toThrow(/contains .datasets/);
  });

  it('throws when localPath is a symlink to .datasets/<id> (the reported bug)', () => {
    const staged = stageFoo();
    const symlinkPath = path.join(tree.root, 'evil-link');
    fs.symlinkSync(staged, symlinkPath);
    expect(() => check(symlinkPath)).toThrow(/same physical location/);
  });

  it('does NOT modify the source directory (regression for data-loss bug)', () => {
    const conceptsDir = path.join(tree.sourceDir, 'concepts');
    const sentinel = path.join(conceptsDir, 'SENTINEL.yaml');
    fs.writeFileSync(sentinel, 'termid: sentinel\n');
    const beforeMtime = fs.statSync(sentinel).mtimeMs;

    check(tree.sourceDir);

    // The safety check must leave the source directory completely untouched.
    expect(fs.existsSync(sentinel)).toBe(true);
    expect(fs.statSync(sentinel).mtimeMs).toBe(beforeMtime);
    expect(fs.readdirSync(conceptsDir)).toContain('SENTINEL.yaml');
  });
});
@@ -1,23 +1,26 @@
1
1
  #!/usr/bin/env node
2
2
  /**
3
- * fetch-datasets.mjs — Load datasets from .gcr files or clone source repos.
3
+ * fetch-datasets.mjs — Load datasets from .gcr files, local paths, or git repos.
4
4
  *
5
5
  * Reads site config (via load-site-config.mjs), for each dataset:
6
6
  * 1. If .gcr/{id}.gcr exists, extract to .datasets/{id}/
7
7
  * 2. Else download from gcrPackage URL and extract
8
- * 3. Else clone/update source repo into .datasets/{id}/
8
+ * 3. Else if localPath is set, use it in-place (NO copy, NO staging)
9
+ * 4. Else clone/update source repo into .datasets/{id}/
9
10
  *
10
11
  * After fetching, validates that all GCR dependencies are satisfiable
11
12
  * (either provided locally or routed externally).
12
13
  *
13
- * Supports localPath field in dataset config for local paths.
14
- * Supports GITHUB_TOKEN for private repos.
14
+ * No shell commands. All file ops use Node fs; ZIP uses JSZip; git uses
15
+ * execFileSync with array args (no shell interpolation).
15
16
  */
16
17
  import fs from 'fs';
17
18
  import path from 'path';
19
+ import JSZip from 'jszip';
18
20
  import { loadGcr } from 'glossarist';
19
- import { execSync } from 'child_process';
21
+ import { execFileSync } from 'child_process';
20
22
  import { loadSiteConfig } from './load-site-config.mjs';
23
+ import { assertLocalPathSafe } from './lib/local-path-safety.mjs';
21
24
 
22
25
  const ROOT = process.cwd();
23
26
  const DATASETS_DIR = path.join(ROOT, '.datasets');
@@ -39,23 +42,29 @@ async function downloadGcr(url, destPath) {
39
42
  console.log(` Saved to ${destPath} (${(buf.length / 1024).toFixed(0)} KB)`);
40
43
  }
41
44
 
42
// --- GCR extraction (pure JSZip; no shell, cross-platform) ---
/**
 * Extract a .gcr archive (a ZIP file) into targetDir using JSZip.
 *
 * Any existing targetDir is removed first, so the result contains only
 * what the archive provides. Directory entries are skipped; parent
 * directories are created on demand for each file entry.
 *
 * @param {string} gcrPath   Path of the .gcr file to read.
 * @param {string} targetDir Directory to (re)create and fill.
 * @returns {Promise<void>}
 * @throws {Error} When an entry would be written outside targetDir, or
 *   resolves to targetDir itself.
 */
async function extractGcr(gcrPath, targetDir) {
  const targetAbs = path.resolve(targetDir);

  // `force: true` already makes removal a no-op when the directory is missing.
  fs.rmSync(targetAbs, { recursive: true, force: true });
  fs.mkdirSync(targetAbs, { recursive: true });

  const zip = await JSZip.loadAsync(fs.readFileSync(gcrPath));

  for (const entry of Object.values(zip.files)) {
    if (entry.dir) continue;

    // zip-slip guard: the destination must be strictly inside targetAbs.
    // path.relative (instead of a string-prefix test) also behaves when
    // targetAbs is a filesystem root or the paths sit on different drives.
    const dest = path.resolve(targetAbs, entry.name);
    const rel = path.relative(targetAbs, dest);
    const escapes =
      rel === '' ||
      rel === '..' ||
      rel.startsWith(`..${path.sep}`) ||
      path.isAbsolute(rel);
    if (escapes) {
      throw new Error(`Refusing to extract entry outside target dir: ${entry.name}`);
    }

    fs.mkdirSync(path.dirname(dest), { recursive: true });
    fs.writeFileSync(dest, await entry.async('nodebuffer'));
  }

  console.log(` Extracted to ${targetAbs}`);
}
60
69
 
61
70
  // --- Read GCR metadata from ZIP without extraction ---
@@ -96,7 +105,7 @@ function validateDependencies(config, gcrMetadata) {
96
105
  return errors;
97
106
  }
98
107
 
99
- // --- Git operations ---
108
+ // --- Git operations (execFileSync with array args — no shell) ---
100
109
  function cloneOrUpdate(sourceRepo, targetDir) {
101
110
  const env = { ...process.env };
102
111
  let repoUrl = sourceRepo;
@@ -104,24 +113,28 @@ function cloneOrUpdate(sourceRepo, targetDir) {
104
113
  repoUrl = sourceRepo.replace('https://', `https://x-access-token:${env.GITHUB_TOKEN}@`);
105
114
  }
106
115
 
107
- if (fs.existsSync(path.join(targetDir, '.git'))) {
116
+ const targetAbs = path.resolve(targetDir);
117
+
118
+ if (fs.existsSync(path.join(targetAbs, '.git'))) {
108
119
  console.log(` Updating existing clone...`);
109
120
  try {
110
- execSync('git fetch origin', { cwd: targetDir, stdio: 'pipe', env });
111
- execSync('git reset --hard origin/HEAD', { cwd: targetDir, stdio: 'pipe', env });
112
- execSync('git clean -fd', { cwd: targetDir, stdio: 'pipe', env });
121
+ execFileSync('git', ['fetch', 'origin'], { cwd: targetAbs, stdio: 'pipe', env });
122
+ execFileSync('git', ['reset', '--hard', 'origin/HEAD'], { cwd: targetAbs, stdio: 'pipe', env });
123
+ execFileSync('git', ['clean', '-fd'], { cwd: targetAbs, stdio: 'pipe', env });
113
124
  } catch {
114
125
  console.warn(` git update failed, re-cloning`);
115
- fs.rmSync(targetDir, { recursive: true, force: true });
116
- execSync(`git clone --depth 1 "${repoUrl}" "${targetDir}"`, { stdio: 'pipe', env });
126
+ fs.rmSync(targetAbs, { recursive: true, force: true });
127
+ execFileSync('git', ['clone', '--depth', '1', repoUrl, targetAbs], { stdio: 'pipe', env });
117
128
  }
118
129
  } else {
119
- fs.mkdirSync(targetDir, { recursive: true });
130
+ fs.mkdirSync(targetAbs, { recursive: true });
120
131
  console.log(` Cloning ${sourceRepo}...`);
121
- execSync(`git clone --depth 1 "${repoUrl}" "${targetDir}"`, { stdio: 'pipe', env });
132
+ execFileSync('git', ['clone', '--depth', '1', repoUrl, targetAbs], { stdio: 'pipe', env });
122
133
  }
123
134
  }
124
135
 
136
+ // --- localPath safety check: see scripts/lib/local-path-safety.mjs ---
137
+
125
138
  // --- Main ---
126
139
  console.log('Fetching glossarist datasets...\n');
127
140
 
@@ -137,7 +150,7 @@ for (const ds of config.datasets) {
137
150
  try {
138
151
  if (fs.existsSync(gcrPath)) {
139
152
  console.log(` Using local .gcr/${ds.id}.gcr`);
140
- extractGcr(gcrPath, targetDir);
153
+ await extractGcr(gcrPath, targetDir);
141
154
  } else if (ds.gcrPackage) {
142
155
  console.log(` Using GCR package: ${ds.gcrPackage}`);
143
156
  try {
@@ -148,29 +161,18 @@ for (const ds of config.datasets) {
148
161
  console.log();
149
162
  continue;
150
163
  }
151
- extractGcr(gcrPath, targetDir);
164
+ await extractGcr(gcrPath, targetDir);
165
+ } else if (ds.localPath) {
166
+ // localPath means "data is here, use in-place." No copy, no staging.
167
+ // generate-data.mjs reads from localPath directly via datasetDir(ds).
168
+ const localResolved = assertLocalPathSafe(ds.id, ds.localPath);
169
+ console.log(` Using localPath in-place: ${localResolved}`);
170
+ } else if (ds.sourceRepo) {
171
+ cloneOrUpdate(ds.sourceRepo, targetDir);
152
172
  } else {
153
- const envOverride = ds.localPath;
154
- if (envOverride) {
155
- console.log(` Using local path: ${envOverride}`);
156
- if (!fs.existsSync(targetDir)) fs.mkdirSync(targetDir, { recursive: true });
157
- const localConcepts = path.join(envOverride, 'concepts');
158
- const targetConcepts = path.join(targetDir, 'concepts');
159
- if (fs.existsSync(localConcepts)) {
160
- if (fs.existsSync(targetConcepts)) fs.rmSync(targetConcepts, { recursive: true, force: true });
161
- execSync(`cp -r "${localConcepts}" "${targetConcepts}"`, { stdio: 'pipe' });
162
- }
163
- const registerYaml = path.join(envOverride, 'register.yaml');
164
- if (fs.existsSync(registerYaml)) {
165
- fs.copyFileSync(registerYaml, path.join(targetDir, 'register.yaml'));
166
- }
167
- } else if (ds.sourceRepo) {
168
- cloneOrUpdate(ds.sourceRepo, targetDir);
169
- } else {
170
- console.warn(` No source configured, skipping`);
171
- console.log();
172
- continue;
173
- }
173
+ console.warn(` No source configured, skipping`);
174
+ console.log();
175
+ continue;
174
176
  }
175
177
 
176
178
  // Read metadata for dependency validation (from GCR ZIP, not extracted dir)
@@ -4,11 +4,25 @@ import yaml from 'js-yaml';
4
4
  import { naturalSort, Register, parseMention } from 'glossarist';
5
5
  import { loadSiteConfig } from './load-site-config.mjs';
6
6
  import { getGroups } from './lib/concept-groups.mjs';
7
+ import { consumeDatasetEntities } from './lib/build/non-verbal-consumer.mjs';
8
+ import { copyImageAssets } from './lib/build/image-assets.mjs';
7
9
  const __dirname = path.dirname(new URL(import.meta.url).pathname);
8
10
  const ROOT = process.cwd();
9
11
  const PUBLIC = path.join(ROOT, 'public');
10
12
  const DATA = path.join(PUBLIC, 'data');
11
13
 
14
/**
 * Locate the directory a dataset's source files are read from.
 *
 * - A dataset with a truthy `localPath` is read in place: the path is
 *   resolved against ROOT and nothing is copied or staged.
 *   fetch-datasets.mjs is the step that checks the path is safe.
 * - Any other dataset is read from the staging dir `.datasets/<id>/`
 *   under ROOT.
 *
 * @param {{ id: string, localPath?: string }} ds Dataset config entry.
 * @returns {string} Absolute directory path.
 */
function datasetDir(ds) {
  if (ds.localPath) {
    return path.resolve(ROOT, ds.localPath);
  }
  return path.join(ROOT, '.datasets', ds.id);
}
25
+
12
26
  const DS_PALETTE = [
13
27
  '#3366ff', '#0d9488', '#d97706', '#8b5cf6',
14
28
  '#ec4899', '#059669', '#dc2626', '#6366f1',
@@ -934,7 +948,8 @@ function processDataset(dir, register, opts) {
934
948
  }
935
949
 
936
950
  // Copy bulk format files from compiled/ directory (full GCR)
937
- const compiledDir = path.join(ROOT, '.datasets', register, 'compiled');
951
+ const sourceRoot = path.dirname(dir);
952
+ const compiledDir = path.join(sourceRoot, 'compiled');
938
953
  const bulkFormats = [];
939
954
  if (fs.existsSync(compiledDir)) {
940
955
  for (const file of fs.readdirSync(compiledDir)) {
@@ -993,27 +1008,34 @@ function processDataset(dir, register, opts) {
993
1008
  writeJson(path.join(DATA, register, 'manifest.json'), manifest);
994
1009
 
995
1010
  // Copy bibliography.yaml → bibliography.json
996
- const bibPath = path.join(ROOT, '.datasets', register, 'bibliography.yaml');
1011
+ const bibPath = path.join(sourceRoot, 'bibliography.yaml');
997
1012
  if (fs.existsSync(bibPath)) {
998
1013
  const bibData = readYaml(bibPath);
999
1014
  writeJson(path.join(DATA, register, 'bibliography.json'), bibData);
1000
- console.log(` Copied bibliography (${Object.keys(bibData).length} entries)`);
1015
+ const bibCount = Array.isArray(bibData?.bibliography) ? bibData.bibliography.length : 0;
1016
+ console.log(` Copied bibliography (${bibCount} entries)`);
1001
1017
  }
1002
1018
 
1003
- // Copy images/
1004
- const imagesSrcDir = path.join(ROOT, '.datasets', register, 'images');
1019
+ // Copy images/ with magic-byte validation + manifest emission.
1020
+ const imagesSrcDir = path.join(sourceRoot, 'images');
1005
1021
  if (fs.existsSync(imagesSrcDir) && fs.statSync(imagesSrcDir).isDirectory()) {
1006
1022
  const imagesDestDir = path.join(DATA, register, 'images');
1007
- fs.mkdirSync(imagesDestDir, { recursive: true });
1008
- let imgCount = 0;
1009
- for (const file of fs.readdirSync(imagesSrcDir)) {
1010
- const src = path.join(imagesSrcDir, file);
1011
- if (fs.statSync(src).isFile()) {
1012
- fs.copyFileSync(src, path.join(imagesDestDir, file));
1013
- imgCount++;
1014
- }
1023
+ const result = await copyImageAssets(imagesSrcDir, imagesDestDir);
1024
+ console.log(` Copied ${result.count} images (skipped ${result.skipped.length})`);
1025
+ for (const w of result.skipped) {
1026
+ console.warn(` Warning: skipped image ${w}`);
1027
+ }
1028
+ }
1029
+
1030
+ // Consume non-verbal entities (figures/tables/formulas) — JSON-LD preferred,
1031
+ // YAML fallback. Writes per-entity JSON + indexes.
1032
+ const nvResult = await consumeDatasetEntities(sourceRoot, path.join(DATA, register));
1033
+ const nvTotal = nvResult.figures + nvResult.tables + nvResult.formulas;
1034
+ if (nvTotal > 0) {
1035
+ console.log(` Consumed ${nvResult.figures} figures, ${nvResult.tables} tables, ${nvResult.formulas} formulas`);
1036
+ for (const w of nvResult.warnings) {
1037
+ console.warn(` Warning: ${w}`);
1015
1038
  }
1016
- console.log(` Copied ${imgCount} images`);
1017
1039
  }
1018
1040
 
1019
1041
  console.log(` Generated ${concepts.length} concepts, manifest, ${chunks.length} index chunks`);
@@ -1030,8 +1052,8 @@ const registerCache = {};
1030
1052
 
1031
1053
  // Pre-load all register.yaml files (needed before buildRefMaps for URI pattern indexing)
1032
1054
  for (const ds of config.datasets) {
1033
- const registerDir = path.join(ROOT, '.datasets', ds.id);
1034
- const registerYamlPath = path.join(registerDir, 'register.yaml');
1055
+ const dsDir = datasetDir(ds);
1056
+ const registerYamlPath = path.join(dsDir, 'register.yaml');
1035
1057
  if (fs.existsSync(registerYamlPath)) {
1036
1058
  try {
1037
1059
  const raw = yaml.load(fs.readFileSync(registerYamlPath, 'utf8'));
@@ -1047,7 +1069,7 @@ const refMaps = buildRefMaps(config, registerCache);
1047
1069
  for (let i = 0; i < config.datasets.length; i++) {
1048
1070
  const ds = config.datasets[i];
1049
1071
 
1050
- const dir = path.join(ROOT, '.datasets', ds.id, 'concepts');
1072
+ const dir = path.join(datasetDir(ds), 'concepts');
1051
1073
  if (!fs.existsSync(dir)) {
1052
1074
  console.warn(`Skipping ${ds.id}: source directory not found (${dir})`);
1053
1075
  console.warn(` Run: npm run fetch-datasets`);
@@ -1089,8 +1111,8 @@ for (let i = 0; i < config.datasets.length; i++) {
1089
1111
  status: ds.editionStatus || reg?.status,
1090
1112
  ordering: reg?.ordering || null,
1091
1113
  sections: reg?.sections ? reg.sections.map(s => s.toJSON()) : [],
1092
- hasBibliography: fs.existsSync(path.join(ROOT, '.datasets', ds.id, 'bibliography.yaml')),
1093
- hasImages: fs.existsSync(path.join(ROOT, '.datasets', ds.id, 'images')),
1114
+ hasBibliography: fs.existsSync(path.join(datasetDir(ds), 'bibliography.yaml')),
1115
+ hasImages: fs.existsSync(path.join(datasetDir(ds), 'images')),
1094
1116
  });
1095
1117
  registry.push({
1096
1118
  id: ds.id,
@@ -0,0 +1,190 @@
1
+ /**
2
+ * Image asset pipeline — pure Node, no shell commands.
3
+ *
4
+ * Copies image bytes from the source directory to the runtime data
5
+ * directory, validates magic bytes for known formats, parses intrinsic
6
+ * dimensions for raster formats, and emits `images-manifest.json` with
7
+ * SHA-256 hashes + dimensions.
8
+ *
9
+ * If the source directory contains its own `manifest.json`, that file is
10
+ * reused as the SSOT (single source of truth) and only missing entries
11
+ * are computed. This lets glossarist-ruby ship a manifest if it wants to.
12
+ */
13
+
14
+ import { createHash } from 'node:crypto';
15
+ import { readFile, writeFile, mkdir, readdir, stat, copyFile } from 'node:fs/promises';
16
+ import path from 'node:path';
17
+
18
+ /** @typedef {{ src: string, format: string, sha256: string, width?: number, height?: number, bytes: number }} ImageManifestEntry */
19
+
20
+ export const SUPPORTED_FORMATS = new Set(['svg', 'png', 'jpg', 'jpeg', 'gif', 'webp', 'avif']);
21
+
22
+ const MAGIC_BYTES = {
23
+ png: [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A],
24
+ jpg: [0xFF, 0xD8, 0xFF],
25
+ gif: [0x47, 0x49, 0x46, 0x38],
26
+ webp: [0x52, 0x49, 0x46, 0x46], // "RIFF" — full match needs bytes 8-11 = "WEBP"
27
+ };
28
+
29
+ const FILENAME_RE = /^[a-z0-9._-]+$/i;
30
+
31
/**
 * Validate the file's leading bytes against its declared extension.
 *
 * For svg, png, jpg/jpeg, gif and webp the content must match the
 * extension, otherwise null is returned. Previously a mismatching raster
 * file fell through to the "accept on trust" rule below and was reported
 * as valid, so validation could never reject it.
 *
 * Any other extension listed in SUPPORTED_FORMATS (currently avif) has no
 * signature check here and is accepted on trust; authors are responsible.
 *
 * @param {Buffer} buf File contents.
 * @param {string} ext Extension without the dot, any case.
 * @returns {string | null} Canonical format ("jpg" for ".jpeg"), or null
 *   when the extension is unsupported or the content does not match it.
 */
export function detectFormat(buf, ext) {
  const e = ext.toLowerCase();
  const startsWith = (signature) =>
    buf.length >= signature.length && signature.every((byte, i) => buf[i] === byte);

  switch (e) {
    case 'svg':
      return buf.includes('<svg') || buf.includes('<SVG') ? 'svg' : null;
    case 'png':
      return startsWith(MAGIC_BYTES.png) ? 'png' : null;
    case 'jpg':
    case 'jpeg':
      return startsWith(MAGIC_BYTES.jpg) ? 'jpg' : null;
    case 'gif':
      return startsWith(MAGIC_BYTES.gif) ? 'gif' : null;
    case 'webp': {
      // RIFF container: "RIFF" at 0-3, then the form type "WEBP" at 8-11.
      const isWebp =
        buf.length >= 12 &&
        startsWith(MAGIC_BYTES.webp) &&
        buf.toString('ascii', 8, 12) === 'WEBP';
      return isWebp ? 'webp' : null;
    }
    default:
      // Formats without a signature check — accept on trust.
      return SUPPORTED_FORMATS.has(e) ? e : null;
  }
}
56
+
57
/**
 * Parse intrinsic pixel dimensions for PNG and JPEG data.
 *
 * Returns an empty object for every other format, and whenever the
 * dimensions cannot be located. WebP/AVIF parsing is intentionally
 * V1-light — authors declare via YAML.
 *
 * @param {Buffer} buf    File contents.
 * @param {string} format Canonical format name ("png", "jpg"; "jpeg" is
 *   also accepted).
 * @returns {{ width?: number, height?: number }}
 */
export function readIntrinsicDimensions(buf, format) {
  if (format === 'png') {
    // Layout: 8-byte signature, 4-byte chunk length, 4-byte chunk type,
    // then width and height as big-endian uint32. Only trust those bytes
    // when the first chunk really is IHDR; some producers place another
    // chunk first, in which case bytes 16-23 are not the dimensions.
    if (buf.length >= 24 && buf.toString('ascii', 12, 16) === 'IHDR') {
      return { width: buf.readUInt32BE(16), height: buf.readUInt32BE(20) };
    }
    return {};
  }
  if (format === 'jpg' || format === 'jpeg') {
    return readJpegDimensions(buf);
  }
  return {};
}

/**
 * Walk JPEG marker segments until the first frame header (SOFn,
 * 0xFFC0–0xFFCF excluding DHT 0xC4, JPG 0xC8 and DAC 0xCC) and read the
 * frame size from it.
 *
 * Stops with an empty result at SOS/EOI, at a segment length below 2, or
 * when the buffer ends before a complete frame header.
 *
 * @param {Buffer} buf
 * @returns {{ width?: number, height?: number }}
 */
function readJpegDimensions(buf) {
  let i = 2; // skip the SOI marker

  while (i + 4 <= buf.length) {
    if (buf[i] !== 0xFF) { i += 1; continue; }

    const marker = buf[i + 1];

    // 0xFF fill bytes may precede a marker; step one byte so the next
    // iteration sees the real marker code.
    if (marker === 0xFF) { i += 1; continue; }

    // Markers that carry no length field: stuffed zero, TEM, RST0-7, SOI.
    if (marker === 0x00 || marker === 0x01 || (marker >= 0xD0 && marker <= 0xD8)) {
      i += 2;
      continue;
    }

    // EOI ends the image and SOS starts entropy-coded data, which is not a
    // sequence of length-prefixed segments; no frame header to find here.
    if (marker === 0xD9 || marker === 0xDA) break;

    const segLen = buf.readUInt16BE(i + 2);
    if (segLen < 2) break; // the length includes its own two bytes

    const isFrameHeader =
      marker >= 0xC0 && marker <= 0xCF &&
      marker !== 0xC4 && marker !== 0xC8 && marker !== 0xCC;
    if (isFrameHeader) {
      // marker(2) length(2) precision(1) height(2) width(2)
      if (i + 9 > buf.length) break;
      return { width: buf.readUInt16BE(i + 7), height: buf.readUInt16BE(i + 5) };
    }

    i += 2 + segLen;
  }

  return {};
}
92
+
93
/**
 * Normalize an image filename to the wire-format alphabet `[a-z0-9._-]`.
 *
 * Steps, in order: lowercase the name, turn every character outside the
 * alphabet into `-`, collapse runs of `-` into one, and strip `-` from
 * both ends. The result may be an empty string.
 *
 * @param {string} name
 * @returns {string}
 */
export function sanitizeImageFilename(name) {
  return name
    .toLowerCase()
    .replace(/[^a-z0-9._-]/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-+|-+$/g, '');
}
106
+
107
/**
 * Hex-encoded SHA-256 digest of the given bytes.
 *
 * @param {Buffer} buf
 * @returns {string}
 */
function sha256(buf) {
  const hasher = createHash('sha256');
  hasher.update(buf);
  return hasher.digest('hex');
}
110
+
111
/**
 * Copy image assets from srcDir into destDir, validate them, and write
 * `images-manifest.json` into destDir's parent directory.
 *
 * For each regular file in srcDir (other than `manifest.json`):
 *  - the name is sanitized; names that are still invalid, or whose
 *    extension is not in SUPPORTED_FORMATS, are skipped;
 *  - a file whose sanitized name was already taken by another file in
 *    this run is skipped and reported, instead of silently overwriting the
 *    earlier copy and being counted twice;
 *  - content failing detectFormat() is skipped as "(format mismatch)";
 *  - otherwise the file is copied and a manifest entry recorded. When
 *    srcDir/manifest.json has an object entry for the sanitized name, that
 *    entry is reused (with `src` and `format` overridden); otherwise hash,
 *    size and dimensions are computed.
 *
 * Files are processed in a fixed order — names already in sanitized form
 * first, then by name — so the manifest's key order and the winner of a
 * name collision do not depend on directory enumeration order.
 *
 * @param {string} srcDir  Absolute path to source images/
 * @param {string} destDir Absolute path to public/data/{ds}/images/
 * @returns {Promise<{ count: number, manifest: Record<string, ImageManifestEntry>, skipped: string[] }>}
 */
export async function copyImageAssets(srcDir, destDir) {
  await mkdir(destDir, { recursive: true });

  // Reuse upstream manifest if present (SSOT). Best-effort: a missing,
  // unreadable, unparsable or non-object manifest means "compute fresh".
  const upstreamManifestPath = path.join(srcDir, 'manifest.json');
  let upstream = null;
  try {
    const st = await stat(upstreamManifestPath);
    if (st.isFile()) {
      const parsed = JSON.parse(await readFile(upstreamManifestPath, 'utf8'));
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        upstream = parsed;
      }
    }
  } catch { /* no usable upstream manifest — compute fresh */ }

  const candidates = (await readdir(srcDir, { withFileTypes: true }))
    .filter((entry) => entry.isFile() && entry.name !== 'manifest.json')
    .map((entry) => ({ name: entry.name, safeName: sanitizeImageFilename(entry.name) }))
    .sort((a, b) => {
      const aCanonical = a.name === a.safeName;
      const bCanonical = b.name === b.safeName;
      if (aCanonical !== bCanonical) return aCanonical ? -1 : 1;
      if (a.name === b.name) return 0;
      return a.name < b.name ? -1 : 1;
    });

  /** @type {Record<string, ImageManifestEntry>} */
  const manifest = {};
  const skipped = [];
  let count = 0;

  for (const { name, safeName } of candidates) {
    if (!FILENAME_RE.test(safeName)) {
      skipped.push(name);
      continue;
    }
    const ext = path.extname(safeName).slice(1);
    if (!ext || !SUPPORTED_FORMATS.has(ext)) {
      skipped.push(name);
      continue;
    }
    if (Object.hasOwn(manifest, safeName)) {
      skipped.push(`${name} (name collision: ${safeName} already copied)`);
      continue;
    }

    const src = path.join(srcDir, name);
    const dest = path.join(destDir, safeName);
    const buf = await readFile(src);

    const format = detectFormat(buf, ext);
    if (!format) {
      skipped.push(`${name} (format mismatch)`);
      continue;
    }

    await copyFile(src, dest);

    // Own-property + object check: never pick up inherited properties or
    // spread a non-object value from the upstream manifest.
    const upstreamEntry =
      upstream !== null && Object.hasOwn(upstream, safeName) ? upstream[safeName] : null;

    /** @type {ImageManifestEntry} */
    let record;
    if (upstreamEntry !== null && typeof upstreamEntry === 'object') {
      record = { ...upstreamEntry, src: safeName, format };
    } else {
      record = {
        src: safeName,
        format,
        sha256: sha256(buf),
        bytes: buf.length,
        ...readIntrinsicDimensions(buf, format),
      };
    }
    manifest[safeName] = record;
    count++;
  }

  // destDir was created above, so its parent exists as well.
  await writeFile(
    path.join(destDir, '..', 'images-manifest.json'),
    JSON.stringify(manifest, null, 2),
  );

  return { count, manifest, skipped };
}