docsgov 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/README.md +242 -0
  2. package/dist/apispec/apispec.js +401 -0
  3. package/dist/apispec/apispec.test.js +444 -0
  4. package/dist/apispec/errors.js +17 -0
  5. package/dist/apispec/index.js +2 -0
  6. package/dist/check/doclinks.js +167 -0
  7. package/dist/check/index.js +8 -0
  8. package/dist/check/run.js +391 -0
  9. package/dist/check/run.test.js +513 -0
  10. package/dist/check/suggest.js +134 -0
  11. package/dist/check/suggest.test.js +92 -0
  12. package/dist/check/tokens.js +125 -0
  13. package/dist/cmd/main.js +330 -0
  14. package/dist/cmd/main.test.js +422 -0
  15. package/dist/codeq/cache.js +71 -0
  16. package/dist/codeq/cache.test.js +67 -0
  17. package/dist/codeq/errors.js +52 -0
  18. package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
  19. package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
  20. package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
  21. package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
  22. package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
  23. package/dist/codeq/index.js +11 -0
  24. package/dist/codeq/resolve.test.js +109 -0
  25. package/dist/codeq/resolver.js +128 -0
  26. package/dist/codeq/resolver.test.js +124 -0
  27. package/dist/codeq/resolvers/go.js +242 -0
  28. package/dist/codeq/resolvers/go.test.js +143 -0
  29. package/dist/codeq/resolvers/java.js +349 -0
  30. package/dist/codeq/resolvers/java.test.js +138 -0
  31. package/dist/codeq/resolvers/java_queries.js +63 -0
  32. package/dist/codeq/resolvers/javascript.js +412 -0
  33. package/dist/codeq/resolvers/javascript.test.js +125 -0
  34. package/dist/codeq/resolvers/javascript_queries.js +46 -0
  35. package/dist/codeq/resolvers/typescript.js +366 -0
  36. package/dist/codeq/resolvers/typescript.test.js +180 -0
  37. package/dist/codeq/resolvers/typescript_queries.js +78 -0
  38. package/dist/codeq/signature.js +50 -0
  39. package/dist/codeq/signature.test.js +50 -0
  40. package/dist/codeq/suggest.js +96 -0
  41. package/dist/codeq/treesitter.js +122 -0
  42. package/dist/codeq/treesitter.test.js +118 -0
  43. package/dist/config/config.js +74 -0
  44. package/dist/config/config.test.js +98 -0
  45. package/dist/config/fs.js +116 -0
  46. package/dist/config/glob.js +82 -0
  47. package/dist/config/glob.test.js +61 -0
  48. package/dist/config/index.js +4 -0
  49. package/dist/dedup/analyzer/analyzer.js +533 -0
  50. package/dist/dedup/analyzer/analyzer.test.js +530 -0
  51. package/dist/dedup/analyzer/canonical.js +74 -0
  52. package/dist/dedup/analyzer/canonical.test.js +70 -0
  53. package/dist/dedup/analyzer/cosine_clusters.js +169 -0
  54. package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
  55. package/dist/dedup/analyzer/distinctive.js +85 -0
  56. package/dist/dedup/analyzer/distinctive.test.js +49 -0
  57. package/dist/dedup/analyzer/exact_clusters.js +63 -0
  58. package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
  59. package/dist/dedup/analyzer/index.js +14 -0
  60. package/dist/dedup/analyzer/multiplicity.js +110 -0
  61. package/dist/dedup/analyzer/multiplicity.test.js +123 -0
  62. package/dist/dedup/analyzer/order.js +22 -0
  63. package/dist/dedup/analyzer/partial_overlaps.js +65 -0
  64. package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
  65. package/dist/dedup/analyzer/preview.js +84 -0
  66. package/dist/dedup/analyzer/preview.test.js +46 -0
  67. package/dist/dedup/analyzer/safety.js +27 -0
  68. package/dist/dedup/analyzer/safety.test.js +39 -0
  69. package/dist/dedup/config.js +18 -0
  70. package/dist/dedup/configload.js +299 -0
  71. package/dist/dedup/configload.test.js +410 -0
  72. package/dist/dedup/dedup.index.test.js +203 -0
  73. package/dist/dedup/dedup.js +143 -0
  74. package/dist/dedup/dedup.test.js +212 -0
  75. package/dist/dedup/dedupcfg/config.js +112 -0
  76. package/dist/dedup/dedupcfg/config.test.js +70 -0
  77. package/dist/dedup/dedupcfg/index.js +1 -0
  78. package/dist/dedup/deduptypes/index.js +1 -0
  79. package/dist/dedup/deduptypes/types.js +9 -0
  80. package/dist/dedup/deduptypes/types.test.js +34 -0
  81. package/dist/dedup/embedder/cache.js +23 -0
  82. package/dist/dedup/embedder/cache.test.js +50 -0
  83. package/dist/dedup/embedder/constants.js +10 -0
  84. package/dist/dedup/embedder/embedder.js +76 -0
  85. package/dist/dedup/embedder/embedder.mock.test.js +128 -0
  86. package/dist/dedup/embedder/embedder.test.js +96 -0
  87. package/dist/dedup/embedder/errors.js +20 -0
  88. package/dist/dedup/embedder/errors.test.js +35 -0
  89. package/dist/dedup/embedder/index.js +4 -0
  90. package/dist/dedup/embedder/session.js +78 -0
  91. package/dist/dedup/embedder/session.test.js +172 -0
  92. package/dist/dedup/gitignore.js +97 -0
  93. package/dist/dedup/gitignore.test.js +98 -0
  94. package/dist/dedup/index.js +11 -0
  95. package/dist/dedup/indexdb/errors.js +48 -0
  96. package/dist/dedup/indexdb/index.js +6 -0
  97. package/dist/dedup/indexdb/indexdb.js +302 -0
  98. package/dist/dedup/indexdb/indexdb.test.js +739 -0
  99. package/dist/dedup/indexdb/load.js +110 -0
  100. package/dist/dedup/indexdb/migrations.js +58 -0
  101. package/dist/dedup/indexdb/schema.js +83 -0
  102. package/dist/dedup/indexer/index.js +9 -0
  103. package/dist/dedup/indexer/indexer.js +501 -0
  104. package/dist/dedup/indexer/indexer.test.js +510 -0
  105. package/dist/dedup/indexer/links.js +89 -0
  106. package/dist/dedup/mdsection/anchor.js +60 -0
  107. package/dist/dedup/mdsection/anchor.test.js +39 -0
  108. package/dist/dedup/mdsection/blocks.js +409 -0
  109. package/dist/dedup/mdsection/blocks.test.js +359 -0
  110. package/dist/dedup/mdsection/index.js +4 -0
  111. package/dist/dedup/mdsection/parse.js +21 -0
  112. package/dist/dedup/mdsection/section.js +234 -0
  113. package/dist/dedup/mdsection/section.test.js +221 -0
  114. package/dist/dedup/report/floatfmt.js +71 -0
  115. package/dist/dedup/report/floatfmt.test.js +42 -0
  116. package/dist/dedup/report/index.js +8 -0
  117. package/dist/dedup/report/quote.js +77 -0
  118. package/dist/dedup/report/quote.test.js +67 -0
  119. package/dist/dedup/report/text.js +251 -0
  120. package/dist/dedup/report/text.test.js +420 -0
  121. package/dist/dedup/report_types.js +8 -0
  122. package/dist/dedup/sectionid/index.js +1 -0
  123. package/dist/dedup/sectionid/sectionid.js +16 -0
  124. package/dist/dedup/sectionid/sectionid.test.js +49 -0
  125. package/dist/guard/api/errors.js +12 -0
  126. package/dist/guard/api/index.js +2 -0
  127. package/dist/guard/api/parser.js +81 -0
  128. package/dist/guard/api/parser.test.js +58 -0
  129. package/dist/guard/api/types.js +1 -0
  130. package/dist/guard/code/errors.js +16 -0
  131. package/dist/guard/code/index.js +2 -0
  132. package/dist/guard/code/parser.js +54 -0
  133. package/dist/guard/code/parser.test.js +111 -0
  134. package/dist/guard/code/types.js +6 -0
  135. package/dist/index.js +1 -0
  136. package/dist/index.test.js +5 -0
  137. package/dist/repo/boundary.js +92 -0
  138. package/dist/repo/boundary.test.js +65 -0
  139. package/dist/repo/errors.js +56 -0
  140. package/dist/repo/errors.test.js +85 -0
  141. package/dist/repo/exists.test.js +72 -0
  142. package/dist/repo/filename.js +46 -0
  143. package/dist/repo/filename.test.js +39 -0
  144. package/dist/repo/fs.js +53 -0
  145. package/dist/repo/index.js +7 -0
  146. package/dist/repo/overlay.js +36 -0
  147. package/dist/repo/overlay.test.js +80 -0
  148. package/dist/repo/repo.js +353 -0
  149. package/dist/repo/repo.test.js +255 -0
  150. package/dist/repo/testutil.js +27 -0
  151. package/dist/repo/write.test.js +125 -0
  152. package/dist/report/color.js +73 -0
  153. package/dist/report/index.js +1 -0
  154. package/dist/report/report.js +112 -0
  155. package/dist/report/report.test.js +368 -0
  156. package/dist/violation/index.js +1 -0
  157. package/dist/violation/types.js +22 -0
  158. package/dist/violation/types.test.js +70 -0
  159. package/package.json +48 -0
@@ -0,0 +1,39 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { defaultConfig } from "../dedupcfg/index.js";
3
+ import { findDifferentiators } from "./safety.js";
4
+ // Differentiators are the L4 safety net: when two near-duplicate sections
5
+ // disagree on a load-bearing axis (sync vs async, create vs cancel, ...) the
6
+ // dedup tool must NOT auto-merge them. These tests pin WHY each branch of the
7
+ // cross-side substring contract fires or stays silent — a drift here would let
8
+ // the tool recommend merging genuinely-different sections.
9
describe("findDifferentiators", () => {
  // Differentiator pairs taken from the default config, read once for the suite.
  const pairs = defaultConfig().Analyzer.differentiators;
  // Thin wrapper so each case only states its two input texts.
  const run = (left, right) => findDifferentiators(left, right, pairs);

  it("detects a pair when A has the left term and B has the right term", () => {
    const found = run(
      "This section explains sync operations in detail.",
      "This section explains async operations in detail.",
    );
    expect(found.length).toBeGreaterThan(0);
    // The reason tag must name both sides so reviewers know what differs.
    const namesBothSides = found.some((tag) => tag.includes("sync") && tag.includes("async"));
    expect(namesBothSides).toBe(true);
  });

  it("fires on a cross-side match (A=sync, B=async)", () => {
    const found = run("Both sync and async operations.", "Configuration of async jobs.");
    expect(found.length).toBeGreaterThan(0);
  });

  it("does NOT fire when both sides mention only the same term", () => {
    // WHY: a hit needs one term on one side AND its partner on the OTHER side.
    // Two sections that are both about "sync" do not satisfy that contract.
    const found = run("This section explains sync operations.", "Configuration of sync workers.");
    expect(found).toHaveLength(0);
  });

  it("does NOT fire on unrelated text with no differentiator terms", () => {
    const found = run(
      "Overview of request handling middleware.",
      "Configuration and request lifecycle overview.",
    );
    expect(found).toHaveLength(0);
  });
});
@@ -0,0 +1,18 @@
1
+ // Top-level config facade for the dedup subsystem.
2
+ //
3
+ // Ported from internal/dedup/config.go. The Go facade re-exported the Config
4
+ // types (as aliases) and Default() from the leaf dedupcfg package to break the
5
+ // import cycle: internal/dedup/indexer imports dedupcfg directly, while the
6
+ // facade re-exports for ergonomic callers. The same re-export holds here.
7
+ //
8
+ // SectionID (Go's re-export of sectionid.Derive) is intentionally NOT re-exported
9
+ // here: it has no caller in the TS tree and the sectionid leaf is imported
10
+ // directly where needed, so re-adding it would be speculative.
11
+ import { defaultConfig } from "./dedupcfg/index.js";
12
/**
 * Default returns the default dedup configuration (Go: Default()).
 *
 * It is a thin facade: the value comes straight from defaultConfig() in the
 * dedupcfg leaf module and is handed back without modification.
 *
 * @returns {object} the object produced by defaultConfig().
 */
export function Default() {
  const cfg = defaultConfig();
  return cfg;
}
@@ -0,0 +1,299 @@
1
+ // Loads and validates the dedup config: Default() overlaid with any
2
+ // .docgov/dedup/config.yml present at repoRoot.
3
+ //
4
+ // Ported from internal/dedup/configload.go. Go used yaml.Unmarshal(data, &cfg),
5
+ // which sets only the fields present in the YAML (scalars override, list fields
6
+ // replace) and returns *yaml.TypeError on a type mismatch. The `yaml` package
7
+ // here parses into plain JS values without that struct-aware overlay, so the
8
+ // overlay + per-field type checking is done explicitly below. A type mismatch
9
+ // throws YAMLTypeError so callers can match it (Go's errors.As against
10
+ // *yaml.TypeError → instanceof YAMLTypeError here, per the Go-sentinel→subclass
11
+ // porting rule).
12
+ //
13
+ // Two divergences from Go that the *_test cases pin:
14
+ // - list fields present in YAML REPLACE the default list entirely;
15
+ // - *_extra fields APPEND to the (possibly replaced) base list.
16
+ //
17
+ // ctx threading is dropped: there is no cancellation context in the TS tree (the
18
+ // embedder/indexdb already dropped ctx), so Load takes only repoRoot.
19
+ import { readFile } from "node:fs/promises";
20
+ import * as path from "node:path";
21
+ import { parse as parseYaml } from "yaml";
22
+ import { Default } from "./config.js";
23
/**
 * YAMLTypeError signals that a config.yml value has the wrong type for its
 * field (the counterpart of Go's *yaml.TypeError).
 *
 * Callers distinguish it from other failures with `instanceof YAMLTypeError`,
 * which plays the role of Go's errors.As(err, &typeErr).
 */
export class YAMLTypeError extends Error {
  // Own property so the name shows up in String(err) and stack traces.
  name = "YAMLTypeError";

  /**
   * @param {string} message human-readable description of the mismatch.
   */
  constructor(message) {
    super(message);
  }
}
34
/**
 * isNotExist reports whether err is a Node "file not found" error
 * (Go's os.IsNotExist).
 *
 * @param {unknown} err any caught value.
 * @returns {boolean} true only for a non-null object whose `code` is "ENOENT".
 */
function isNotExist(err) {
  // typeof null is "object", so null has to be ruled out separately.
  if (err === null || typeof err !== "object") {
    return false;
  }
  return err.code === "ENOENT";
}
40
/**
 * quote renders s as a double-quoted, escaped string for error messages
 * (the stand-in for Go's %q verb).
 *
 * @param {string} s text to quote, typically a file path.
 * @returns {string} the JSON string encoding of s.
 */
function quote(s) {
  const quoted = JSON.stringify(s);
  return quoted;
}
44
/**
 * Load builds the effective dedup config for a repository: Default() with any
 * `.docgov/dedup/config.yml` found under repoRoot overlaid on top.
 *
 * - A missing config file is not an error; the defaults are validated and returned.
 * - Any other read failure, or a YAML parse failure, throws a wrapped Error
 *   carrying the original as `cause` (defaults are never used as a fallback).
 * - A document whose root is not a mapping throws YAMLTypeError.
 * - After the overlay, the *_extra lists are appended to their base lists.
 * - Validation always runs last, so it sees the fully overlaid config.
 *
 * @param {string} repoRoot directory that may contain `.docgov/dedup/config.yml`.
 * @returns {Promise<object>} the validated config object.
 */
export async function Load(repoRoot) {
  const cfg = Default();
  const configPath = path.join(repoRoot, ".docgov", "dedup", "config.yml");

  // Step 1: read config.yml, treating "file not found" as "no overlay".
  let text;
  try {
    text = await readFile(configPath, "utf8");
  } catch (err) {
    if (!isNotExist(err)) {
      throw new Error(`configload.Load: read ${quote(configPath)}: ${String(err)}`, { cause: err });
    }
  }
  if (text === undefined) {
    // No file: nothing to overlay, but the defaults are still validated.
    validate(cfg);
    return cfg;
  }

  // Step 2: parse and overlay onto the defaults.
  let doc;
  try {
    doc = parseYaml(text);
  } catch (err) {
    throw new Error(`configload.Load: parse ${quote(configPath)}: ${String(err)}`, { cause: err });
  }
  // An empty/null document is a no-op overlay.
  if (doc !== null && doc !== undefined) {
    const isMapping = typeof doc === "object" && !Array.isArray(doc);
    if (!isMapping) {
      throw new YAMLTypeError(`configload.Load: parse ${quote(configPath)}: expected a mapping`);
    }
    overlay(cfg, doc, configPath);
  }

  // Step 3: append *_extra entries to the (possibly replaced) base lists.
  const markdown = cfg.Markdown;
  markdown.ignored_dirs = markdown.ignored_dirs.concat(markdown.ignored_dirs_extra ?? []);
  const analyzer = cfg.Analyzer;
  analyzer.heading_blacklist = analyzer.heading_blacklist.concat(analyzer.heading_blacklist_extra ?? []);

  // Step 4: validate the final result.
  validate(cfg);
  return cfg;
}
88
+ // --- overlay helpers -------------------------------------------------------
89
/**
 * typeErr builds (but does not throw) the YAMLTypeError for a field whose YAML
 * value has the wrong type.
 *
 * @param {string} configPath path of the config file, quoted into the message.
 * @param {string} key dotted field name, e.g. "markdown.min_prose_words".
 * @param {string} want description of the expected type.
 * @param {unknown} got the offending value; only its kind is reported.
 * @returns {YAMLTypeError} the error for the caller to throw.
 */
function typeErr(configPath, key, want, got) {
  const where = `configload.Load: parse ${quote(configPath)}`;
  const mismatch = `cannot unmarshal ${describe(got)} into ${want}`;
  return new YAMLTypeError(`${where}: field ${key}: ${mismatch}`);
}
93
/**
 * describe names the kind of a parsed YAML value for use in error messages.
 *
 * @param {unknown} v any parsed value.
 * @returns {string} "sequence" for arrays, "null" for null, otherwise `typeof v`.
 */
function describe(v) {
  if (Array.isArray(v)) {
    return "sequence";
  }
  // typeof null would say "object", which is misleading in a message.
  return v === null ? "null" : typeof v;
}
100
/**
 * asNumber returns v unchanged when it is a JS number, and throws otherwise.
 *
 * No coercion is attempted: YAML booleans and strings are rejected rather than
 * converted (matching yaml.v3, which refuses them for numeric fields).
 *
 * @param {unknown} v parsed YAML value.
 * @param {string} configPath config file path, used in the error message.
 * @param {string} key dotted field name, used in the error message.
 * @returns {number} v itself.
 * @throws {YAMLTypeError} when `typeof v` is not "number".
 */
function asNumber(v, configPath, key) {
  if (typeof v === "number") {
    return v;
  }
  throw typeErr(configPath, key, "number", v);
}
107
/**
 * asString returns v unchanged when it is a string, and throws otherwise.
 *
 * @param {unknown} v parsed YAML value.
 * @param {string} configPath config file path, used in the error message.
 * @param {string} key dotted field name, used in the error message.
 * @returns {string} v itself.
 * @throws {YAMLTypeError} when `typeof v` is not "string".
 */
function asString(v, configPath, key) {
  if (typeof v === "string") {
    return v;
  }
  throw typeErr(configPath, key, "string", v);
}
113
/**
 * asStringList checks that v is an array of strings and returns a copy of it.
 *
 * @param {unknown} v parsed YAML value.
 * @param {string} configPath config file path, used in the error message.
 * @param {string} key dotted field name; a failing element is reported as `key[i]`.
 * @returns {string[]} a new array with the same elements as v.
 * @throws {YAMLTypeError} when v is not an array, or any element is not a string.
 */
function asStringList(v, configPath, key) {
  if (!Array.isArray(v)) {
    throw typeErr(configPath, key, "sequence", v);
  }
  // Per-element check; the index pinpoints the first offending entry.
  const requireString = (item, i) => {
    if (typeof item === "string") {
      return item;
    }
    throw typeErr(configPath, `${key}[${i}]`, "string", item);
  };
  return v.map(requireString);
}
124
/**
 * asDifferentiators checks that v is a list of [string, string] pairs and
 * returns a fresh copy of that list.
 *
 * @param {unknown} v parsed YAML value.
 * @param {string} configPath config file path, used in the error message.
 * @param {string} key dotted field name; a failing pair is reported as `key[i]`.
 * @returns {Array<[string, string]>} new outer array containing new 2-element arrays.
 * @throws {YAMLTypeError} when v is not an array, an entry is not a 2-element
 *   array, or either element of an entry is not a string.
 */
function asDifferentiators(v, configPath, key) {
  if (!Array.isArray(v)) {
    throw typeErr(configPath, key, "sequence", v);
  }
  return v.map((pair, i) => {
    const label = `${key}[${i}]`;
    const isPair = Array.isArray(pair) && pair.length === 2;
    if (!isPair) {
      throw typeErr(configPath, label, "2-element sequence", pair);
    }
    const [left, right] = pair;
    const bothStrings = typeof left === "string" && typeof right === "string";
    if (!bothStrings) {
      throw typeErr(configPath, label, "string pair", pair);
    }
    return [left, right];
  });
}
140
/**
 * has reports whether the YAML mapping carries key as an own property.
 * A key that is present with a null value still counts as present; inherited
 * properties do not.
 *
 * @param {object} rec parsed YAML mapping.
 * @param {string} key property name to look for.
 * @returns {boolean} true when rec has its own property named key.
 */
function has(rec, key) {
  return Object.hasOwn(rec, key);
}
144
/**
 * group fetches the sub-mapping stored under a top-level key.
 *
 * @param {object} rec the root YAML mapping.
 * @param {string} key top-level section name, e.g. "markdown".
 * @param {string} configPath config file path, used in the error message.
 * @returns {object|null} the sub-mapping, or null when the key is absent or its
 *   value is null/undefined (a present-but-empty section means "no overlay").
 * @throws {YAMLTypeError} when the value is a scalar or a sequence.
 */
function group(rec, key, configPath) {
  const section = has(rec, key) ? rec[key] : undefined;
  if (section === null || section === undefined) {
    return null;
  }
  const isMapping = typeof section === "object" && !Array.isArray(section);
  if (!isMapping) {
    throw typeErr(configPath, key, "mapping", section);
  }
  return section;
}
158
/**
 * overlay copies values from the parsed YAML mapping onto cfg, in place,
 * mirroring yaml.Unmarshal(data, &cfg):
 *
 * - only keys actually present in the YAML are written;
 * - list fields replace the existing list outright;
 * - *_extra lists are merely stored here (Load appends them afterwards);
 * - keys not listed in the table below are ignored.
 *
 * Sections and fields are processed in table order, so the first type mismatch
 * in that order is the one reported; fields applied before it stay applied.
 *
 * @param {object} cfg config object to mutate.
 * @param {object} root root mapping of the parsed YAML document.
 * @param {string} p config file path, used in error messages.
 * @throws {YAMLTypeError} when a section or field has the wrong YAML type.
 */
function overlay(cfg, root, p) {
  // [yaml section, cfg section, [[field, converter], ...]]
  // The YAML key and the cfg property share the same field name.
  const sections = [
    ["markdown", "Markdown", [
      ["min_prose_words", asNumber],
      ["heading_token_min_len", asNumber],
      ["hidden_dir_prefix", asString],
      ["ignored_dirs", asStringList],
      ["ignored_dirs_extra", asStringList],
    ]],
    ["indexer", "Indexer", [
      ["embed_progress_threshold", asNumber],
      ["max_workers", asNumber],
      ["external_url_prefixes", asStringList],
    ]],
    ["analyzer", "Analyzer", [
      ["thresh_high", asNumber],
      ["thresh_maybe", asNumber],
      ["distinctive_abs_min", asNumber],
      ["distinctive_pct_of_headings", asNumber],
      ["universal_stopwords", asStringList],
      ["differentiators", asDifferentiators],
      ["path_priority", asStringList],
      ["heading_blacklist", asStringList],
      ["heading_blacklist_extra", asStringList],
      ["path_blacklist", asStringList],
    ]],
    ["report", "Report", [
      ["preview_chars", asNumber],
      ["preview_word_ratio", asNumber],
      ["wrap_cols", asNumber],
      ["separator", asString],
    ]],
    ["embedder", "Embedder", [
      ["batch_size", asNumber],
    ]],
    ["block", "Block", [
      ["min_words", asNumber],
      ["table_min_rows", asNumber],
      ["cosine_threshold", asNumber],
      ["multiplicity_cap", asNumber],
    ]],
  ];

  for (const [yamlSection, cfgSection, fields] of sections) {
    const src = group(root, yamlSection, p);
    if (src === null) {
      continue; // section absent or empty: leave cfg untouched
    }
    for (const [field, convert] of fields) {
      if (has(src, field)) {
        cfg[cfgSection][field] = convert(src[field], p, `${yamlSection}.${field}`);
      }
    }
  }
}
238
+ // --- validation ------------------------------------------------------------
239
/**
 * validate checks that all scalar tunables are within their allowed ranges and
 * throws an Error naming the first one that is not.
 *
 * Rules (per the Go validate()):
 *   - 0 < thresh_maybe < thresh_high < 1
 *   - min_prose_words >= 1
 *   - heading_token_min_len >= 1
 *   - distinctive_abs_min >= 1
 *   - 0 < distinctive_pct_of_headings <= 1
 *   - preview_chars >= 1
 *   - wrap_cols >= 1
 *   - batch_size >= 1
 *   - block.min_words >= 1
 *   - block.table_min_rows >= 1
 *   - 0 < block.cosine_threshold < 1
 *   - block.multiplicity_cap >= 1
 *
 * NaN is rejected for every field above. A YAML `.nan` parses to a JS number,
 * and every ordered comparison against NaN is false, so a plain `x < 1` style
 * check would silently accept it.
 *
 * @param {object} cfg fully overlaid config object.
 * @throws {Error} with a "configload.Load: ..." message on the first violation.
 */
function validate(cfg) {
  // Each predicate is true when the value VIOLATES its range; NaN always violates.
  const belowOne = (v) => Number.isNaN(v) || v < 1;
  const outsideOpenUnit = (v) => Number.isNaN(v) || v <= 0 || v >= 1;

  const a = cfg.Analyzer;
  if (outsideOpenUnit(a.thresh_maybe)) {
    throw new Error(`configload.Load: thresh_maybe must be in (0, 1), got ${a.thresh_maybe}`);
  }
  if (outsideOpenUnit(a.thresh_high)) {
    throw new Error(`configload.Load: thresh_high must be in (0, 1), got ${a.thresh_high}`);
  }
  // Both thresholds are known to be non-NaN here, so a plain comparison is safe.
  if (a.thresh_maybe >= a.thresh_high) {
    throw new Error(`configload.Load: thresh_maybe (${a.thresh_maybe}) must be < thresh_high (${a.thresh_high})`);
  }
  if (belowOne(cfg.Markdown.min_prose_words)) {
    throw new Error(`configload.Load: min_prose_words must be >= 1, got ${cfg.Markdown.min_prose_words}`);
  }
  if (belowOne(cfg.Markdown.heading_token_min_len)) {
    throw new Error(`configload.Load: heading_token_min_len must be >= 1, got ${cfg.Markdown.heading_token_min_len}`);
  }
  if (belowOne(a.distinctive_abs_min)) {
    throw new Error(`configload.Load: distinctive_abs_min must be >= 1, got ${a.distinctive_abs_min}`);
  }
  // Half-open range (0, 1]: exactly 1 is allowed here, unlike the thresholds.
  if (Number.isNaN(a.distinctive_pct_of_headings) || a.distinctive_pct_of_headings <= 0 || a.distinctive_pct_of_headings > 1) {
    throw new Error(`configload.Load: distinctive_pct_of_headings must be in (0, 1], got ${a.distinctive_pct_of_headings}`);
  }
  if (belowOne(cfg.Report.preview_chars)) {
    throw new Error(`configload.Load: preview_chars must be >= 1, got ${cfg.Report.preview_chars}`);
  }
  if (belowOne(cfg.Report.wrap_cols)) {
    throw new Error(`configload.Load: wrap_cols must be >= 1, got ${cfg.Report.wrap_cols}`);
  }
  if (belowOne(cfg.Embedder.batch_size)) {
    throw new Error(`configload.Load: batch_size must be >= 1, got ${cfg.Embedder.batch_size}`);
  }
  if (belowOne(cfg.Block.min_words)) {
    throw new Error(`configload.Load: block.min_words must be >= 1, got ${cfg.Block.min_words}`);
  }
  if (belowOne(cfg.Block.table_min_rows)) {
    throw new Error(`configload.Load: block.table_min_rows must be >= 1, got ${cfg.Block.table_min_rows}`);
  }
  if (outsideOpenUnit(cfg.Block.cosine_threshold)) {
    throw new Error(`configload.Load: block.cosine_threshold must be in (0, 1), got ${cfg.Block.cosine_threshold}`);
  }
  if (belowOne(cfg.Block.multiplicity_cap)) {
    throw new Error(`configload.Load: block.multiplicity_cap must be >= 1, got ${cfg.Block.multiplicity_cap}`);
  }
}