docsgov 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/apispec/apispec.js +401 -0
- package/dist/apispec/apispec.test.js +444 -0
- package/dist/apispec/errors.js +17 -0
- package/dist/apispec/index.js +2 -0
- package/dist/check/doclinks.js +167 -0
- package/dist/check/index.js +8 -0
- package/dist/check/run.js +391 -0
- package/dist/check/run.test.js +513 -0
- package/dist/check/suggest.js +134 -0
- package/dist/check/suggest.test.js +92 -0
- package/dist/check/tokens.js +125 -0
- package/dist/cmd/main.js +330 -0
- package/dist/cmd/main.test.js +422 -0
- package/dist/codeq/cache.js +71 -0
- package/dist/codeq/cache.test.js +67 -0
- package/dist/codeq/errors.js +52 -0
- package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
- package/dist/codeq/index.js +11 -0
- package/dist/codeq/resolve.test.js +109 -0
- package/dist/codeq/resolver.js +128 -0
- package/dist/codeq/resolver.test.js +124 -0
- package/dist/codeq/resolvers/go.js +242 -0
- package/dist/codeq/resolvers/go.test.js +143 -0
- package/dist/codeq/resolvers/java.js +349 -0
- package/dist/codeq/resolvers/java.test.js +138 -0
- package/dist/codeq/resolvers/java_queries.js +63 -0
- package/dist/codeq/resolvers/javascript.js +412 -0
- package/dist/codeq/resolvers/javascript.test.js +125 -0
- package/dist/codeq/resolvers/javascript_queries.js +46 -0
- package/dist/codeq/resolvers/typescript.js +366 -0
- package/dist/codeq/resolvers/typescript.test.js +180 -0
- package/dist/codeq/resolvers/typescript_queries.js +78 -0
- package/dist/codeq/signature.js +50 -0
- package/dist/codeq/signature.test.js +50 -0
- package/dist/codeq/suggest.js +96 -0
- package/dist/codeq/treesitter.js +122 -0
- package/dist/codeq/treesitter.test.js +118 -0
- package/dist/config/config.js +74 -0
- package/dist/config/config.test.js +98 -0
- package/dist/config/fs.js +116 -0
- package/dist/config/glob.js +82 -0
- package/dist/config/glob.test.js +61 -0
- package/dist/config/index.js +4 -0
- package/dist/dedup/analyzer/analyzer.js +533 -0
- package/dist/dedup/analyzer/analyzer.test.js +530 -0
- package/dist/dedup/analyzer/canonical.js +74 -0
- package/dist/dedup/analyzer/canonical.test.js +70 -0
- package/dist/dedup/analyzer/cosine_clusters.js +169 -0
- package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
- package/dist/dedup/analyzer/distinctive.js +85 -0
- package/dist/dedup/analyzer/distinctive.test.js +49 -0
- package/dist/dedup/analyzer/exact_clusters.js +63 -0
- package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
- package/dist/dedup/analyzer/index.js +14 -0
- package/dist/dedup/analyzer/multiplicity.js +110 -0
- package/dist/dedup/analyzer/multiplicity.test.js +123 -0
- package/dist/dedup/analyzer/order.js +22 -0
- package/dist/dedup/analyzer/partial_overlaps.js +65 -0
- package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
- package/dist/dedup/analyzer/preview.js +84 -0
- package/dist/dedup/analyzer/preview.test.js +46 -0
- package/dist/dedup/analyzer/safety.js +27 -0
- package/dist/dedup/analyzer/safety.test.js +39 -0
- package/dist/dedup/config.js +18 -0
- package/dist/dedup/configload.js +299 -0
- package/dist/dedup/configload.test.js +410 -0
- package/dist/dedup/dedup.index.test.js +203 -0
- package/dist/dedup/dedup.js +143 -0
- package/dist/dedup/dedup.test.js +212 -0
- package/dist/dedup/dedupcfg/config.js +112 -0
- package/dist/dedup/dedupcfg/config.test.js +70 -0
- package/dist/dedup/dedupcfg/index.js +1 -0
- package/dist/dedup/deduptypes/index.js +1 -0
- package/dist/dedup/deduptypes/types.js +9 -0
- package/dist/dedup/deduptypes/types.test.js +34 -0
- package/dist/dedup/embedder/cache.js +23 -0
- package/dist/dedup/embedder/cache.test.js +50 -0
- package/dist/dedup/embedder/constants.js +10 -0
- package/dist/dedup/embedder/embedder.js +76 -0
- package/dist/dedup/embedder/embedder.mock.test.js +128 -0
- package/dist/dedup/embedder/embedder.test.js +96 -0
- package/dist/dedup/embedder/errors.js +20 -0
- package/dist/dedup/embedder/errors.test.js +35 -0
- package/dist/dedup/embedder/index.js +4 -0
- package/dist/dedup/embedder/session.js +78 -0
- package/dist/dedup/embedder/session.test.js +172 -0
- package/dist/dedup/gitignore.js +97 -0
- package/dist/dedup/gitignore.test.js +98 -0
- package/dist/dedup/index.js +11 -0
- package/dist/dedup/indexdb/errors.js +48 -0
- package/dist/dedup/indexdb/index.js +6 -0
- package/dist/dedup/indexdb/indexdb.js +302 -0
- package/dist/dedup/indexdb/indexdb.test.js +739 -0
- package/dist/dedup/indexdb/load.js +110 -0
- package/dist/dedup/indexdb/migrations.js +58 -0
- package/dist/dedup/indexdb/schema.js +83 -0
- package/dist/dedup/indexer/index.js +9 -0
- package/dist/dedup/indexer/indexer.js +501 -0
- package/dist/dedup/indexer/indexer.test.js +510 -0
- package/dist/dedup/indexer/links.js +89 -0
- package/dist/dedup/mdsection/anchor.js +60 -0
- package/dist/dedup/mdsection/anchor.test.js +39 -0
- package/dist/dedup/mdsection/blocks.js +409 -0
- package/dist/dedup/mdsection/blocks.test.js +359 -0
- package/dist/dedup/mdsection/index.js +4 -0
- package/dist/dedup/mdsection/parse.js +21 -0
- package/dist/dedup/mdsection/section.js +234 -0
- package/dist/dedup/mdsection/section.test.js +221 -0
- package/dist/dedup/report/floatfmt.js +71 -0
- package/dist/dedup/report/floatfmt.test.js +42 -0
- package/dist/dedup/report/index.js +8 -0
- package/dist/dedup/report/quote.js +77 -0
- package/dist/dedup/report/quote.test.js +67 -0
- package/dist/dedup/report/text.js +251 -0
- package/dist/dedup/report/text.test.js +420 -0
- package/dist/dedup/report_types.js +8 -0
- package/dist/dedup/sectionid/index.js +1 -0
- package/dist/dedup/sectionid/sectionid.js +16 -0
- package/dist/dedup/sectionid/sectionid.test.js +49 -0
- package/dist/guard/api/errors.js +12 -0
- package/dist/guard/api/index.js +2 -0
- package/dist/guard/api/parser.js +81 -0
- package/dist/guard/api/parser.test.js +58 -0
- package/dist/guard/api/types.js +1 -0
- package/dist/guard/code/errors.js +16 -0
- package/dist/guard/code/index.js +2 -0
- package/dist/guard/code/parser.js +54 -0
- package/dist/guard/code/parser.test.js +111 -0
- package/dist/guard/code/types.js +6 -0
- package/dist/index.js +1 -0
- package/dist/index.test.js +5 -0
- package/dist/repo/boundary.js +92 -0
- package/dist/repo/boundary.test.js +65 -0
- package/dist/repo/errors.js +56 -0
- package/dist/repo/errors.test.js +85 -0
- package/dist/repo/exists.test.js +72 -0
- package/dist/repo/filename.js +46 -0
- package/dist/repo/filename.test.js +39 -0
- package/dist/repo/fs.js +53 -0
- package/dist/repo/index.js +7 -0
- package/dist/repo/overlay.js +36 -0
- package/dist/repo/overlay.test.js +80 -0
- package/dist/repo/repo.js +353 -0
- package/dist/repo/repo.test.js +255 -0
- package/dist/repo/testutil.js +27 -0
- package/dist/repo/write.test.js +125 -0
- package/dist/report/color.js +73 -0
- package/dist/report/index.js +1 -0
- package/dist/report/report.js +112 -0
- package/dist/report/report.test.js +368 -0
- package/dist/violation/index.js +1 -0
- package/dist/violation/types.js +22 -0
- package/dist/violation/types.test.js +70 -0
- package/package.json +48 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { AnchorTracker, makeAnchor } from "./index.js";
|
|
3
|
+
describe("makeAnchor — locked slugification", () => {
    // Base rule: text is lowercased and a space turns into a hyphen.
    it("lowercases and hyphenates spaces", () => {
        const slug = makeAnchor("Table Section");
        expect(slug).toBe("table-section");
    });
    // Characters such as "," and "!" are dropped, while "_" and "-" survive
    // untouched in the resulting slug.
    it("strips punctuation but keeps hyphens and underscores", () => {
        const stripped = makeAnchor("Hello, World!");
        const kept = makeAnchor("a_b-c");
        expect(stripped).toBe("hello-world");
        expect(kept).toBe("a_b-c");
    });
    // Whitespace between words must yield exactly one hyphen.
    it("collapses whitespace runs to a single hyphen", () => {
        const slug = makeAnchor("foo bar");
        expect(slug).toBe("foo-bar");
    });
    // Non-ASCII letters (here CJK) are expected to come back unchanged.
    it("preserves CJK letters", () => {
        const heading = "補貨單流程";
        expect(makeAnchor(heading)).toBe("補貨單流程");
    });
});
|
|
27
|
+
describe("AnchorTracker — per-file collision suffixes", () => {
    // Repeated headings on one tracker: the first assignment gets the bare slug,
    // later ones get "-1", "-2", … in the order they are assigned.
    it("suffixes duplicates -1, -2, … in order", () => {
        const tracker = new AnchorTracker();
        const first = tracker.assign("Installation");
        const second = tracker.assign("Installation");
        const third = tracker.assign("Installation");
        expect(first).toBe("installation");
        expect(second).toBe("installation-1");
        expect(third).toBe("installation-2");
        // An unrelated heading is not affected by the earlier duplicates.
        expect(tracker.assign("Usage")).toBe("usage");
    });
});
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Block classification, text extraction, table linearization, and BlockRecord
|
|
3
|
+
* production. Ported from internal/dedup/mdsection/blocks.go.
|
|
4
|
+
*
|
|
5
|
+
* The goldmark→mdast mapping that matters here:
|
|
6
|
+
* - goldmark Text node with SoftLineBreak()/HardLineBreak() appends a space.
|
|
7
|
+
* In mdast a soft break is an embedded "\n" inside one text node's value;
|
|
8
|
+
* a hard break is a separate `break` node. We reproduce Go's "append a space"
|
|
9
|
+
* behavior by turning each embedded "\n" into a space and emitting a space for
|
|
10
|
+
* every `break` node.
|
|
11
|
+
* - goldmark's table has a separate header node; mdast's `table` has the header
|
|
12
|
+
* as its first `tableRow`. linearizeTable / countTableDataRows account for this.
|
|
13
|
+
* - goldmark autolinks reach collectText as a childless leaf; mdast represents
|
|
14
|
+
* `<https://…>` as a `link` node with a child text whose value is the URL, so
|
|
15
|
+
* normal recursion already yields the visible URL text.
|
|
16
|
+
* - inline raw HTML (`<br>`) is an mdast `html` node with no children; Go skips
|
|
17
|
+
 *   it (only Text/AutoLink/String are emitted), so inline `html` nodes are skipped here too.
|
|
18
|
+
*/
|
|
19
|
+
import { createHash } from "node:crypto";
|
|
20
|
+
/**
 * Canonicalizes block text for hashing and storage.
 *
 * The steps run in a fixed order: lowercase, collapse every whitespace run to a
 * single space (collapseWhitespace), then trim both ends. buildEmbedText ends
 * with the same three steps, so a block normalized on its own reads the same
 * as it does inside a section's embed text.
 *
 * @param {string} s - Raw block text.
 * @returns {string} The normalized text.
 */
export function normalizeBlockText(s) {
    return collapseWhitespace(s.toLowerCase()).trim();
}
|
|
30
|
+
/**
 * Hashes a block's text after normalization.
 *
 * @param {string} s - Raw block text; it is passed through normalizeBlockText first.
 * @returns {string} Hex-encoded SHA-256 digest of the normalized text.
 */
export function blockContentHash(s) {
    const hasher = createHash("sha256");
    hasher.update(normalizeBlockText(s));
    return hasher.digest("hex");
}
|
|
35
|
+
/**
 * Separator pattern used when splitting prose into words.
 *
 * The class is deliberately the ASCII set tab, newline, form feed, carriage
 * return and space. That is what `\s` means in Go's RE2 (the original uses
 * `regexp.MustCompile("\\s+")`); JavaScript's `\s` would additionally match
 * vertical tab and Unicode spaces, so it is not used here.
 */
const wsRE = /[\t\n\f\r ]+/g;
|
|
43
|
+
/**
 * Maps an mdast block node to a Block `{ class, text }`.
 *
 * Class codes: 0 = Prose, 1 = Table, 2 = Code, 3 = HTML (also used for every
 * node that contributes no text).
 *
 *   - `code` (fenced or indented)        → Code, empty text
 *   - `html`, `thematicBreak`            → HTML, empty text
 *   - `table`                            → Table, linearized text
 *   - `list`                             → whatever classifyList returns
 *   - `paragraph`                        → HTML if image-only, else Prose
 *   - `blockquote`                       → Prose, recursively extracted text
 *   - anything else                      → Prose if it yields text, else HTML
 *
 * @param {object} n - mdast block node.
 * @returns {{class: number, text: string}} The classified block.
 */
export function classifyNode(n) {
    const skipped = () => ({ class: 3 /* BlockClass.HTML */, text: "" });
    const prose = (text) => ({ class: 0 /* BlockClass.Prose */, text });
    const kind = n.type;
    if (kind === "code") {
        return { class: 2 /* BlockClass.Code */, text: "" };
    }
    if (kind === "html" || kind === "thematicBreak") {
        return skipped();
    }
    if (kind === "table") {
        return { class: 1 /* BlockClass.Table */, text: linearizeTable(n) };
    }
    if (kind === "list") {
        return classifyList(n);
    }
    if (kind === "paragraph") {
        return isImageOnlyParagraph(n) ? skipped() : prose(extractInlineText(n));
    }
    if (kind === "blockquote") {
        return prose(extractBlockText(n));
    }
    // Unknown block type: keep it as prose only when it actually yields text.
    const fallbackText = extractBlockText(n);
    return fallbackText !== "" ? prose(fallbackText) : skipped();
}
|
|
77
|
+
/**
 * Turns a list node into one Prose block.
 *
 * Each `listItem` child is flattened with listItemText; items that produce no
 * text are dropped and the remaining items are joined with "\n". Children that
 * are not `listItem` nodes are ignored.
 *
 * @param {object} list - mdast `list` node.
 * @returns {{class: number, text: string}} Prose block (class 0).
 */
function classifyList(list) {
    const lines = list.children
        .filter((child) => child.type === "listItem")
        .map((child) => listItemText(child))
        .filter((line) => line !== "");
    return { class: 0 /* BlockClass.Prose */, text: lines.join("\n") };
}
|
|
94
|
+
/**
 * Flattens one list item to a single line of text.
 *
 * Paragraph children contribute their inline text; any other child that has
 * children contributes its recursively extracted block text; childless
 * non-paragraph nodes contribute nothing. Each piece is trimmed, empty pieces
 * are dropped, and the rest are joined with a single space.
 *
 * @param {object} item - mdast `listItem` node.
 * @returns {string} The flattened item text ("" when nothing is extractable).
 */
function listItemText(item) {
    const pieces = [];
    for (const child of item.children) {
        let raw = "";
        if (child.type === "paragraph") {
            raw = extractInlineText(child);
        }
        else if (hasChildren(child)) {
            // mdast has no "textBlock"; every other container is descended into.
            raw = extractBlockText(child);
        }
        const trimmed = raw.trim();
        if (trimmed !== "") {
            pieces.push(trimmed);
        }
    }
    return pieces.join(" ");
}
|
|
122
|
+
/**
 * Reports whether a paragraph consists solely of image nodes.
 *
 * Both inline images (`image`) and reference-style images (`imageReference`)
 * count as images. collectText already treats the two alike as markup-only
 * leaves; recognising only `image` here would let a paragraph holding nothing
 * but `![alt][ref]` be classified as prose with empty text instead of being
 * skipped like any other image-only paragraph.
 *
 * @param {object} n - mdast `paragraph` node; `n.children` must be an array.
 * @returns {boolean} True when there is at least one child and every child is
 *   an image; false for a paragraph without children.
 */
function isImageOnlyParagraph(n) {
    const children = n.children;
    if (children.length === 0) {
        return false;
    }
    return children.every((c) => c.type === "image" || c.type === "imageReference");
}
|
|
133
|
+
/**
 * Returns the visible plain text of a node's inline content.
 *
 * @param {object} n - Node whose descendants are walked by collectText.
 * @returns {string} All collected fragments concatenated without a separator.
 */
export function extractInlineText(n) {
    const fragments = [];
    collectText(n, fragments);
    return fragments.join("");
}
|
|
139
|
+
/**
 * Walks a node's children depth-first and appends their visible text to `out`.
 *
 * Per child type:
 *   - `text`: its value with every "\n" (soft break) turned into a space,
 *     mirroring goldmark appending a space after a soft-broken Text node.
 *   - `break` (hard break): a single space.
 *   - `inlineCode`: its `value` as-is (mdast stores the code text on the node).
 *   - `html`, `image`, `imageReference`, `footnoteReference`: nothing.
 *   - anything else with children (emphasis, strong, link, …): recursed into.
 *
 * @param {object} n - Node whose children are visited; leaf nodes are a no-op.
 * @param {string[]} out - Accumulator that fragments are pushed onto.
 * @returns {void}
 */
function collectText(n, out) {
    const kids = getChildren(n);
    if (kids === undefined) {
        return;
    }
    for (const kid of kids) {
        const kind = kid.type;
        if (kind === "text") {
            out.push(kid.value.replace(/\n/g, " "));
        }
        else if (kind === "break") {
            out.push(" ");
        }
        else if (kind === "inlineCode") {
            out.push(kid.value);
        }
        else if (kind === "html" ||
            kind === "image" ||
            kind === "imageReference" ||
            kind === "footnoteReference") {
            // Markup-only leaves carry no visible text.
            continue;
        }
        else if (hasChildren(kid)) {
            collectText(kid, out);
        }
    }
}
|
|
190
|
+
/**
 * Recursively gathers the text of a container block.
 *
 * Every `paragraph` child contributes its inline text followed by "\n"; every
 * other child that has children contributes its own (already trimmed)
 * extractBlockText result with no separator added. The concatenation is
 * trimmed before being returned.
 *
 * @param {object} n - Block node; a node without a children array yields "".
 * @returns {string} The trimmed, concatenated text.
 */
export function extractBlockText(n) {
    const kids = getChildren(n);
    if (kids === undefined) {
        return "";
    }
    let text = "";
    for (const kid of kids) {
        if (kid.type === "paragraph") {
            text += `${extractInlineText(kid)}\n`;
        }
        else if (hasChildren(kid)) {
            text += extractBlockText(kid);
        }
    }
    return text.trim();
}
|
|
210
|
+
/**
 * Serializes an mdast `table` node as
 *
 *   "h1=v1, h2=v2; h1=v1b, h2=v2b"
 *
 * The first `tableRow` child supplies the headers; each later `tableRow` is a
 * data row. Within a row, cells are written as `header=value` and joined with
 * ", "; rows are joined with "; ". A cell beyond the last header gets an empty
 * header. Header and cell texts are trimmed. Non-`tableRow` / non-`tableCell`
 * children are ignored.
 *
 * @param {object} table - mdast `table` node.
 * @returns {string} The linearized table, or "" when it has no rows or only a
 *   header row.
 */
export function linearizeTable(table) {
    const cellTexts = (row) => row.children
        .filter((cell) => cell.type === "tableCell")
        .map((cell) => extractInlineText(cell).trim());
    const [headerRow, ...bodyRows] = table.children.filter((c) => c.type === "tableRow");
    if (headerRow === undefined) {
        return "";
    }
    const headers = cellTexts(headerRow);
    const renderRow = (row) => cellTexts(row)
        .map((value, i) => `${i < headers.length ? headers[i] : ""}=${value}`)
        .join(", ");
    return bodyRows.map(renderRow).join("; ");
}
|
|
254
|
+
/**
 * Counts the data rows of an mdast `table` node: the number of `tableRow`
 * children minus one for the header row, never below zero.
 *
 * Counting nodes avoids the over-count that splitting the linearized text on
 * ";" would give when a cell value itself contains ";".
 *
 * @param {object} table - mdast `table` node.
 * @returns {number} Number of data rows (0 for an empty or header-only table).
 */
export function countTableDataRows(table) {
    const rowCount = table.children.filter((c) => c.type === "tableRow").length;
    return Math.max(rowCount - 1, 0);
}
|
|
264
|
+
/**
 * Returns the line on which node `n` starts, taken from `n.position.start.line`.
 *
 * The Go original had to descend to the first descendant carrying a line
 * segment; mdast puts a position on the node itself, so no descent is done here.
 *
 * @param {object} n - mdast node.
 * @returns {number} The start line, or -1 when the node has no position.
 */
export function firstStartLine(n) {
    return n.position ? n.position.start.line : -1;
}
|
|
279
|
+
/**
 * Builds BlockRecords for the body nodes of one section.
 *
 * Nodes classified as Code (2) or HTML (3), and nodes without a position, are
 * left out. For the remaining nodes, in order:
 *   - `Index` is the position among the kept blocks,
 *   - `Kind` is "table" for Table blocks and "prose" for everything else
 *     (lists included),
 *   - `StartLine` is firstStartLine(node),
 *   - `EndLine` is the next kept block's StartLine, or `sectionEndLine` for the
 *     last one,
 *   - `ContentHash` / `Text` come from blockContentHash / normalizeBlockText,
 *   - `TableRows` is the data-row count for tables and 0 otherwise.
 *
 * `SectionID`, `FilePath` and `Heading` are emitted as "" for the caller to fill.
 *
 * @param {Iterable<object>} bodyNodes - mdast block nodes of the section body.
 * @param {number} sectionEndLine - EndLine to use for the last record.
 * @returns {object[]} One record per kept block ([] when none qualify).
 */
export function collectBlockRecords(bodyNodes, sectionEndLine) {
    const kept = [];
    for (const node of bodyNodes) {
        const block = classifyNode(node);
        const carriesText = block.class !== 2 /* BlockClass.Code */ && block.class !== 3 /* BlockClass.HTML */;
        if (!carriesText) {
            continue;
        }
        const startLine = firstStartLine(node);
        if (startLine >= 0) {
            kept.push({ node, block, startLine });
        }
    }
    return kept.map((entry, index) => {
        const following = kept[index + 1];
        const isTable = entry.block.class === 1 /* BlockClass.Table */;
        return {
            SectionID: "",
            FilePath: "",
            Heading: "",
            Index: index,
            Kind: isTable ? "table" : "prose",
            StartLine: entry.startLine,
            EndLine: following !== undefined ? following.startLine : sectionEndLine,
            ContentHash: blockContentHash(entry.block.text),
            Text: normalizeBlockText(entry.block.text),
            TableRows: isTable ? countTableDataRows(entry.node) : 0,
        };
    });
}
|
|
331
|
+
/**
 * Counts the words in `text`.
 *
 * The text is trimmed, split on `wsRE` (runs of ASCII whitespace), and the
 * non-empty pieces are counted.
 *
 * @param {string} text - Text to count words in.
 * @returns {number} Number of non-empty tokens (0 for blank input).
 */
export function countWords(text) {
    return text
        .trim()
        .split(wsRE)
        .filter((token) => token !== "").length;
}
|
|
345
|
+
/**
 * Builds a section's embed text from its classified blocks.
 *
 * Only Prose (0) and Table (1) blocks take part. Each block's text is trimmed,
 * empty ones are dropped, the rest are joined with a single space, and the
 * result is lowercased, whitespace-collapsed and trimmed.
 *
 * @param {Iterable<{class: number, text: string}>} blocks - Classified blocks.
 * @returns {string} The embed text, or "" when no block contributes text.
 */
export function buildEmbedText(blocks) {
    const pieces = [];
    for (const block of blocks) {
        const included = block.class === 0 /* BlockClass.Prose */ || block.class === 1 /* BlockClass.Table */;
        if (!included) {
            continue;
        }
        const trimmed = block.text.trim();
        if (trimmed === "") {
            continue;
        }
        pieces.push(trimmed);
    }
    if (pieces.length === 0) {
        return "";
    }
    return collapseWhitespace(pieces.join(" ").toLowerCase()).trim();
}
|
|
368
|
+
/**
 * Replaces every run of whitespace in `s` with one ASCII space.
 *
 * The string is scanned code point by code point; whitespace is whatever
 * isUnicodeSpace accepts (mirroring Go's `unicode.IsSpace`), which is broader
 * than the ASCII-only class countWords splits on. Leading and trailing runs
 * become a single space too — trimming is left to the caller.
 *
 * @param {string} s - Input text.
 * @returns {string} Text with whitespace runs collapsed.
 */
export function collapseWhitespace(s) {
    const pieces = [];
    let prevWasSpace = false;
    for (const ch of s) {
        const isSpace = isUnicodeSpace(ch);
        if (!isSpace) {
            pieces.push(ch);
        }
        else if (!prevWasSpace) {
            // First whitespace of a run: emit the single replacement space.
            pieces.push(" ");
        }
        prevWasSpace = isSpace;
    }
    return pieces.join("");
}
|
|
392
|
+
// Matches a string that is exactly one code point with the Unicode White_Space property.
const spaceRE = /^\p{White_Space}$/u;
/**
 * Tells whether `r` is a single whitespace code point, mirroring Go's
 * `unicode.IsSpace`.
 *
 * @param {string} r - A string holding one code point.
 * @returns {boolean} True only when `r` is exactly one White_Space code point.
 */
function isUnicodeSpace(r) {
    const matched = spaceRE.test(r);
    return matched;
}
|
|
397
|
+
/**
 * Tells whether a node has a non-empty children array.
 *
 * @param {object} n - Any mdast node.
 * @returns {boolean} False for leaf nodes and for nodes with an empty children array.
 */
function hasChildren(n) {
    const kids = getChildren(n);
    if (kids === undefined) {
        return false;
    }
    return kids.length > 0;
}
|
|
402
|
+
/**
 * Gives access to a node's children.
 *
 * @param {object} n - Any mdast node.
 * @returns {object[]|undefined} `n.children` when it is an array; otherwise
 *   undefined (leaf node, or a non-array `children` value).
 */
function getChildren(n) {
    const { children } = n;
    return Array.isArray(children) ? children : undefined;
}
|