@nanocollective/get-md 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -3
- package/dist/cli.js +2 -2
- package/dist/cli.js.map +1 -1
- package/dist/extractors/metadata-extractor.js +1 -1
- package/dist/extractors/metadata-extractor.js.map +1 -1
- package/dist/extractors/metadata-extractor.spec.d.ts +2 -0
- package/dist/extractors/metadata-extractor.spec.d.ts.map +1 -0
- package/dist/extractors/metadata-extractor.spec.js +486 -0
- package/dist/extractors/metadata-extractor.spec.js.map +1 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/dist/index.spec.d.ts +2 -0
- package/dist/index.spec.d.ts.map +1 -0
- package/dist/index.spec.js +518 -0
- package/dist/index.spec.js.map +1 -0
- package/dist/optimizers/html-cleaner.js +1 -1
- package/dist/optimizers/html-cleaner.js.map +1 -1
- package/dist/optimizers/html-cleaner.spec.d.ts +2 -0
- package/dist/optimizers/html-cleaner.spec.d.ts.map +1 -0
- package/dist/optimizers/html-cleaner.spec.js +351 -0
- package/dist/optimizers/html-cleaner.spec.js.map +1 -0
- package/dist/optimizers/llm-formatter.js +2 -2
- package/dist/optimizers/llm-formatter.js.map +1 -1
- package/dist/optimizers/llm-formatter.spec.d.ts +2 -0
- package/dist/optimizers/llm-formatter.spec.d.ts.map +1 -0
- package/dist/optimizers/llm-formatter.spec.js +276 -0
- package/dist/optimizers/llm-formatter.spec.js.map +1 -0
- package/dist/optimizers/structure-enhancer.js +3 -3
- package/dist/optimizers/structure-enhancer.js.map +1 -1
- package/dist/optimizers/structure-enhancer.spec.d.ts +2 -0
- package/dist/optimizers/structure-enhancer.spec.d.ts.map +1 -0
- package/dist/optimizers/structure-enhancer.spec.js +331 -0
- package/dist/optimizers/structure-enhancer.spec.js.map +1 -0
- package/dist/parsers/markdown-parser.d.ts +1 -1
- package/dist/parsers/markdown-parser.d.ts.map +1 -1
- package/dist/parsers/markdown-parser.js +58 -37
- package/dist/parsers/markdown-parser.js.map +1 -1
- package/dist/parsers/markdown-parser.spec.js +106 -98
- package/dist/parsers/markdown-parser.spec.js.map +1 -1
- package/dist/utils/url-fetcher.d.ts.map +1 -1
- package/dist/utils/url-fetcher.js +1 -1
- package/dist/utils/url-fetcher.js.map +1 -1
- package/dist/utils/url-fetcher.spec.d.ts +2 -0
- package/dist/utils/url-fetcher.spec.d.ts.map +1 -0
- package/dist/utils/url-fetcher.spec.js +206 -0
- package/dist/utils/url-fetcher.spec.js.map +1 -0
- package/dist/utils/validators.js +1 -1
- package/dist/utils/validators.js.map +1 -1
- package/dist/utils/validators.spec.d.ts +2 -0
- package/dist/utils/validators.spec.d.ts.map +1 -0
- package/dist/utils/validators.spec.js +290 -0
- package/dist/utils/validators.spec.js.map +1 -0
- package/package.json +8 -14
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
// src/optimizers/llm-formatter.spec.ts
|
|
2
|
+
import test from "ava";
|
|
3
|
+
import { formatForLLM } from "./llm-formatter.js";
|
|
4
|
+
test("normalizes heading levels - prevents skipping levels", (t) => {
|
|
5
|
+
const markdown = `
|
|
6
|
+
# Level 1
|
|
7
|
+
##### Level 5 (should become Level 2)
|
|
8
|
+
`.trim();
|
|
9
|
+
const result = formatForLLM(markdown);
|
|
10
|
+
t.true(result.includes("# Level 1"));
|
|
11
|
+
t.true(result.includes("## Level 5"));
|
|
12
|
+
t.false(result.includes("#####"));
|
|
13
|
+
});
|
|
14
|
+
test("normalizes heading levels - respects proper hierarchy", (t) => {
|
|
15
|
+
const markdown = `
|
|
16
|
+
# Level 1
|
|
17
|
+
## Level 2
|
|
18
|
+
### Level 3
|
|
19
|
+
`.trim();
|
|
20
|
+
const result = formatForLLM(markdown);
|
|
21
|
+
t.true(result.includes("# Level 1"));
|
|
22
|
+
t.true(result.includes("## Level 2"));
|
|
23
|
+
t.true(result.includes("### Level 3"));
|
|
24
|
+
});
|
|
25
|
+
test("normalizes heading levels - resets on blank sections", (t) => {
|
|
26
|
+
const markdown = `
|
|
27
|
+
# First Section
|
|
28
|
+
## Subsection
|
|
29
|
+
|
|
30
|
+
# New Section
|
|
31
|
+
##### Should become Level 2
|
|
32
|
+
`.trim();
|
|
33
|
+
const result = formatForLLM(markdown);
|
|
34
|
+
const lines = result.split("\n");
|
|
35
|
+
const newSectionIndex = lines.findIndex((l) => l.includes("New Section"));
|
|
36
|
+
const nextHeadingIndex = lines.findIndex((l, i) => i > newSectionIndex && l.includes("Should become"));
|
|
37
|
+
t.true(lines[nextHeadingIndex].startsWith("##"));
|
|
38
|
+
});
|
|
39
|
+
test("normalizes list markers - converts * to -", (t) => {
|
|
40
|
+
const markdown = `
|
|
41
|
+
* Item 1
|
|
42
|
+
* Item 2
|
|
43
|
+
+ Item 3
|
|
44
|
+
`.trim();
|
|
45
|
+
const result = formatForLLM(markdown);
|
|
46
|
+
t.true(result.includes("- Item 1"));
|
|
47
|
+
t.true(result.includes("- Item 2"));
|
|
48
|
+
t.true(result.includes("- Item 3"));
|
|
49
|
+
t.false(result.includes("* Item"));
|
|
50
|
+
t.false(result.includes("+ Item"));
|
|
51
|
+
});
|
|
52
|
+
test("normalizes list indentation - uses 2 spaces", (t) => {
|
|
53
|
+
const markdown = `
|
|
54
|
+
- Item 1
|
|
55
|
+
- Nested 1
|
|
56
|
+
- Nested 2
|
|
57
|
+
- Nested 3
|
|
58
|
+
`.trim();
|
|
59
|
+
const result = formatForLLM(markdown);
|
|
60
|
+
t.true(result.includes("- Item 1"));
|
|
61
|
+
t.true(result.includes(" - Nested 1"));
|
|
62
|
+
t.true(result.includes(" - Nested 2"));
|
|
63
|
+
t.true(result.includes(" - Nested 3"));
|
|
64
|
+
});
|
|
65
|
+
test("normalizes list indentation - fixes inconsistent spacing", (t) => {
|
|
66
|
+
const markdown = `
|
|
67
|
+
- Item 1
|
|
68
|
+
- Nested (3 spaces)
|
|
69
|
+
- Deep nested (5 spaces)
|
|
70
|
+
`.trim();
|
|
71
|
+
const result = formatForLLM(markdown);
|
|
72
|
+
const lines = result.split("\n");
|
|
73
|
+
// Should normalize to proper 2-space indentation
|
|
74
|
+
t.true(lines.some((l) => l.match(/^ - Nested/)));
|
|
75
|
+
t.true(lines.some((l) => l.match(/^ - Deep nested/)));
|
|
76
|
+
});
|
|
77
|
+
test("cleans inline formatting - reduces excessive emphasis", (t) => {
|
|
78
|
+
const markdown = "This is ***very important*** text";
|
|
79
|
+
const result = formatForLLM(markdown);
|
|
80
|
+
t.true(result.includes("**very important**"));
|
|
81
|
+
t.false(result.includes("***"));
|
|
82
|
+
});
|
|
83
|
+
test("cleans inline formatting - removes spaces inside emphasis", (t) => {
|
|
84
|
+
const markdown = "This is ** spaced ** and * also spaced * text";
|
|
85
|
+
const result = formatForLLM(markdown);
|
|
86
|
+
t.true(result.includes("**spaced**"));
|
|
87
|
+
t.true(result.includes("*also spaced*"));
|
|
88
|
+
});
|
|
89
|
+
test("cleans inline formatting - handles multiple emphasis patterns", (t) => {
|
|
90
|
+
const markdown = "** bold ** and **** extra bold ****";
|
|
91
|
+
const result = formatForLLM(markdown);
|
|
92
|
+
t.true(result.includes("**bold**"));
|
|
93
|
+
t.true(result.includes("**extra bold**"));
|
|
94
|
+
});
|
|
95
|
+
test("enhances code blocks - adds line breaks before", (t) => {
|
|
96
|
+
const markdown = "Some text\`\`\`javascript\ncode\n\`\`\`";
|
|
97
|
+
const result = formatForLLM(markdown);
|
|
98
|
+
t.true(result.includes("Some text\n\n\`\`\`"));
|
|
99
|
+
});
|
|
100
|
+
test("enhances code blocks - adds line breaks after opening", (t) => {
|
|
101
|
+
const markdown = "\`\`\`javascriptcode here\`\`\`";
|
|
102
|
+
const result = formatForLLM(markdown);
|
|
103
|
+
t.true(result.includes("\`\`\`\njavascript"));
|
|
104
|
+
});
|
|
105
|
+
test("enhances code blocks - properly spaces full code block", (t) => {
|
|
106
|
+
const markdown = "Text\`\`\`js\nconst x = 1;\n\`\`\`More text";
|
|
107
|
+
const result = formatForLLM(markdown);
|
|
108
|
+
t.true(result.includes("Text\n\n\`\`\`"));
|
|
109
|
+
t.true(result.includes("\`\`\`\njs"));
|
|
110
|
+
});
|
|
111
|
+
test("optimizes link formatting - converts reference to inline", (t) => {
|
|
112
|
+
const markdown = `
|
|
113
|
+
This is a [link][ref] to something.
|
|
114
|
+
|
|
115
|
+
[ref]: https://example.com
|
|
116
|
+
`.trim();
|
|
117
|
+
const result = formatForLLM(markdown);
|
|
118
|
+
t.true(result.includes("[link](https://example.com)"));
|
|
119
|
+
t.false(result.includes("[ref]:"));
|
|
120
|
+
});
|
|
121
|
+
test("optimizes link formatting - handles implicit references", (t) => {
|
|
122
|
+
const markdown = `
|
|
123
|
+
This is a [link][] to something.
|
|
124
|
+
|
|
125
|
+
[link]: https://example.com
|
|
126
|
+
`.trim();
|
|
127
|
+
const result = formatForLLM(markdown);
|
|
128
|
+
t.true(result.includes("[link](https://example.com)"));
|
|
129
|
+
});
|
|
130
|
+
test("optimizes link formatting - handles multiple references", (t) => {
|
|
131
|
+
const markdown = `
|
|
132
|
+
[Link 1][ref1] and [Link 2][ref2].
|
|
133
|
+
|
|
134
|
+
[ref1]: https://example.com
|
|
135
|
+
[ref2]: https://other.com
|
|
136
|
+
`.trim();
|
|
137
|
+
const result = formatForLLM(markdown);
|
|
138
|
+
t.true(result.includes("[Link 1](https://example.com)"));
|
|
139
|
+
t.true(result.includes("[Link 2](https://other.com)"));
|
|
140
|
+
t.false(result.includes("[ref1]:"));
|
|
141
|
+
t.false(result.includes("[ref2]:"));
|
|
142
|
+
});
|
|
143
|
+
test("optimizes link formatting - case insensitive references", (t) => {
|
|
144
|
+
const markdown = `
|
|
145
|
+
[Link][REF]
|
|
146
|
+
|
|
147
|
+
[ref]: https://example.com
|
|
148
|
+
`.trim();
|
|
149
|
+
const result = formatForLLM(markdown);
|
|
150
|
+
t.true(result.includes("[Link](https://example.com)"));
|
|
151
|
+
});
|
|
152
|
+
test("optimizes link formatting - preserves unmatched references", (t) => {
|
|
153
|
+
const markdown = "[Link][missing]";
|
|
154
|
+
const result = formatForLLM(markdown);
|
|
155
|
+
t.true(result.includes("[Link][missing]"));
|
|
156
|
+
});
|
|
157
|
+
test("handles complex markdown with all transformations", (t) => {
|
|
158
|
+
const markdown = `
|
|
159
|
+
# Title
|
|
160
|
+
#### Should be h2
|
|
161
|
+
|
|
162
|
+
- Item 1
|
|
163
|
+
- Nested
|
|
164
|
+
|
|
165
|
+
This is ***bold*** and **emphasized**.
|
|
166
|
+
|
|
167
|
+
[Link][ref]
|
|
168
|
+
|
|
169
|
+
[ref]: https://example.com
|
|
170
|
+
|
|
171
|
+
Some text\`\`\`js
|
|
172
|
+
code
|
|
173
|
+
\`\`\`
|
|
174
|
+
`.trim();
|
|
175
|
+
const result = formatForLLM(markdown);
|
|
176
|
+
// Check heading normalization
|
|
177
|
+
t.true(result.includes("# Title"));
|
|
178
|
+
t.true(result.includes("## Should be h2"));
|
|
179
|
+
// Check list normalization
|
|
180
|
+
t.true(result.includes("- Item 1"));
|
|
181
|
+
t.true(result.includes(" - Nested"));
|
|
182
|
+
// Check inline formatting
|
|
183
|
+
t.true(result.includes("**bold**"));
|
|
184
|
+
t.true(result.includes("**emphasized**"));
|
|
185
|
+
// Check link optimization
|
|
186
|
+
t.true(result.includes("[Link](https://example.com)"));
|
|
187
|
+
// Check code block enhancement
|
|
188
|
+
t.true(result.includes("\n\n\`\`\`"));
|
|
189
|
+
});
|
|
190
|
+
test("preserves inline code", (t) => {
|
|
191
|
+
const markdown = "Use `const x = 1;` for variables";
|
|
192
|
+
const result = formatForLLM(markdown);
|
|
193
|
+
t.true(result.includes("`const x = 1;`"));
|
|
194
|
+
});
|
|
195
|
+
test("handles empty input", (t) => {
|
|
196
|
+
const result = formatForLLM("");
|
|
197
|
+
t.is(result, "");
|
|
198
|
+
});
|
|
199
|
+
test("handles markdown with no transformations needed", (t) => {
|
|
200
|
+
const markdown = `
|
|
201
|
+
# Title
|
|
202
|
+
## Subtitle
|
|
203
|
+
|
|
204
|
+
- Item 1
|
|
205
|
+
- Item 2
|
|
206
|
+
|
|
207
|
+
This is normal text with **bold**.
|
|
208
|
+
`.trim();
|
|
209
|
+
const result = formatForLLM(markdown);
|
|
210
|
+
t.true(result.includes("# Title"));
|
|
211
|
+
t.true(result.includes("## Subtitle"));
|
|
212
|
+
t.true(result.includes("- Item 1"));
|
|
213
|
+
t.true(result.includes("**bold**"));
|
|
214
|
+
});
|
|
215
|
+
test("normalizes mixed list styles in same list", (t) => {
|
|
216
|
+
const markdown = `
|
|
217
|
+
* Item 1
|
|
218
|
+
+ Item 2
|
|
219
|
+
- Item 3
|
|
220
|
+
`.trim();
|
|
221
|
+
const result = formatForLLM(markdown);
|
|
222
|
+
const lines = result.split("\n");
|
|
223
|
+
t.true(lines.every((l) => !l.trim() || l.trim().startsWith("- ")));
|
|
224
|
+
});
|
|
225
|
+
test("handles nested lists with mixed markers", (t) => {
|
|
226
|
+
const markdown = `
|
|
227
|
+
- Item 1
|
|
228
|
+
- Nested 1
|
|
229
|
+
- Deep nested
|
|
230
|
+
`.trim();
|
|
231
|
+
const result = formatForLLM(markdown);
|
|
232
|
+
t.true(result.includes("- Item 1"));
|
|
233
|
+
t.true(result.includes(" - Nested 1"));
|
|
234
|
+
t.true(result.includes(" - Deep nested"));
|
|
235
|
+
});
|
|
236
|
+
test("preserves ordered lists", (t) => {
|
|
237
|
+
const markdown = `
|
|
238
|
+
1. First
|
|
239
|
+
2. Second
|
|
240
|
+
3. Third
|
|
241
|
+
`.trim();
|
|
242
|
+
const result = formatForLLM(markdown);
|
|
243
|
+
t.true(result.includes("1. First"));
|
|
244
|
+
t.true(result.includes("2. Second"));
|
|
245
|
+
t.true(result.includes("3. Third"));
|
|
246
|
+
});
|
|
247
|
+
test("handles code blocks with language identifiers", (t) => {
|
|
248
|
+
const markdown = "\`\`\`typescript\nconst x: number = 1;\n\`\`\`";
|
|
249
|
+
const result = formatForLLM(markdown);
|
|
250
|
+
t.true(result.includes("\`\`\`"));
|
|
251
|
+
t.true(result.includes("typescript"));
|
|
252
|
+
});
|
|
253
|
+
test("handles multiple consecutive emphasis correctly", (t) => {
|
|
254
|
+
const markdown = "**bold** and **more bold** text";
|
|
255
|
+
const result = formatForLLM(markdown);
|
|
256
|
+
t.true(result.includes("**bold**"));
|
|
257
|
+
t.true(result.includes("**more bold**"));
|
|
258
|
+
});
|
|
259
|
+
test("normalizes heading levels progressively", (t) => {
|
|
260
|
+
const markdown = `
|
|
261
|
+
# H1
|
|
262
|
+
### H3 (stays H3, max jump is 2)
|
|
263
|
+
##### H5 (stays H5, max jump is 2)
|
|
264
|
+
`.trim();
|
|
265
|
+
const result = formatForLLM(markdown);
|
|
266
|
+
const lines = result.split("\n");
|
|
267
|
+
t.true(lines[0].startsWith("#"));
|
|
268
|
+
t.is(lines[0].match(/^#+/)?.[0].length, 1);
|
|
269
|
+
// H3 is allowed because it's currentLevel (1) + 2 = 3
|
|
270
|
+
t.true(lines[1].startsWith("###"));
|
|
271
|
+
t.is(lines[1].match(/^#+/)?.[0].length, 3);
|
|
272
|
+
// H5 is allowed because it's currentLevel (3) + 2 = 5
|
|
273
|
+
t.true(lines[2].startsWith("#####"));
|
|
274
|
+
t.is(lines[2].match(/^#+/)?.[0].length, 5);
|
|
275
|
+
});
|
|
276
|
+
//# sourceMappingURL=llm-formatter.spec.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-formatter.spec.js","sourceRoot":"","sources":["../../src/optimizers/llm-formatter.spec.ts"],"names":[],"mappings":"AAAA,uCAAuC;AAEvC,OAAO,IAAI,MAAM,KAAK,CAAC;AACvB,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAElD,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE,EAAE;IACjE,MAAM,QAAQ,GAAG;;;GAGhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC;IACrC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;AACpC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE,EAAE;IAClE,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC;IACrC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;AACzC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE,EAAE;IACjE,MAAM,QAAQ,GAAG;;;;;;GAMhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACjC,MAAM,eAAe,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;IAC1E,MAAM,gBAAgB,GAAG,KAAK,CAAC,SAAS,CACtC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,eAAe,IAAI,CAAC,CAAC,QAAQ,CAAC,eAAe,CAAC,CAC7D,CAAC;IAEF,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;AACnD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,2CAA2C,EAAE,CAAC,CAAC,EAAE,EAAE;IACtD,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEH,IAAI,
CAAC,6CAA6C,EAAE,CAAC,CAAC,EAAE,EAAE;IACxD,MAAM,QAAQ,GAAG;;;;;GAKhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAC1C,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0DAA0D,EAAE,CAAC,CAAC,EAAE,EAAE;IACrE,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,iDAAiD;IACjD,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;IAClD,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC;AAC3D,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE,EAAE;IAClE,MAAM,QAAQ,GAAG,mCAAmC,CAAC;IACrD,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC;IAC9C,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;AAClC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,2DAA2D,EAAE,CAAC,CAAC,EAAE,EAAE;IACtE,MAAM,QAAQ,GAAG,+CAA+C,CAAC;IACjE,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC;AAC3C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,+DAA+D,EAAE,CAAC,CAAC,EAAE,EAAE;IAC1E,MAAM,QAAQ,GAAG,qCAAqC,CAAC;IACvD,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;AAC5C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,gDAAgD,EAAE,CAAC,CAAC,EAAE,EAAE;IAC3D,MAAM,QAAQ,GAAG,yCAAyC,CAAC;IAC3D,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,qBAAqB,CAAC,CAAC,CAAC;AACjD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE,EAAE;IAClE,MAA
M,QAAQ,GAAG,iCAAiC,CAAC;IACnD,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC;AAChD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,wDAAwD,EAAE,CAAC,CAAC,EAAE,EAAE;IACnE,MAAM,QAAQ,GAAG,6CAA6C,CAAC;IAC/D,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAC1C,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;AACxC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0DAA0D,EAAE,CAAC,CAAC,EAAE,EAAE;IACrE,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;IACvD,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yDAAyD,EAAE,CAAC,CAAC,EAAE,EAAE;IACpE,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;AACzD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yDAAyD,EAAE,CAAC,CAAC,EAAE,EAAE;IACpE,MAAM,QAAQ,GAAG;;;;;GAKhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,+BAA+B,CAAC,CAAC,CAAC;IACzD,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;IACvD,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yDAAyD,EAAE,CAAC,CAAC,EAAE,EAAE;IACpE,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;AACzD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,4DAA4D,EAAE,CAAC,CAAC,EAAE,EAAE;IACvE,MAAM,QAAQ,GAAG,iBAAiB,CAAC;IACnC,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE,EAAE;IAC9D,MAAM,QAAQ,GAAG;;;;;;;;;;;;;;;;GAgBhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC
;IAEtC,8BAA8B;IAC9B,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC,CAAC;IAE3C,2BAA2B;IAC3B,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;IAEtC,0BAA0B;IAC1B,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAE1C,0BAA0B;IAC1B,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;IAEvD,+BAA+B;IAC/B,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;AACxC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uBAAuB,EAAE,CAAC,CAAC,EAAE,EAAE;IAClC,MAAM,QAAQ,GAAG,kCAAkC,CAAC;IACpD,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;AAC5C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,qBAAqB,EAAE,CAAC,CAAC,EAAE,EAAE;IAChC,MAAM,MAAM,GAAG,YAAY,CAAC,EAAE,CAAC,CAAC;IAChC,CAAC,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,iDAAiD,EAAE,CAAC,CAAC,EAAE,EAAE;IAC5D,MAAM,QAAQ,GAAG;;;;;;;;GAQhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;IACvC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,2CAA2C,EAAE,CAAC,CAAC,EAAE,EAAE;IACtD,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACrE,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yCAAyC,EAAE,CAAC,CAAC,EAAE,EAAE;IACpD,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,
UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,mBAAmB,CAAC,CAAC,CAAC;AAC/C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yBAAyB,EAAE,CAAC,CAAC,EAAE,EAAE;IACpC,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC;IACrC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,+CAA+C,EAAE,CAAC,CAAC,EAAE,EAAE;IAC1D,MAAM,QAAQ,GAAG,gDAAgD,CAAC;IAClE,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IAClC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,CAAC;AACxC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,iDAAiD,EAAE,CAAC,CAAC,EAAE,EAAE;IAC5D,MAAM,QAAQ,GAAG,iCAAiC,CAAC;IACnD,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC;AAC3C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yCAAyC,EAAE,CAAC,CAAC,EAAE,EAAE;IACpD,MAAM,QAAQ,GAAG;;;;GAIhB,CAAC,IAAI,EAAE,CAAC;IAET,MAAM,MAAM,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IACtC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IACjC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAE3C,sDAAsD;IACtD,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC;IACnC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAE3C,sDAAsD;IACtD,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;IACrC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// src/optimizers/structure-enhancer.ts
|
|
2
|
-
import * as cheerio from "cheerio";
|
|
2
|
+
import * as cheerio from "cheerio/slim";
|
|
3
3
|
/**
|
|
4
4
|
* Enhance HTML structure for better markdown conversion
|
|
5
5
|
* - Improve heading hierarchy
|
|
@@ -18,12 +18,12 @@ export function enhanceStructure(html) {
|
|
|
18
18
|
}
|
|
19
19
|
function normalizeHeadings($) {
|
|
20
20
|
// Ensure headings have proper hierarchy
|
|
21
|
-
//
|
|
21
|
+
// biome-ignore lint/suspicious/noExplicitAny: Cheerio type compatibility
|
|
22
22
|
const headings = [];
|
|
23
23
|
$("h1, h2, h3, h4, h5, h6").each((_, el) => {
|
|
24
24
|
const $el = $(el);
|
|
25
25
|
const tagName = el.tagName?.toLowerCase() || "";
|
|
26
|
-
const level = parseInt(tagName.substring(1));
|
|
26
|
+
const level = parseInt(tagName.substring(1), 10);
|
|
27
27
|
headings.push({ level, $el });
|
|
28
28
|
});
|
|
29
29
|
// Adjust heading levels if they skip
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"structure-enhancer.js","sourceRoot":"","sources":["../../src/optimizers/structure-enhancer.ts"],"names":[],"mappings":"AAAA,uCAAuC;AAEvC,OAAO,KAAK,OAAO,MAAM,
|
|
1
|
+
{"version":3,"file":"structure-enhancer.js","sourceRoot":"","sources":["../../src/optimizers/structure-enhancer.ts"],"names":[],"mappings":"AAAA,uCAAuC;AAEvC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC;;;;;GAKG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,iCAAiC;IACjC,iBAAiB,CAAC,CAAC,CAAC,CAAC;IAErB,wCAAwC;IACxC,uBAAuB,CAAC,CAAC,CAAC,CAAC;IAE3B,+DAA+D;IAC/D,qBAAqB,CAAC,CAAC,CAAC,CAAC;IAEzB,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;AAClB,CAAC;AAED,SAAS,iBAAiB,CAAC,CAAqB;IAC9C,wCAAwC;IACxC,yEAAyE;IACzE,MAAM,QAAQ,GAAmD,EAAE,CAAC;IAEpE,CAAC,CAAC,wBAAwB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACzC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAChD,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACjD,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,qCAAqC;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,QAAQ,CAAC,OAAO,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE;QAClC,IAAI,KAAK,GAAG,SAAS,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,SAAS,GAAG,CAAC,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC9B,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,EAAE,CAAC;gBACT,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,MAAM,IAAI,IAAI,KAAK,MAAM,GAAG,CAAC,CAAC,CAAC;YACvD,CAAC;YACD,SAAS,GAAG,QAAQ,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,SAAS,GAAG,KAAK,CAAC;QACpB,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,uBAAuB,CAAC,CAAqB;IACpD,yCAAyC;IACzC,CAAC,CAAC,8CAA8C,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAC/D,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI;YAAE,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAClC,CAAC,CAAC,CAAC;IAEH,4DAA4D;IAC5D,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAEhC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,EA
AE,WAAW,EAAE,CAAC;YAChE,IACE,OAAO;gBACP,CAAC,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,EACnE,CAAC;gBACD,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;gBACxB,IAAI,IAAI;oBAAE,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YAClC,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,qBAAqB,CAAC,CAAqB;IAClD,kEAAkE;IAClE,CAAC,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAC5B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAE/B,gDAAgD;QAChD,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO;QAEtC,4DAA4D;QAC5D,MAAM,SAAS,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAEtC,IACE,IAAI,CAAC,MAAM,GAAG,CAAC;YACf,IAAI,CAAC,MAAM,GAAG,GAAG;YACjB,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC;gBACxC,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAC3C,KAAK,CAAC,QAAQ,CAAC,mBAAmB,CAAC;gBACnC,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAC,EACrC,CAAC;YACD,2BAA2B;YAC3B,MAAM,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC;YAC1D,GAAG,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structure-enhancer.spec.d.ts","sourceRoot":"","sources":["../../src/optimizers/structure-enhancer.spec.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
// src/optimizers/structure-enhancer.spec.ts
|
|
2
|
+
import test from "ava";
|
|
3
|
+
import { enhanceStructure } from "./structure-enhancer.js";
|
|
4
|
+
test("normalizes heading hierarchy - fixes skipped levels", (t) => {
|
|
5
|
+
const html = `
|
|
6
|
+
<div>
|
|
7
|
+
<h1>Title</h1>
|
|
8
|
+
<h5>Should be h2</h5>
|
|
9
|
+
</div>
|
|
10
|
+
`;
|
|
11
|
+
const result = enhanceStructure(html);
|
|
12
|
+
t.true(result.includes("<h1>Title</h1>"));
|
|
13
|
+
t.true(result.includes("<h2>Should be h2</h2>"));
|
|
14
|
+
t.false(result.includes("<h5>"));
|
|
15
|
+
});
|
|
16
|
+
test("normalizes heading hierarchy - respects proper sequence", (t) => {
|
|
17
|
+
const html = `
|
|
18
|
+
<div>
|
|
19
|
+
<h1>Level 1</h1>
|
|
20
|
+
<h2>Level 2</h2>
|
|
21
|
+
<h3>Level 3</h3>
|
|
22
|
+
</div>
|
|
23
|
+
`;
|
|
24
|
+
const result = enhanceStructure(html);
|
|
25
|
+
t.true(result.includes("<h1>Level 1</h1>"));
|
|
26
|
+
t.true(result.includes("<h2>Level 2</h2>"));
|
|
27
|
+
t.true(result.includes("<h3>Level 3</h3>"));
|
|
28
|
+
});
|
|
29
|
+
test("normalizes heading hierarchy - progressive adjustment", (t) => {
|
|
30
|
+
const html = `
|
|
31
|
+
<div>
|
|
32
|
+
<h1>H1</h1>
|
|
33
|
+
<h3>H3 becomes H2</h3>
|
|
34
|
+
<h6>H6 becomes H3</h6>
|
|
35
|
+
</div>
|
|
36
|
+
`;
|
|
37
|
+
const result = enhanceStructure(html);
|
|
38
|
+
t.true(result.includes("<h1>H1</h1>"));
|
|
39
|
+
t.true(result.includes("<h2>H3 becomes H2</h2>"));
|
|
40
|
+
t.true(result.includes("<h3>H6 becomes H3</h3>"));
|
|
41
|
+
t.false(result.includes("<h6>"));
|
|
42
|
+
});
|
|
43
|
+
test("normalizes heading hierarchy - handles multiple sections", (t) => {
|
|
44
|
+
const html = `
|
|
45
|
+
<div>
|
|
46
|
+
<h2>First section</h2>
|
|
47
|
+
<h3>Subsection</h3>
|
|
48
|
+
<h2>Second section</h2>
|
|
49
|
+
<h5>Should be h3</h5>
|
|
50
|
+
</div>
|
|
51
|
+
`;
|
|
52
|
+
const result = enhanceStructure(html);
|
|
53
|
+
// h2 at start becomes h1 (can't skip levels from 0)
|
|
54
|
+
t.true(result.includes("<h1>First section</h1>"));
|
|
55
|
+
// h3 after h1 becomes h2
|
|
56
|
+
t.true(result.includes("<h2>Subsection</h2>"));
|
|
57
|
+
// h2 after h2 stays h2
|
|
58
|
+
t.true(result.includes("<h2>Second section</h2>"));
|
|
59
|
+
// h5 after h2 becomes h3 (can only go up by 1)
|
|
60
|
+
t.true(result.includes("<h3>Should be h3</h3>"));
|
|
61
|
+
});
|
|
62
|
+
test("unwraps redundant nested divs", (t) => {
    // Three wrapper divs around a single paragraph.
    const markup = `
    <div>
      <div>
        <div>
          <p>Content</p>
        </div>
      </div>
    </div>
  `;
    const output = enhanceStructure(markup);
    t.true(output.includes("<p>Content</p>"));
    // At least one of the three wrappers must have been removed.
    const openingDivs = output.match(/<div>/g) || [];
    t.true(openingDivs.length < 3);
});
|
|
78
|
+
test("unwraps redundant nested spans", (t) => {
    // Two spans wrapping the same piece of text.
    const markup = `
    <span>
      <span>
        Text content
      </span>
    </span>
  `;
    const output = enhanceStructure(markup);
    t.true(output.includes("Text content"));
    // At least one of the two wrappers must have been removed.
    const openingSpans = output.match(/<span>/g) || [];
    t.true(openingSpans.length < 2);
});
|
|
92
|
+
test("unwraps paragraphs containing block elements", (t) => {
    const markup = `
    <div>
      <p><div>Should be unwrapped</div></p>
      <p><blockquote>Quote</blockquote></p>
      <p><ul><li>List</li></ul></p>
    </div>
  `;
    const output = enhanceStructure(markup);
    // None of the paragraph-wrapping-a-block sequences may remain.
    const forbiddenSequences = ["<p><div>", "<p><blockquote>", "<p><ul>"];
    for (const sequence of forbiddenSequences) {
        t.false(output.includes(sequence));
    }
    // The wrapped text itself has to survive.
    const survivingText = ["Should be unwrapped", "Quote", "List"];
    for (const text of survivingText) {
        t.true(output.includes(text));
    }
});
|
|
108
|
+
test("preserves paragraphs with inline content", (t) => {
    // Paragraphs that hold only inline children must not be unwrapped.
    const html = `
    <div>
      <p>Normal paragraph with <strong>emphasis</strong></p>
      <p><span>With span</span></p>
    </div>
  `;
    const result = enhanceStructure(html);
    t.true(result.includes("<p>Normal paragraph"));
    t.true(result.includes("<strong>emphasis</strong>"));
    // The second fixture paragraph was previously never checked. Assert only
    // on its text, because the span wrapper itself may legitimately be
    // rewritten by other passes of enhanceStructure.
    t.true(result.includes("With span"));
});
|
|
119
|
+
test("converts title-classed divs to headings", (t) => {
    // A short, text-only div carrying class="title" is expected to become an h3.
    const markup = `
    <div>
      <div class="title">This looks like a title</div>
      <p>Content</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const becameHeading = output.includes("<h3>This looks like a title</h3>");
    const keptClassAttribute = output.includes('class="title"');
    t.true(becameHeading);
    t.false(keptClassAttribute);
});
|
|
130
|
+
test("converts heading-classed divs to headings", (t) => {
    // Same expectation as for class="title", but keyed on class="heading".
    const markup = `
    <div>
      <div class="heading">Section Heading</div>
      <p>Content</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const becameHeading = output.includes("<h3>Section Heading</h3>");
    t.true(becameHeading);
});
|
|
140
|
+
test("converts bold styled divs to headings", (t) => {
    // Covers the inline style both with and without a space after the colon.
    const markup = `
    <div>
      <div style="font-weight: bold">Bold Title</div>
      <div style="font-weight:bold">Another Title</div>
      <p>Content</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const expectedHeadings = ["<h3>Bold Title</h3>", "<h3>Another Title</h3>"];
    for (const heading of expectedHeadings) {
        t.true(output.includes(heading));
    }
});
|
|
152
|
+
test("converts bold styled spans to headings", (t) => {
    // A bold-styled span is expected to be promoted just like a bold div.
    const markup = `
    <div>
      <span style="font-weight: bold">Span Title</span>
      <p>Content</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const becameHeading = output.includes("<h3>Span Title</h3>");
    t.true(becameHeading);
});
|
|
162
|
+
test("does not convert long text to headings", (t) => {
    // 150 characters of text inside a title-classed div must stay a div.
    const oversizedText = "A".repeat(150);
    const markup = `
    <div>
      <div class="title">${oversizedText}</div>
    </div>
  `;
    const output = enhanceStructure(markup);
    const producedHeading = output.includes("<h3>");
    const keptText = output.includes(oversizedText);
    t.false(producedHeading);
    t.true(keptText);
});
|
|
173
|
+
test("does not convert divs with children to headings", (t) => {
    // A title-classed div that contains child elements is left alone.
    const markup = `
    <div>
      <div class="title">
        <span>Nested</span>
        <span>Content</span>
      </div>
    </div>
  `;
    const output = enhanceStructure(markup);
    const producedHeading = output.includes("<h3>");
    const keptNestedSpan = output.includes("<span>Nested</span>");
    t.false(producedHeading);
    t.true(keptNestedSpan);
});
|
|
186
|
+
test("does not convert empty divs to headings", (t) => {
    const markup = `
    <div>
      <div class="title"></div>
      <p>Content</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    // An empty title div must not yield any h3 at all.
    const headingMatches = output.match(/<h3>/g) || [];
    t.is(headingMatches.length, 0);
});
|
|
198
|
+
test("handles complex nested structure", (t) => {
    // One fixture combining a skipped heading level, a title-classed div,
    // and a paragraph wrapping a block element, all inside nested divs.
    const markup = `
    <div>
      <h1>Main Title</h1>
      <div>
        <div>
          <h5>Should be h2</h5>
          <div class="title">Pseudo heading</div>
          <p><blockquote>Quote</blockquote></p>
        </div>
      </div>
    </div>
  `;
    const output = enhanceStructure(markup);
    const expectedFragments = [
        "<h1>Main Title</h1>",
        "<h2>Should be h2</h2>",
        "<h3>Pseudo heading</h3>",
    ];
    for (const fragment of expectedFragments) {
        t.true(output.includes(fragment));
    }
    t.false(output.includes("<p><blockquote>"));
});
|
|
217
|
+
test("preserves heading content with HTML entities", (t) => {
    // The fixture must contain real entity references. A raw `<Level>` would
    // be parsed as an element rather than text, and a bare `&` is not an
    // entity, so neither would exercise what this test's title describes.
    const html = `
    <div>
      <h1>Title &amp; Subtitle</h1>
      <h5>Second &lt;Level&gt;</h5>
    </div>
  `;
    const result = enhanceStructure(html);
    // NOTE(review): assumes enhanceStructure serializes text nodes with the
    // entities still escaped - confirm against the implementation.
    t.true(result.includes("Title &amp; Subtitle"));
    t.true(result.includes("Second &lt;Level&gt;"));
});
|
|
228
|
+
test("handles headings with nested elements", (t) => {
    // Inline children of a heading must be carried over when the tag changes.
    const markup = `
    <div>
      <h1>Title <strong>with emphasis</strong></h1>
      <h5>Level <em>five</em></h5>
    </div>
  `;
    const output = enhanceStructure(markup);
    const expectedHeadings = [
        "<h1>Title <strong>with emphasis</strong></h1>",
        "<h2>Level <em>five</em></h2>",
    ];
    for (const heading of expectedHeadings) {
        t.true(output.includes(heading));
    }
});
|
|
239
|
+
test("handles empty input", (t) => {
    // An empty string goes in, an empty string comes out.
    t.is(enhanceStructure(""), "");
});
|
|
243
|
+
test("handles input with no transformations needed", (t) => {
    // Already well-structured markup: every element should pass through.
    const markup = `
    <div>
      <h1>Title</h1>
      <h2>Subtitle</h2>
      <p>Content</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const passthroughFragments = [
        "<h1>Title</h1>",
        "<h2>Subtitle</h2>",
        "<p>Content</p>",
    ];
    for (const fragment of passthroughFragments) {
        t.true(output.includes(fragment));
    }
});
|
|
256
|
+
test("converts multiple pseudo-headings in same document", (t) => {
    // Mixes the three pseudo-heading triggers: class="title",
    // class="heading", and an inline bold style.
    const markup = `
    <div>
      <div class="title">First Title</div>
      <p>Content 1</p>
      <div class="heading">Second Title</div>
      <p>Content 2</p>
      <span style="font-weight: bold">Third Title</span>
      <p>Content 3</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const expectedHeadings = [
        "<h3>First Title</h3>",
        "<h3>Second Title</h3>",
        "<h3>Third Title</h3>",
    ];
    for (const heading of expectedHeadings) {
        t.true(output.includes(heading));
    }
});
|
|
272
|
+
test("handles case-insensitive class matching", (t) => {
    // Class names differ from the lowercase forms only by letter case.
    const markup = `
    <div>
      <div class="Title">Mixed Case Title</div>
      <div class="HEADING">Upper Case Heading</div>
    </div>
  `;
    const output = enhanceStructure(markup);
    const expectedHeadings = [
        "<h3>Mixed Case Title</h3>",
        "<h3>Upper Case Heading</h3>",
    ];
    for (const heading of expectedHeadings) {
        t.true(output.includes(heading));
    }
});
|
|
283
|
+
test("unwraps nested paragraphs with pre elements", (t) => {
    const markup = `
    <div>
      <p><pre>Code block</pre></p>
    </div>
  `;
    const output = enhanceStructure(markup);
    // The pre element stays intact, but no longer directly inside a paragraph.
    const stillWrapped = output.includes("<p><pre>");
    const keptPre = output.includes("<pre>Code block</pre>");
    t.false(stillWrapped);
    t.true(keptPre);
});
|
|
293
|
+
test("unwraps nested paragraphs with table elements", (t) => {
    const markup = `
    <div>
      <p><table><tr><td>Cell</td></tr></table></p>
    </div>
  `;
    const output = enhanceStructure(markup);
    // The table stays, but no longer directly inside a paragraph.
    const stillWrapped = output.includes("<p><table>");
    const keptTable = output.includes("<table>");
    t.false(stillWrapped);
    t.true(keptTable);
});
|
|
303
|
+
test("preserves paragraph with multiple inline elements", (t) => {
    // Several inline children in one paragraph must not trigger unwrapping.
    const markup = `
    <div>
      <p><strong>Bold</strong> and <em>italic</em> text</p>
    </div>
  `;
    const output = enhanceStructure(markup);
    const expectedFragments = [
        "<p>",
        "<strong>Bold</strong>",
        "<em>italic</em>",
    ];
    for (const fragment of expectedFragments) {
        t.true(output.includes(fragment));
    }
});
|
|
314
|
+
test("handles heading hierarchy reset across sections", (t) => {
    // Two sibling containers, each with its own h1 followed by an h2.
    const markup = `
    <div>
      <h1>Section 1</h1>
      <h2>Subsection</h2>
    </div>
    <div>
      <h1>Section 2</h1>
      <h2>Subsection</h2>
    </div>
  `;
    const output = enhanceStructure(markup);
    // Both heading levels must each still appear exactly twice.
    const topLevelHeadings = output.match(/<h1>/g) || [];
    const secondLevelHeadings = output.match(/<h2>/g) || [];
    t.is(topLevelHeadings.length, 2);
    t.is(secondLevelHeadings.length, 2);
});
|
|
331
|
+
//# sourceMappingURL=structure-enhancer.spec.js.map
|