hdoc-tools 0.47.2 → 0.47.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -10
- package/hdoc-build-db.js +103 -116
- package/hdoc-build.js +44 -68
- package/hdoc-db.js +37 -15
- package/hdoc-module.js +57 -9
- package/hdoc-validate.js +105 -107
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -35,7 +35,7 @@ Returns statistics regarding the book you are working on:
|
|
|
35
35
|
- Number of Markdown Files in the Book
|
|
36
36
|
- Number of Static HTML Files in the Book
|
|
37
37
|
|
|
38
|
-
If the
|
|
38
|
+
If the `-v` switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
|
|
39
39
|
|
|
40
40
|
The book statistics do not include counts for any externally hosted content injected into the book content using the [[INCLUDE]] tags.
|
|
41
41
|
|
|
@@ -43,25 +43,25 @@ The book statistics do not include counts for any externally hosted content inje
|
|
|
43
43
|
|
|
44
44
|
Performs a local build of the book, validates the links and static content are present and correct and outputs as a ZIP file.
|
|
45
45
|
|
|
46
|
-
If the
|
|
46
|
+
If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
|
|
47
47
|
|
|
48
|
-
Use the
|
|
48
|
+
Use the `--set-version` argument to set the version number of the built book.
|
|
49
49
|
|
|
50
|
-
Use the
|
|
50
|
+
Use the `--no-color` argument to remove any color control characters from the output.
|
|
51
51
|
|
|
52
|
-
Use the
|
|
52
|
+
Use the `--no-links` argument to skip link output to CLI during validation.
|
|
53
53
|
|
|
54
54
|
### validate
|
|
55
55
|
|
|
56
56
|
Performs a minimum local build of the book, then validates the links and static content are present and correct.
|
|
57
57
|
|
|
58
|
-
If the
|
|
58
|
+
If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
|
|
59
59
|
|
|
60
|
-
Use the
|
|
60
|
+
Use the `--set-version` argument to set the version number of the built book.
|
|
61
61
|
|
|
62
|
-
Use the
|
|
62
|
+
Use the `--no-color` argument to remove any color control characters from the output.
|
|
63
63
|
|
|
64
|
-
Use the
|
|
64
|
+
Use the `--no-links` argument to skip link output to CLI during validation.
|
|
65
65
|
|
|
66
66
|
### serve
|
|
67
67
|
|
|
@@ -72,4 +72,4 @@ command `hdoc serve` and in a local browser go to the URL `http://localhost:3000
|
|
|
72
72
|
|
|
73
73
|
## Installation
|
|
74
74
|
|
|
75
|
-
|
|
75
|
+
> `npm install hdoc-tools -g`
|
package/hdoc-build-db.js
CHANGED
|
@@ -87,41 +87,43 @@
|
|
|
87
87
|
return response;
|
|
88
88
|
};
|
|
89
89
|
|
|
90
|
-
exports.populate_redirects = (db, redirect_records,
|
|
90
|
+
exports.populate_redirects = (db, redirect_records, _verbose = false) => {
|
|
91
91
|
const response = {
|
|
92
92
|
success: true,
|
|
93
93
|
errors: [],
|
|
94
94
|
index_success_count: 0,
|
|
95
95
|
};
|
|
96
96
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
response.success = false;
|
|
111
|
-
response.errors.push(
|
|
112
|
-
`Redirect record creation failed - ${redirect_records[i].url}: ${index_response.error}`,
|
|
113
|
-
);
|
|
114
|
-
} else {
|
|
115
|
-
response.index_success_count++;
|
|
97
|
+
// Prepare once, insert all in one transaction
|
|
98
|
+
const cols = db_schema.hdoc_redirects.map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
|
|
99
|
+
const stmt = db.prepare(`INSERT INTO hdoc_redirects (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
|
|
100
|
+
|
|
101
|
+
const run_all = db.transaction(() => {
|
|
102
|
+
for (const record of redirect_records) {
|
|
103
|
+
try {
|
|
104
|
+
stmt.run(record.url, record.location ? record.location : "", record.code);
|
|
105
|
+
response.index_success_count++;
|
|
106
|
+
} catch (e) {
|
|
107
|
+
response.success = false;
|
|
108
|
+
response.errors.push(`Redirect record creation failed - ${record.url}: ${e}`);
|
|
109
|
+
}
|
|
116
110
|
}
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
try {
|
|
114
|
+
run_all();
|
|
115
|
+
} catch (e) {
|
|
116
|
+
response.success = false;
|
|
117
|
+
response.errors.push(`Redirect index transaction failed: ${e}`);
|
|
117
118
|
}
|
|
119
|
+
|
|
118
120
|
console.log(
|
|
119
121
|
`\nRedirect Index Build Complete: ${response.index_success_count} document records created.`,
|
|
120
122
|
);
|
|
121
123
|
return response;
|
|
122
124
|
};
|
|
123
125
|
|
|
124
|
-
exports.populate_index =
|
|
126
|
+
exports.populate_index = (
|
|
125
127
|
db,
|
|
126
128
|
doc_id,
|
|
127
129
|
book_config,
|
|
@@ -136,124 +138,109 @@
|
|
|
136
138
|
|
|
137
139
|
if (!book_config.tags) book_config.tags = [];
|
|
138
140
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
141
|
+
// Build a prepared statement from a schema entry once, reusing it for every row.
|
|
142
|
+
// Previously insert_record() called db.prepare() on every single insert.
|
|
143
|
+
const make_stmt = (table) => {
|
|
144
|
+
const cols = db_schema[table].map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
|
|
145
|
+
return db.prepare(`INSERT INTO ${table} (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
|
|
146
|
+
};
|
|
147
|
+
const stmt_index = make_stmt("hdoc_index");
|
|
148
|
+
const stmt_meta = make_stmt("hdoc_meta");
|
|
149
|
+
const stmt_contrib = make_stmt("hdoc_contributors");
|
|
150
|
+
|
|
151
|
+
// A single transaction batches all disk flushes into one — critical for
|
|
152
|
+
// FTS5 which otherwise re-indexes on every individual insert.
|
|
153
|
+
const run_all = db.transaction(() => {
|
|
154
|
+
let curr_file = "";
|
|
155
|
+
for (const file of index_records) {
|
|
146
156
|
let index_path_name = file.relative_path.replaceAll("\\", "/");
|
|
147
157
|
if (
|
|
148
158
|
index_path_name.endsWith("/index.md") ||
|
|
149
159
|
index_path_name.endsWith("/index.html") ||
|
|
150
160
|
index_path_name.endsWith("/index.htm")
|
|
151
161
|
) {
|
|
152
|
-
index_path_name = index_path_name.substring(
|
|
153
|
-
0,
|
|
154
|
-
index_path_name.lastIndexOf("/"),
|
|
155
|
-
);
|
|
162
|
+
index_path_name = index_path_name.substring(0, index_path_name.lastIndexOf("/"));
|
|
156
163
|
}
|
|
157
164
|
index_path_name = `/${index_path_name.replace(path.extname(file.relative_path), "")}`;
|
|
158
165
|
|
|
159
|
-
let
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
if (file.index_html.id !== null)
|
|
165
|
-
index_content_path += `#${file.index_html.id}`;
|
|
166
|
+
let inserted_row_id = null;
|
|
167
|
+
const index_content_path = file.index_html.id !== null
|
|
168
|
+
? `${index_path_name}#${file.index_html.id}`
|
|
169
|
+
: index_path_name;
|
|
170
|
+
|
|
166
171
|
if (!file.inline) {
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
172
|
+
try {
|
|
173
|
+
const info = stmt_index.run(
|
|
174
|
+
index_content_path,
|
|
175
|
+
doc_id,
|
|
176
|
+
book_config.audience.join(","),
|
|
177
|
+
book_config.tags.join(","),
|
|
178
|
+
file.index_html.fm_props.title,
|
|
179
|
+
file.index_html.text,
|
|
180
|
+
file.index_html.preview,
|
|
181
|
+
book_config.productFamily,
|
|
182
|
+
file.md5,
|
|
183
|
+
file.lastmod,
|
|
184
|
+
file.status,
|
|
185
|
+
file.keywords,
|
|
186
|
+
);
|
|
187
|
+
inserted_row_id = info.lastInsertRowid;
|
|
188
|
+
} catch (e) {
|
|
189
|
+
console.error(`Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${e}`);
|
|
190
|
+
continue;
|
|
191
|
+
}
|
|
187
192
|
}
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
curr_file = index_path_name;
|
|
195
|
-
// Now add metadata
|
|
196
|
-
const meta_vals = [
|
|
193
|
+
|
|
194
|
+
if (curr_file === index_path_name) continue;
|
|
195
|
+
curr_file = index_path_name;
|
|
196
|
+
|
|
197
|
+
try {
|
|
198
|
+
const meta_info = stmt_meta.run(
|
|
197
199
|
index_path_name,
|
|
198
200
|
doc_id,
|
|
199
201
|
file.metadata.contributor_count,
|
|
200
202
|
file.metadata.edit_url,
|
|
201
203
|
file.metadata.last_commit,
|
|
202
204
|
file.pdf_size,
|
|
203
|
-
];
|
|
204
|
-
const meta_response = await hdoc_index.insert_record(
|
|
205
|
-
db,
|
|
206
|
-
"hdoc_meta",
|
|
207
|
-
db_schema.hdoc_meta,
|
|
208
|
-
meta_vals,
|
|
209
205
|
);
|
|
210
|
-
if (
|
|
211
|
-
console.
|
|
212
|
-
|
|
206
|
+
if (verbose) {
|
|
207
|
+
console.log(`Inserted index record ${inserted_row_id}: ${doc_id} - ${file.index_html.fm_props.title}`);
|
|
208
|
+
console.log(`Inserted index metadata record for index ID: ${meta_info.lastInsertRowid}`);
|
|
209
|
+
}
|
|
210
|
+
} catch (e) {
|
|
211
|
+
console.error(`Index metadata record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
|
|
212
|
+
continue;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
for (const contrib of file.contributors) {
|
|
216
|
+
try {
|
|
217
|
+
const cont_info = stmt_contrib.run(
|
|
218
|
+
index_path_name,
|
|
219
|
+
doc_id,
|
|
220
|
+
contrib.login,
|
|
221
|
+
contrib.name,
|
|
222
|
+
contrib.avatar_url,
|
|
223
|
+
contrib.html_url,
|
|
213
224
|
);
|
|
214
|
-
} else {
|
|
215
225
|
if (verbose) {
|
|
216
|
-
console.log(
|
|
217
|
-
`Inserted index record ${index_response.row_id}: ${doc_id} - ${file.index_html.fm_props.title}`,
|
|
218
|
-
);
|
|
219
|
-
console.log(
|
|
220
|
-
`Inserted index metadata record for index ID: ${meta_response.row_id}`,
|
|
221
|
-
);
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
// Now add contributor records
|
|
225
|
-
for (let j = 0; j < file.contributors.length; j++) {
|
|
226
|
-
const contrib_vals = [
|
|
227
|
-
index_path_name,
|
|
228
|
-
doc_id,
|
|
229
|
-
file.contributors[j].login,
|
|
230
|
-
file.contributors[j].name,
|
|
231
|
-
file.contributors[j].avatar_url,
|
|
232
|
-
file.contributors[j].html_url,
|
|
233
|
-
];
|
|
234
|
-
const cont_response = await hdoc_index.insert_record(
|
|
235
|
-
db,
|
|
236
|
-
"hdoc_contributors",
|
|
237
|
-
db_schema.hdoc_contributors,
|
|
238
|
-
contrib_vals,
|
|
239
|
-
);
|
|
240
|
-
if (!cont_response.success) {
|
|
241
|
-
console.error(
|
|
242
|
-
`Index document contributor record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${cont_response.error}`,
|
|
243
|
-
);
|
|
244
|
-
continue;
|
|
245
|
-
}
|
|
246
|
-
if (verbose) {
|
|
247
|
-
console.log(
|
|
248
|
-
`Inserted document contributor record ${cont_response.row_id}`,
|
|
249
|
-
);
|
|
250
|
-
}
|
|
226
|
+
console.log(`Inserted document contributor record ${cont_info.lastInsertRowid}`);
|
|
251
227
|
}
|
|
252
|
-
|
|
228
|
+
} catch (e) {
|
|
229
|
+
console.error(`Index document contributor record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
|
|
253
230
|
}
|
|
254
231
|
}
|
|
255
|
-
|
|
256
|
-
|
|
232
|
+
|
|
233
|
+
response.index_success_count++;
|
|
234
|
+
}
|
|
235
|
+
});
|
|
236
|
+
|
|
237
|
+
try {
|
|
238
|
+
run_all();
|
|
239
|
+
} catch (e) {
|
|
240
|
+
response.error = e.message;
|
|
241
|
+
console.error(`Index build transaction failed: ${e}`);
|
|
242
|
+
return response;
|
|
243
|
+
}
|
|
257
244
|
|
|
258
245
|
response.success = true;
|
|
259
246
|
console.log(
|
package/hdoc-build.js
CHANGED
|
@@ -252,6 +252,11 @@
|
|
|
252
252
|
// Render markdown into HTML
|
|
253
253
|
html_txt = md.render(md_txt);
|
|
254
254
|
|
|
255
|
+
// Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
|
|
256
|
+
// Replaces separate wrapHContent + getFirstHTMLHeading + get_html_read_time calls.
|
|
257
|
+
const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
|
|
258
|
+
html_txt = extracted.html;
|
|
259
|
+
|
|
255
260
|
// Parse frontmatter properties from the YAML block
|
|
256
261
|
let fm_contains_title = false;
|
|
257
262
|
let fm_contains_reading_time = false;
|
|
@@ -297,17 +302,12 @@
|
|
|
297
302
|
|
|
298
303
|
// Title from heading if not in frontmatter
|
|
299
304
|
if (!fm_contains_title) {
|
|
300
|
-
|
|
301
|
-
html_txt,
|
|
302
|
-
h_tags_to_search,
|
|
303
|
-
);
|
|
304
|
-
|
|
305
|
-
if (html_heading?.[0]?.children?.[0]?.data) {
|
|
305
|
+
if (extracted.firstHeadingText) {
|
|
306
306
|
fm_headers.push({
|
|
307
307
|
id: "title",
|
|
308
|
-
value:
|
|
308
|
+
value: extracted.firstHeadingText,
|
|
309
309
|
});
|
|
310
|
-
doc_title =
|
|
310
|
+
doc_title = extracted.firstHeadingText;
|
|
311
311
|
} else if (
|
|
312
312
|
file_path.name !== "description_ext.md" &&
|
|
313
313
|
file_path.name !== "article_ext.md" &&
|
|
@@ -320,24 +320,19 @@
|
|
|
320
320
|
}
|
|
321
321
|
|
|
322
322
|
// Description from first paragraph if not in frontmatter
|
|
323
|
-
if (!fm_contains_description) {
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
value:
|
|
329
|
-
`${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
|
|
330
|
-
});
|
|
331
|
-
}
|
|
323
|
+
if (!fm_contains_description && extracted.firstParagraphText) {
|
|
324
|
+
fm_headers.push({
|
|
325
|
+
id: "description",
|
|
326
|
+
value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
|
|
327
|
+
});
|
|
332
328
|
}
|
|
333
329
|
|
|
334
330
|
// Reading time from content if not in frontmatter
|
|
335
331
|
if (!fm_contains_reading_time) {
|
|
336
|
-
|
|
337
|
-
book_read_time += read_time_mins;
|
|
332
|
+
book_read_time += extracted.readTimeMins;
|
|
338
333
|
fm_headers.push({
|
|
339
334
|
id: "reading-time",
|
|
340
|
-
value:
|
|
335
|
+
value: extracted.readTimeMins,
|
|
341
336
|
});
|
|
342
337
|
}
|
|
343
338
|
} else {
|
|
@@ -348,6 +343,12 @@
|
|
|
348
343
|
// Check if we have a frontmatter comment
|
|
349
344
|
html_fm = hdoc.getHTMLFrontmatterHeader(html_txt);
|
|
350
345
|
|
|
346
|
+
// Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
|
|
347
|
+
// Must run after getHTMLFrontmatterHeader (which reads the top-level comment)
|
|
348
|
+
// but before any per-field extraction; the resulting html replaces html_txt.
|
|
349
|
+
const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
|
|
350
|
+
html_txt = extracted.html;
|
|
351
|
+
|
|
351
352
|
if (Object.keys(html_fm.fm_properties).length > 0) {
|
|
352
353
|
existing_fm_headers = true;
|
|
353
354
|
|
|
@@ -374,9 +375,8 @@
|
|
|
374
375
|
|
|
375
376
|
// Is reading-time in the fm headers?
|
|
376
377
|
if (html_fm.fm_properties["reading-time"] === undefined) {
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
html_fm.fm_properties["reading-time"] = read_time_mins;
|
|
378
|
+
book_read_time += extracted.readTimeMins;
|
|
379
|
+
html_fm.fm_properties["reading-time"] = extracted.readTimeMins;
|
|
380
380
|
}
|
|
381
381
|
|
|
382
382
|
for (const key in html_fm.fm_properties) {
|
|
@@ -397,21 +397,13 @@
|
|
|
397
397
|
file_path.name !== "article_ext.md" &&
|
|
398
398
|
file_path.name !== "internal_ext.md"
|
|
399
399
|
) {
|
|
400
|
-
|
|
401
|
-
const html_heading = hdoc.getFirstHTMLHeading(
|
|
402
|
-
html_txt,
|
|
403
|
-
h_tags_to_search,
|
|
404
|
-
);
|
|
405
|
-
|
|
406
|
-
if (html_heading?.[0]?.children?.[0]?.data) {
|
|
407
|
-
// We've found a heading tag, add that as a title to the existing frontmatter properties
|
|
400
|
+
if (extracted.firstHeadingText) {
|
|
408
401
|
fm_headers.push({
|
|
409
402
|
id: "title",
|
|
410
|
-
value:
|
|
403
|
+
value: extracted.firstHeadingText,
|
|
411
404
|
});
|
|
412
|
-
doc_title =
|
|
405
|
+
doc_title = extracted.firstHeadingText;
|
|
413
406
|
} else {
|
|
414
|
-
// No header tag, no frontmatter title, output a warning
|
|
415
407
|
console.info(
|
|
416
408
|
`[WARNING] No frontmatter title property, or ${h_tags_to_search.join(
|
|
417
409
|
", ",
|
|
@@ -426,12 +418,10 @@
|
|
|
426
418
|
html_fm.fm_properties.description !== undefined
|
|
427
419
|
) {
|
|
428
420
|
if (html_fm.fm_properties.description === "") {
|
|
429
|
-
|
|
430
|
-
if (html_p_tag?.[0]?.children?.[0]?.data) {
|
|
421
|
+
if (extracted.firstParagraphText) {
|
|
431
422
|
fm_headers.push({
|
|
432
423
|
id: "description",
|
|
433
|
-
value:
|
|
434
|
-
`${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
|
|
424
|
+
value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
|
|
435
425
|
});
|
|
436
426
|
}
|
|
437
427
|
} else {
|
|
@@ -440,30 +430,22 @@
|
|
|
440
430
|
value: html_fm.fm_properties.description.trim(),
|
|
441
431
|
});
|
|
442
432
|
}
|
|
443
|
-
} else {
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
value:
|
|
449
|
-
`${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
|
|
450
|
-
});
|
|
451
|
-
}
|
|
433
|
+
} else if (extracted.firstParagraphText) {
|
|
434
|
+
fm_headers.push({
|
|
435
|
+
id: "description",
|
|
436
|
+
value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
|
|
437
|
+
});
|
|
452
438
|
}
|
|
453
439
|
} else {
|
|
454
440
|
// We have no frontmatter headers, get and build one from the html headings
|
|
455
|
-
const html_heading = hdoc.getFirstHTMLHeading(
|
|
456
|
-
html_txt,
|
|
457
|
-
h_tags_to_search,
|
|
458
|
-
);
|
|
459
441
|
let doc_title_local = "";
|
|
460
442
|
// Add the title
|
|
461
|
-
if (
|
|
443
|
+
if (extracted.firstHeadingText) {
|
|
462
444
|
fm_headers.push({
|
|
463
445
|
id: "title",
|
|
464
|
-
value:
|
|
446
|
+
value: extracted.firstHeadingText,
|
|
465
447
|
});
|
|
466
|
-
doc_title_local =
|
|
448
|
+
doc_title_local = extracted.firstHeadingText;
|
|
467
449
|
doc_title = doc_title_local;
|
|
468
450
|
} else if (
|
|
469
451
|
file_path.name !== "description_ext.md" &&
|
|
@@ -478,19 +460,16 @@
|
|
|
478
460
|
}
|
|
479
461
|
|
|
480
462
|
// Add the reading time
|
|
481
|
-
|
|
482
|
-
book_read_time += read_time_mins;
|
|
463
|
+
book_read_time += extracted.readTimeMins;
|
|
483
464
|
fm_headers.push({
|
|
484
465
|
id: "reading-time",
|
|
485
|
-
value:
|
|
466
|
+
value: extracted.readTimeMins,
|
|
486
467
|
});
|
|
487
468
|
|
|
488
|
-
|
|
489
|
-
if (html_p_tag?.[0]?.children?.[0]?.data) {
|
|
469
|
+
if (extracted.firstParagraphText) {
|
|
490
470
|
fm_headers.push({
|
|
491
471
|
id: "description",
|
|
492
|
-
value:
|
|
493
|
-
`${doc_title_local}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
|
|
472
|
+
value: `${doc_title_local}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
|
|
494
473
|
});
|
|
495
474
|
}
|
|
496
475
|
}
|
|
@@ -675,9 +654,6 @@
|
|
|
675
654
|
}
|
|
676
655
|
if (pdf_size > 0) pdf_created++;
|
|
677
656
|
|
|
678
|
-
// Wrap h2 and h3 tags, plus content, in id'd divs
|
|
679
|
-
html_txt = hdoc.wrapHContent(html_txt);
|
|
680
|
-
|
|
681
657
|
if (inline_content) html_txt = `${fm_header_str}\n${html_txt}`;
|
|
682
658
|
else html_txt = `${fm_header_str}\n${doc_header}\n${html_txt}`;
|
|
683
659
|
|
|
@@ -732,7 +708,7 @@
|
|
|
732
708
|
|
|
733
709
|
const tidy_code_tags = (markdown, file) => {
|
|
734
710
|
let clean_markdown = markdown;
|
|
735
|
-
const json_to_tidy = clean_markdown.match(/```json[\r\n]
|
|
711
|
+
const json_to_tidy = clean_markdown.match(/```json[\r\n][\s\S]*?```/g);
|
|
736
712
|
if (json_to_tidy && json_to_tidy.length > 0) {
|
|
737
713
|
for (let i = 0; i < json_to_tidy.length; i++) {
|
|
738
714
|
if (json_to_tidy[i] !== "") {
|
|
@@ -754,7 +730,7 @@
|
|
|
754
730
|
}
|
|
755
731
|
}
|
|
756
732
|
|
|
757
|
-
const xml_to_tidy = clean_markdown.match(/```xml[\r\n]
|
|
733
|
+
const xml_to_tidy = clean_markdown.match(/```xml[\r\n][\s\S]*?```/g);
|
|
758
734
|
if (xml_to_tidy && xml_to_tidy.length > 0) {
|
|
759
735
|
for (let i = 0; i < xml_to_tidy.length; i++) {
|
|
760
736
|
if (xml_to_tidy[i] !== "") {
|
|
@@ -1229,7 +1205,7 @@
|
|
|
1229
1205
|
for (let i = 0; i < md_files.length; i++) {
|
|
1230
1206
|
mdPromiseArray.push(md_files[i]);
|
|
1231
1207
|
}
|
|
1232
|
-
const chunkSize =
|
|
1208
|
+
const chunkSize = 8;
|
|
1233
1209
|
for (let i = 0; i < mdPromiseArray.length; i += chunkSize) {
|
|
1234
1210
|
const chunk = mdPromiseArray.slice(i, i + chunkSize);
|
|
1235
1211
|
// do whatever
|
|
@@ -1357,7 +1333,7 @@
|
|
|
1357
1333
|
process.exit(1);
|
|
1358
1334
|
}
|
|
1359
1335
|
// Populate primary index tables
|
|
1360
|
-
const index =
|
|
1336
|
+
const index = hdoc_build_db.populate_index(
|
|
1361
1337
|
db.db,
|
|
1362
1338
|
doc_id,
|
|
1363
1339
|
hdocbook_config,
|
package/hdoc-db.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
(() => {
|
|
2
|
+
const cheerio = require("cheerio");
|
|
2
3
|
const path = require("node:path");
|
|
3
4
|
const hdoc = require(path.join(__dirname, "hdoc-module.js"));
|
|
4
5
|
|
|
@@ -63,23 +64,44 @@
|
|
|
63
64
|
sections: [],
|
|
64
65
|
};
|
|
65
66
|
|
|
66
|
-
//
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
// Single parse covers frontmatter extraction, full-text, and preview —
|
|
68
|
+
// previously three separate cheerio.load() calls.
|
|
69
|
+
const $ = cheerio.load(html_txt, { decodeEntities: false });
|
|
69
70
|
|
|
70
|
-
//
|
|
71
|
-
|
|
71
|
+
// Extract frontmatter properties from the leading HTML comment
|
|
72
|
+
if ($._root?.children && Array.isArray($._root.children)) {
|
|
73
|
+
for (const child of $._root.children) {
|
|
74
|
+
if (child.type === "comment" && child.data?.startsWith("[[FRONTMATTER")) {
|
|
75
|
+
for (const line of child.data.split(/\r?\n/)) {
|
|
76
|
+
if (line.includes(":")) {
|
|
77
|
+
const parts = line.split(/:(.*)/s);
|
|
78
|
+
if (parts.length > 1) {
|
|
79
|
+
const key = parts[0].trim().toLowerCase();
|
|
80
|
+
let val = parts[1].trim();
|
|
81
|
+
if (/^".*"$/.test(val)) val = val.slice(1, -1);
|
|
82
|
+
if (key === "title") {
|
|
83
|
+
val = val.replace(
|
|
84
|
+
/&amp;|&lt;|&gt;|&quot;|&#39;|&apos;|&#(\d+);|&#x([0-9a-fA-F]+);/g,
|
|
85
|
+
(m, dec, hex) => dec ? String.fromCharCode(+dec) : hex ? String.fromCharCode(parseInt(hex, 16)) : ({ "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'", "&apos;": "'" })[m],
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
response.fm_props[key] = val;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
break;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Full-document plain text for search indexing
|
|
98
|
+
const text = $("body").text();
|
|
99
|
+
|
|
100
|
+
// Preview: first paragraph texts joined, then truncated
|
|
101
|
+
let preview = $("p").map((_i, el) => $(el).text()).get().join("\n");
|
|
102
|
+
preview = hdoc.truncate_string(preview, 200, true).replace(/(?:\r\n|\r|\n)/g, " ");
|
|
72
103
|
|
|
73
|
-
|
|
74
|
-
let preview = hdoc.html_to_text(html_txt, { baseElement: "p" });
|
|
75
|
-
preview = hdoc
|
|
76
|
-
.truncate_string(preview, 200, true)
|
|
77
|
-
.replace(/(?:\r\n|\r|\n)/g, " ");
|
|
78
|
-
response.sections.push({
|
|
79
|
-
text: response.text,
|
|
80
|
-
preview: preview,
|
|
81
|
-
});
|
|
82
|
-
//}
|
|
104
|
+
response.sections.push({ text, preview });
|
|
83
105
|
return response;
|
|
84
106
|
};
|
|
85
107
|
})();
|
package/hdoc-module.js
CHANGED
|
@@ -48,6 +48,8 @@
|
|
|
48
48
|
}
|
|
49
49
|
};
|
|
50
50
|
|
|
51
|
+
exports.fetchWithRetry = fetchWithRetry;
|
|
52
|
+
|
|
51
53
|
exports.content_type_for_ext = (ext) => {
|
|
52
54
|
switch (ext) {
|
|
53
55
|
case ".z":
|
|
@@ -220,15 +222,9 @@
|
|
|
220
222
|
// Looks for h1 tags first, then hX, hY, hZ in order
|
|
221
223
|
exports.getFirstHTMLHeading = (html_body, h_to_search = ["h1"]) => {
|
|
222
224
|
const $ = cheerio.load(html_body);
|
|
223
|
-
for (
|
|
224
|
-
const
|
|
225
|
-
|
|
226
|
-
return $(this);
|
|
227
|
-
})
|
|
228
|
-
.get();
|
|
229
|
-
if (heading.length > 0) {
|
|
230
|
-
return heading[0];
|
|
231
|
-
}
|
|
225
|
+
for (const tag of h_to_search) {
|
|
226
|
+
const el = $(tag).first();
|
|
227
|
+
if (el.length > 0) return el;
|
|
232
228
|
}
|
|
233
229
|
return false;
|
|
234
230
|
};
|
|
@@ -277,6 +273,58 @@
|
|
|
277
273
|
return `<html><head></head><body>${result}</body></html>`;
|
|
278
274
|
};
|
|
279
275
|
|
|
276
|
+
// Combined single-pass version of wrapHContent + getFirstHTMLHeading + get_html_read_time.
|
|
277
|
+
// Iterates body contents once to wrap h2/h3 divs AND extract the first matching heading text,
|
|
278
|
+
// first paragraph text, and reading-time estimate — avoiding 3 extra cheerio.load() calls.
|
|
279
|
+
exports.wrapAndExtract = (htmlContent, h_tags_to_search = ["h1"]) => {
|
|
280
|
+
const $ = cheerio.load(htmlContent, { decodeEntities: false });
|
|
281
|
+
let result = '';
|
|
282
|
+
let inH2 = false;
|
|
283
|
+
let inH3 = false;
|
|
284
|
+
let firstHeadingText = null;
|
|
285
|
+
let firstParagraphText = null;
|
|
286
|
+
|
|
287
|
+
$('body').contents().each(function() {
|
|
288
|
+
const tagName = this.type === 'tag' ? this.name?.toLowerCase() : null;
|
|
289
|
+
const text = tagName ? $(this).text().trim() : null;
|
|
290
|
+
|
|
291
|
+
if (firstHeadingText === null && tagName && h_tags_to_search.includes(tagName)) {
|
|
292
|
+
firstHeadingText = text;
|
|
293
|
+
}
|
|
294
|
+
if (firstParagraphText === null && tagName === 'p') {
|
|
295
|
+
firstParagraphText = text;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if (tagName === 'h2') {
|
|
299
|
+
if (inH3) { result += '</div>'; inH3 = false; }
|
|
300
|
+
if (inH2) { result += '</div>'; inH2 = false; }
|
|
301
|
+
result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
|
|
302
|
+
inH2 = true;
|
|
303
|
+
} else if (tagName === 'h3') {
|
|
304
|
+
if (inH3) { result += '</div>'; inH3 = false; }
|
|
305
|
+
result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
|
|
306
|
+
inH3 = true;
|
|
307
|
+
} else {
|
|
308
|
+
result += $.html(this);
|
|
309
|
+
}
|
|
310
|
+
});
|
|
311
|
+
|
|
312
|
+
if (inH3) result += '</div>';
|
|
313
|
+
if (inH2) result += '</div>';
|
|
314
|
+
|
|
315
|
+
// Word count re-uses the already-parsed DOM — no extra cheerio.load()
|
|
316
|
+
const bodyText = $("body").text();
|
|
317
|
+
const wordCount = bodyText.trim().split(/\s+/).filter(Boolean).length;
|
|
318
|
+
const readTimeMins = wordCount === 0 ? 0 : (Math.round(wordCount / 200) || 1);
|
|
319
|
+
|
|
320
|
+
return {
|
|
321
|
+
html: `<html><head></head><body>${result}</body></html>`,
|
|
322
|
+
firstHeadingText,
|
|
323
|
+
firstParagraphText,
|
|
324
|
+
readTimeMins,
|
|
325
|
+
};
|
|
326
|
+
};
|
|
327
|
+
|
|
280
328
|
exports.getIDDivs = (html_body) => {
|
|
281
329
|
const $ = cheerio.load(html_body, {
|
|
282
330
|
decodeEntities: false,
|
package/hdoc-validate.js
CHANGED
|
@@ -8,7 +8,6 @@ const { error } = require("node:console");
|
|
|
8
8
|
const path = require("node:path");
|
|
9
9
|
const hdoc = require(path.join(__dirname, "hdoc-module.js"));
|
|
10
10
|
const translator = require("american-british-english-translator");
|
|
11
|
-
const puppeteer = require("puppeteer");
|
|
12
11
|
|
|
13
12
|
const spellcheck_options = {
|
|
14
13
|
british: true,
|
|
@@ -26,10 +25,11 @@ const { error } = require("node:console");
|
|
|
26
25
|
let private_repo = false;
|
|
27
26
|
let redirects = {};
|
|
28
27
|
let skip_link_file = '';
|
|
28
|
+
let _on_int_net_cached = null; // null = not yet checked; cached after first DNS lookup
|
|
29
29
|
const exclude_h1_count = {};
|
|
30
30
|
const exclude_spellcheck_output = [];
|
|
31
31
|
|
|
32
|
-
const excludeLink =
|
|
32
|
+
const excludeLink = (url) => {
|
|
33
33
|
if (exclude_links[url]) return true;
|
|
34
34
|
for (let key in exclude_links) {
|
|
35
35
|
if (Object.hasOwn(exclude_links, key)) {
|
|
@@ -465,18 +465,67 @@ const { error } = require("node:console");
|
|
|
465
465
|
return returnPaths;
|
|
466
466
|
}
|
|
467
467
|
|
|
468
|
-
|
|
468
|
+
// Headers that mimic a real Chrome browser request — sites doing bot detection
|
|
469
|
+
// check far more than just User-Agent (Accept, Sec-Fetch-*, client hints, etc.).
|
|
470
|
+
const _fetch_headers = {
|
|
471
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
472
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
473
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
474
|
+
'Accept-Encoding': 'gzip, deflate, br',
|
|
475
|
+
'Cache-Control': 'no-cache',
|
|
476
|
+
'Pragma': 'no-cache',
|
|
477
|
+
'Sec-Fetch-Dest': 'document',
|
|
478
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
479
|
+
'Sec-Fetch-Site': 'none',
|
|
480
|
+
'Sec-Fetch-User': '?1',
|
|
481
|
+
'Sec-Ch-Ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
|
|
482
|
+
'Sec-Ch-Ua-Mobile': '?0',
|
|
483
|
+
'Sec-Ch-Ua-Platform': '"Windows"',
|
|
484
|
+
'Upgrade-Insecure-Requests': '1',
|
|
485
|
+
};
|
|
486
|
+
|
|
487
|
+
// Checks a single external URL by sending a HEAD request, falling back to GET
|
|
488
|
+
// if the server returns 405 (Method Not Allowed) or 404 (some servers, e.g.
|
|
489
|
+
// marketplace.visualstudio.com, return 404 for HEAD even when the page exists).
|
|
490
|
+
// Retries up to 5 times on transient errors (5xx, 429, network failures).
|
|
491
|
+
// Returns the HTTP status code.
|
|
492
|
+
const fetchExternalLinkStatus = async (url) => {
|
|
493
|
+
const opts = { method: 'HEAD', headers: _fetch_headers, timeoutMs: 10000, redirect: 'follow' };
|
|
494
|
+
const resp = await hdoc.fetchWithRetry(url, opts);
|
|
495
|
+
if (resp.status === 404 || resp.status === 405) {
|
|
496
|
+
const getResp = await hdoc.fetchWithRetry(url, { ...opts, method: 'GET' });
|
|
497
|
+
return getResp.status;
|
|
498
|
+
}
|
|
499
|
+
return resp.status;
|
|
500
|
+
};
|
|
501
|
+
|
|
502
|
+
const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, global_links_checked, output_links) => {
|
|
469
503
|
const markdown_paths = getMDPathFromHtmlPath(htmlFile);
|
|
470
504
|
const markdown_content = fs.readFileSync(markdown_paths.markdownPath, 'utf8');
|
|
471
505
|
|
|
472
|
-
|
|
506
|
+
// Resolve the "are we on the internal network?" question once per process
|
|
507
|
+
// rather than once per internal.hornbill.com link.
|
|
508
|
+
const ensureIntNetCached = async () => {
|
|
509
|
+
if (_on_int_net_cached === null) {
|
|
510
|
+
try {
|
|
511
|
+
_on_int_net_cached = await checkHostExistsInDNS('docs-internal.hornbill.com');
|
|
512
|
+
} catch (_e) {
|
|
513
|
+
_on_int_net_cached = false;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
return _on_int_net_cached;
|
|
517
|
+
};
|
|
518
|
+
|
|
519
|
+
// Collect external links that need an HTTP check so they can be run
|
|
520
|
+
// concurrently rather than one-at-a-time.
|
|
521
|
+
const externalChecks = [];
|
|
522
|
+
|
|
473
523
|
for (let i = 0; i < links.length; i++) {
|
|
474
|
-
// Validate that link is a valid URL first
|
|
475
524
|
if (output_links) console.log(` - ${links[i]}`);
|
|
476
525
|
if (exclude_links[links[i]]) continue;
|
|
477
526
|
if (global_links_checked.includes(links[i])) continue;
|
|
478
527
|
global_links_checked.push(links[i]);
|
|
479
|
-
|
|
528
|
+
|
|
480
529
|
const valid_url = hdoc.valid_url(links[i]);
|
|
481
530
|
if (!valid_url) {
|
|
482
531
|
// Could be a relative path, check
|
|
@@ -509,7 +558,7 @@ const { error } = require("node:console");
|
|
|
509
558
|
messages[htmlFile.relativePath].push(
|
|
510
559
|
`Link is a properly formatted external URL: ${links[i]}`,
|
|
511
560
|
);
|
|
512
|
-
|
|
561
|
+
|
|
513
562
|
// Skip if it's the auto-generated edit url, as these could be part of a private repo which would return a 404
|
|
514
563
|
if (
|
|
515
564
|
hdocbook_config.publicSource !== undefined &&
|
|
@@ -524,38 +573,14 @@ const { error } = require("node:console");
|
|
|
524
573
|
fs.appendFileSync(skip_link_file, `${links[i]}\n`);
|
|
525
574
|
continue;
|
|
526
575
|
}
|
|
527
|
-
|
|
576
|
+
|
|
528
577
|
if (valid_url.protocol === "mailto:") {
|
|
529
578
|
fs.appendFileSync(skip_link_file, `${links[i]}\n`);
|
|
530
579
|
continue;
|
|
531
580
|
}
|
|
532
581
|
|
|
533
|
-
// Skip internal.hornbill.com link validation if run outside of the Hornbill network
|
|
534
|
-
if (links[i].toLowerCase().includes("internal.hornbill.com")) {
|
|
535
|
-
// DNS lookup internal docs endpoint
|
|
536
|
-
const hostname = 'docs-internal.hornbill.com';
|
|
537
|
-
let on_int_net = false;
|
|
538
|
-
try {
|
|
539
|
-
on_int_net = await checkHostExistsInDNS(hostname);
|
|
540
|
-
} catch (e) {
|
|
541
|
-
// Don't need to do anything here
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
if (!on_int_net) {
|
|
545
|
-
messages[htmlFile.relativePath].push(
|
|
546
|
-
`Outside of Hornbill network - skipping internal link validation for: ${links[i]}`,
|
|
547
|
-
);
|
|
548
|
-
fs.appendFileSync(skip_link_file, `${links[i]}\n`);
|
|
549
|
-
continue;
|
|
550
|
-
}
|
|
551
|
-
messages[htmlFile.relativePath].push(
|
|
552
|
-
`Inside of Hornbill network - performing internal link validation for: ${links[i]}`,
|
|
553
|
-
);
|
|
554
|
-
}
|
|
555
|
-
|
|
556
582
|
// Skip if the link is excluded in the project config
|
|
557
|
-
|
|
558
|
-
if (skip_link) {
|
|
583
|
+
if (excludeLink(links[i])) {
|
|
559
584
|
messages[htmlFile.relativePath].push(
|
|
560
585
|
`Skipping link validation for: ${links[i]}`,
|
|
561
586
|
);
|
|
@@ -563,99 +588,76 @@ const { error } = require("node:console");
|
|
|
563
588
|
}
|
|
564
589
|
|
|
565
590
|
if (
|
|
566
|
-
(links[i].toLowerCase().includes("docs.hornbill.com") ||
|
|
591
|
+
(links[i].toLowerCase().includes("docs.hornbill.com") ||
|
|
567
592
|
links[i].toLowerCase().includes("docs-internal.hornbill.com")) &&
|
|
568
593
|
!markdown_paths.relativePath.includes('/_inline/')
|
|
569
594
|
) {
|
|
570
595
|
const error_message = processErrorMessage(`Hornbill Docs links should not be fully-qualified: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
|
|
571
|
-
errors[htmlFile.relativePath].push(
|
|
596
|
+
errors[htmlFile.relativePath].push(error_message);
|
|
572
597
|
continue;
|
|
573
598
|
}
|
|
574
599
|
|
|
575
|
-
if (
|
|
600
|
+
if (
|
|
576
601
|
links[i].toLowerCase().includes("docs-internal.hornbill.com") &&
|
|
577
602
|
markdown_paths.relativePath.includes('/_inline/') &&
|
|
578
603
|
!private_repo
|
|
579
604
|
) {
|
|
580
605
|
// Is the parent book in a public repo? If so, flag this as an error.
|
|
581
606
|
const error_message = processErrorMessage(`Hornbill docs-internal links should not be used in public book inline content: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
|
|
582
|
-
errors[htmlFile.relativePath].push(
|
|
607
|
+
errors[htmlFile.relativePath].push(error_message);
|
|
583
608
|
continue;
|
|
584
609
|
}
|
|
585
610
|
|
|
586
|
-
//
|
|
587
|
-
const
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
611
|
+
// Capture url in closure for the async check below
|
|
612
|
+
const url = links[i];
|
|
613
|
+
const isInternal = url.toLowerCase().includes("internal.hornbill.com");
|
|
614
|
+
|
|
615
|
+
externalChecks.push(async () => {
|
|
616
|
+
// For internal.hornbill.com links, check network reachability first (result cached)
|
|
617
|
+
if (isInternal) {
|
|
618
|
+
const on_int_net = await ensureIntNetCached();
|
|
619
|
+
if (!on_int_net) {
|
|
620
|
+
messages[htmlFile.relativePath].push(
|
|
621
|
+
`Outside of Hornbill network - skipping internal link validation for: ${url}`,
|
|
622
|
+
);
|
|
623
|
+
fs.appendFileSync(skip_link_file, `${url}\n`);
|
|
624
|
+
return;
|
|
625
|
+
}
|
|
626
|
+
messages[htmlFile.relativePath].push(
|
|
627
|
+
`Inside of Hornbill network - performing internal link validation for: ${url}`,
|
|
628
|
+
);
|
|
629
|
+
}
|
|
592
630
|
|
|
593
631
|
try {
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
if (request.isNavigationRequest() && request.redirectChain().length) {
|
|
599
|
-
redirectChain = request.redirectChain().map((req) => req.url());
|
|
600
|
-
}
|
|
601
|
-
});
|
|
602
|
-
|
|
603
|
-
// Capture the response
|
|
604
|
-
page.on('response', (res) => {
|
|
605
|
-
const chain = res.request().redirectChain();
|
|
606
|
-
if (chain.length > 0) {
|
|
607
|
-
redirectChain = chain.map((req) => req.url());
|
|
608
|
-
lastRedirectStatus = res.status(); // Status of the last redirect
|
|
609
|
-
}
|
|
610
|
-
});
|
|
611
|
-
|
|
612
|
-
// Try loading the URL
|
|
613
|
-
response = await page.goto(links[i], { waitUntil: 'networkidle2', timeout: 10000 });
|
|
614
|
-
|
|
615
|
-
if (response) {
|
|
616
|
-
let status = response.status();
|
|
617
|
-
const contentType = response.headers()['content-type'];
|
|
618
|
-
|
|
619
|
-
// If it's a PDF switch to direct fetching
|
|
620
|
-
if (contentType && contentType.includes('application/')) {
|
|
621
|
-
status = await page.evaluate(async (url) => {
|
|
622
|
-
const res = await fetch(url, { method: 'HEAD' });
|
|
623
|
-
return res.status;
|
|
624
|
-
}, links[i]);
|
|
625
|
-
}
|
|
626
|
-
if ((status < 200 || status > 299) && status !== 304) {
|
|
627
|
-
if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && links[i].includes(".hornbill.com")) {
|
|
628
|
-
// STEVEG - do nothing here, as it always returns a 403 for Hornbill sites when accessing through GitHub Actions
|
|
629
|
-
// Works totally fine locally or in hdocpub, still trying to work out what's causing this in GitHub
|
|
630
|
-
} else {
|
|
631
|
-
throw `Unexpected Status Returned: ${status}`;
|
|
632
|
-
}
|
|
632
|
+
const status = await fetchExternalLinkStatus(url);
|
|
633
|
+
if ((status < 200 || status > 299) && status !== 304) {
|
|
634
|
+
if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && url.includes(".hornbill.com")) {
|
|
635
|
+
// Always returns 403 for Hornbill sites through GitHub Actions — not a real error
|
|
633
636
|
} else {
|
|
634
|
-
|
|
637
|
+
throw `Unexpected Status Returned: ${status}`;
|
|
635
638
|
}
|
|
636
639
|
} else {
|
|
637
|
-
|
|
640
|
+
fs.appendFileSync(skip_link_file, `${url}\n`);
|
|
638
641
|
}
|
|
639
|
-
} catch (
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
642
|
+
} catch (e) {
|
|
643
|
+
let error_message;
|
|
644
|
+
if (e instanceof AggregateError) {
|
|
645
|
+
error_message = processErrorMessage(`Issue with external link [${url}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, url);
|
|
646
|
+
} else {
|
|
647
|
+
error_message = processErrorMessage(`Issue with external link [${url}]: ${e}`, markdown_paths.relativePath, markdown_content, url);
|
|
648
|
+
}
|
|
649
|
+
if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
|
|
650
|
+
warnings[htmlFile.relativePath].push(error_message);
|
|
651
|
+
else
|
|
652
|
+
errors[htmlFile.relativePath].push(error_message);
|
|
648
653
|
}
|
|
649
|
-
|
|
650
|
-
warnings[htmlFile.relativePath].push(error_message);
|
|
651
|
-
else
|
|
652
|
-
errors[htmlFile.relativePath].push(error_message);
|
|
653
|
-
|
|
654
|
-
}
|
|
655
|
-
// Close the headless browser tab
|
|
656
|
-
page.close();
|
|
654
|
+
});
|
|
657
655
|
}
|
|
658
656
|
}
|
|
657
|
+
|
|
658
|
+
// Run all external HTTP checks concurrently — fetch is lightweight enough
|
|
659
|
+
// that uncapped concurrency is fine for the link counts seen in practice.
|
|
660
|
+
await Promise.all(externalChecks.map(fn => fn()));
|
|
659
661
|
};
|
|
660
662
|
|
|
661
663
|
const checkHostExistsInDNS = async (hostname) => {
|
|
@@ -1069,8 +1071,7 @@ const { error } = require("node:console");
|
|
|
1069
1071
|
|
|
1070
1072
|
|
|
1071
1073
|
const global_links_checked = [];
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
+
|
|
1074
1075
|
for (const key in html_to_validate) {
|
|
1075
1076
|
const file = html_to_validate[key];
|
|
1076
1077
|
// Check for British spellings in static HTML content
|
|
@@ -1095,7 +1096,7 @@ const { error } = require("node:console");
|
|
|
1095
1096
|
messages[file.relativePath].push("No links found in file");
|
|
1096
1097
|
} else {
|
|
1097
1098
|
console.log(`\r\nChecking ${links.href.length} Links in ${file.relativePath}`);
|
|
1098
|
-
await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project,
|
|
1099
|
+
await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, global_links_checked, output_links);
|
|
1099
1100
|
}
|
|
1100
1101
|
if (links.img.length === 0) {
|
|
1101
1102
|
messages[file.relativePath].push("No images found in file");
|
|
@@ -1107,9 +1108,6 @@ const { error } = require("node:console");
|
|
|
1107
1108
|
await checkTags(file);
|
|
1108
1109
|
}
|
|
1109
1110
|
|
|
1110
|
-
// Close the Chromium browser instance
|
|
1111
|
-
await validateBrowser.close();
|
|
1112
|
-
|
|
1113
1111
|
if (gen_exclude) console.log(JSON.stringify(excl_output, null, 2));
|
|
1114
1112
|
|
|
1115
1113
|
if (verbose) {
|