hdoc-tools 0.47.2 → 0.47.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -35,7 +35,7 @@ Returns statistics regarding the book you are working on:
35
35
  - Number of Markdown Files in the Book
36
36
  - Number of Static HTML Files in the Book
37
37
 
38
- If the -v switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
38
+ If the `-v` switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
39
39
 
40
40
  The book statistics do not include counts for any externally hosted content injected into the book content using the [[INCLUDE]] tags.
41
41
 
@@ -43,25 +43,25 @@ The book statistics do not include counts for any externally hosted content inje
43
43
 
44
44
  Performs a local build of the book, validates the links and static content are present and correct and outputs as a ZIP file.
45
45
 
46
- If the -v switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
46
+ If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
47
47
 
48
- Use the --set-version argument to set the version number of the built book.
48
+ Use the `--set-version` argument to set the version number of the built book.
49
49
 
50
- Use the --no-color argument to remove any color control characters from the output.
50
+ Use the `--no-color` argument to remove any color control characters from the output.
51
51
 
52
- Use the '--no-links' argument to skip link output to CLI during validation.
52
+ Use the `--no-links` argument to skip link output to CLI during validation.
53
53
 
54
54
  ### validate
55
55
 
56
56
  Performs a minimum local build of the book, then validates the links and static content are present and correct.
57
57
 
58
- If the -v switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
58
+ If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
59
59
 
60
- Use the --set-version argument to set the version number of the built book.
60
+ Use the `--set-version` argument to set the version number of the built book.
61
61
 
62
- Use the --no-color argument to remove any color control characters from the output.
62
+ Use the `--no-color` argument to remove any color control characters from the output.
63
63
 
64
- Use the '--no-links' argument to skip link output to CLI during validation.
64
+ Use the `--no-links` argument to skip link output to CLI during validation.
65
65
 
66
66
  ### serve
67
67
 
@@ -72,4 +72,4 @@ command `hdoc serve` and in a local browser go to the URL `http://localhost:3000
72
72
 
73
73
  ## Installation
74
74
 
75
- npm install hdoc-tools -g
75
+ > `npm install hdoc-tools -g`
package/hdoc-build-db.js CHANGED
@@ -87,41 +87,43 @@
87
87
  return response;
88
88
  };
89
89
 
90
- exports.populate_redirects = (db, redirect_records, verbose = false) => {
90
+ exports.populate_redirects = (db, redirect_records, _verbose = false) => {
91
91
  const response = {
92
92
  success: true,
93
93
  errors: [],
94
94
  index_success_count: 0,
95
95
  };
96
96
 
97
- for (let i = 0; i < redirect_records.length; i++) {
98
- const index_vals = [
99
- redirect_records[i].url,
100
- redirect_records[i].location ? redirect_records[i].location : "",
101
- redirect_records[i].code,
102
- ];
103
- const index_response = hdoc_index.insert_record(
104
- db,
105
- "hdoc_redirects",
106
- db_schema.hdoc_redirects,
107
- index_vals,
108
- );
109
- if (!index_response.success) {
110
- response.success = false;
111
- response.errors.push(
112
- `Redirect record creation failed - ${redirect_records[i].url}: ${index_response.error}`,
113
- );
114
- } else {
115
- response.index_success_count++;
97
+ // Prepare once, insert all in one transaction
98
+ const cols = db_schema.hdoc_redirects.map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
99
+ const stmt = db.prepare(`INSERT INTO hdoc_redirects (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
100
+
101
+ const run_all = db.transaction(() => {
102
+ for (const record of redirect_records) {
103
+ try {
104
+ stmt.run(record.url, record.location ? record.location : "", record.code);
105
+ response.index_success_count++;
106
+ } catch (e) {
107
+ response.success = false;
108
+ response.errors.push(`Redirect record creation failed - ${record.url}: ${e}`);
109
+ }
116
110
  }
111
+ });
112
+
113
+ try {
114
+ run_all();
115
+ } catch (e) {
116
+ response.success = false;
117
+ response.errors.push(`Redirect index transaction failed: ${e}`);
117
118
  }
119
+
118
120
  console.log(
119
121
  `\nRedirect Index Build Complete: ${response.index_success_count} document records created.`,
120
122
  );
121
123
  return response;
122
124
  };
123
125
 
124
- exports.populate_index = async (
126
+ exports.populate_index = (
125
127
  db,
126
128
  doc_id,
127
129
  book_config,
@@ -136,124 +138,109 @@
136
138
 
137
139
  if (!book_config.tags) book_config.tags = [];
138
140
 
139
- const indexPromises = [];
140
- for (let i = 0; i < index_records.length; i++) {
141
- indexPromises.push(index_records[i]);
142
- }
143
- let curr_file = "";
144
- await Promise.all(
145
- indexPromises.map(async (file) => {
141
+ // Build a prepared statement from a schema entry once, reusing it for every row.
142
+ // Previously insert_record() called db.prepare() on every single insert.
143
+ const make_stmt = (table) => {
144
+ const cols = db_schema[table].map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
145
+ return db.prepare(`INSERT INTO ${table} (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
146
+ };
147
+ const stmt_index = make_stmt("hdoc_index");
148
+ const stmt_meta = make_stmt("hdoc_meta");
149
+ const stmt_contrib = make_stmt("hdoc_contributors");
150
+
151
+ // A single transaction batches all disk flushes into one — critical for
152
+ // FTS5 which otherwise re-indexes on every individual insert.
153
+ const run_all = db.transaction(() => {
154
+ let curr_file = "";
155
+ for (const file of index_records) {
146
156
  let index_path_name = file.relative_path.replaceAll("\\", "/");
147
157
  if (
148
158
  index_path_name.endsWith("/index.md") ||
149
159
  index_path_name.endsWith("/index.html") ||
150
160
  index_path_name.endsWith("/index.htm")
151
161
  ) {
152
- index_path_name = index_path_name.substring(
153
- 0,
154
- index_path_name.lastIndexOf("/"),
155
- );
162
+ index_path_name = index_path_name.substring(0, index_path_name.lastIndexOf("/"));
156
163
  }
157
164
  index_path_name = `/${index_path_name.replace(path.extname(file.relative_path), "")}`;
158
165
 
159
- let index_response = {
160
- success: true,
161
- row_id: 0,
162
- };
163
- let index_content_path = index_path_name;
164
- if (file.index_html.id !== null)
165
- index_content_path += `#${file.index_html.id}`;
166
+ let inserted_row_id = null;
167
+ const index_content_path = file.index_html.id !== null
168
+ ? `${index_path_name}#${file.index_html.id}`
169
+ : index_path_name;
170
+
166
171
  if (!file.inline) {
167
- const index_vals = [
168
- index_content_path,
169
- doc_id,
170
- book_config.audience.join(","),
171
- book_config.tags.join(","),
172
- file.index_html.fm_props.title,
173
- file.index_html.text,
174
- file.index_html.preview,
175
- book_config.productFamily,
176
- file.md5,
177
- file.lastmod,
178
- file.status,
179
- file.keywords,
180
- ];
181
- index_response = hdoc_index.insert_record(
182
- db,
183
- "hdoc_index",
184
- db_schema.hdoc_index,
185
- index_vals,
186
- );
172
+ try {
173
+ const info = stmt_index.run(
174
+ index_content_path,
175
+ doc_id,
176
+ book_config.audience.join(","),
177
+ book_config.tags.join(","),
178
+ file.index_html.fm_props.title,
179
+ file.index_html.text,
180
+ file.index_html.preview,
181
+ book_config.productFamily,
182
+ file.md5,
183
+ file.lastmod,
184
+ file.status,
185
+ file.keywords,
186
+ );
187
+ inserted_row_id = info.lastInsertRowid;
188
+ } catch (e) {
189
+ console.error(`Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${e}`);
190
+ continue;
191
+ }
187
192
  }
188
- if (!index_response.success) {
189
- console.error(
190
- `Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${index_response.error}`,
191
- );
192
- } else {
193
- if (curr_file === index_path_name) return;
194
- curr_file = index_path_name;
195
- // Now add metadata
196
- const meta_vals = [
193
+
194
+ if (curr_file === index_path_name) continue;
195
+ curr_file = index_path_name;
196
+
197
+ try {
198
+ const meta_info = stmt_meta.run(
197
199
  index_path_name,
198
200
  doc_id,
199
201
  file.metadata.contributor_count,
200
202
  file.metadata.edit_url,
201
203
  file.metadata.last_commit,
202
204
  file.pdf_size,
203
- ];
204
- const meta_response = await hdoc_index.insert_record(
205
- db,
206
- "hdoc_meta",
207
- db_schema.hdoc_meta,
208
- meta_vals,
209
205
  );
210
- if (!meta_response.success) {
211
- console.error(
212
- `Index metadata record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${meta_response.error}`,
206
+ if (verbose) {
207
+ console.log(`Inserted index record ${inserted_row_id}: ${doc_id} - ${file.index_html.fm_props.title}`);
208
+ console.log(`Inserted index metadata record for index ID: ${meta_info.lastInsertRowid}`);
209
+ }
210
+ } catch (e) {
211
+ console.error(`Index metadata record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
212
+ continue;
213
+ }
214
+
215
+ for (const contrib of file.contributors) {
216
+ try {
217
+ const cont_info = stmt_contrib.run(
218
+ index_path_name,
219
+ doc_id,
220
+ contrib.login,
221
+ contrib.name,
222
+ contrib.avatar_url,
223
+ contrib.html_url,
213
224
  );
214
- } else {
215
225
  if (verbose) {
216
- console.log(
217
- `Inserted index record ${index_response.row_id}: ${doc_id} - ${file.index_html.fm_props.title}`,
218
- );
219
- console.log(
220
- `Inserted index metadata record for index ID: ${meta_response.row_id}`,
221
- );
222
- }
223
-
224
- // Now add contributor records
225
- for (let j = 0; j < file.contributors.length; j++) {
226
- const contrib_vals = [
227
- index_path_name,
228
- doc_id,
229
- file.contributors[j].login,
230
- file.contributors[j].name,
231
- file.contributors[j].avatar_url,
232
- file.contributors[j].html_url,
233
- ];
234
- const cont_response = await hdoc_index.insert_record(
235
- db,
236
- "hdoc_contributors",
237
- db_schema.hdoc_contributors,
238
- contrib_vals,
239
- );
240
- if (!cont_response.success) {
241
- console.error(
242
- `Index document contributor record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${cont_response.error}`,
243
- );
244
- continue;
245
- }
246
- if (verbose) {
247
- console.log(
248
- `Inserted document contributor record ${cont_response.row_id}`,
249
- );
250
- }
226
+ console.log(`Inserted document contributor record ${cont_info.lastInsertRowid}`);
251
227
  }
252
- response.index_success_count++;
228
+ } catch (e) {
229
+ console.error(`Index document contributor record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
253
230
  }
254
231
  }
255
- }),
256
- );
232
+
233
+ response.index_success_count++;
234
+ }
235
+ });
236
+
237
+ try {
238
+ run_all();
239
+ } catch (e) {
240
+ response.error = e.message;
241
+ console.error(`Index build transaction failed: ${e}`);
242
+ return response;
243
+ }
257
244
 
258
245
  response.success = true;
259
246
  console.log(
package/hdoc-build.js CHANGED
@@ -252,6 +252,11 @@
252
252
  // Render markdown into HTML
253
253
  html_txt = md.render(md_txt);
254
254
 
255
+ // Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
256
+ // Replaces separate wrapHContent + getFirstHTMLHeading + get_html_read_time calls.
257
+ const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
258
+ html_txt = extracted.html;
259
+
255
260
  // Parse frontmatter properties from the YAML block
256
261
  let fm_contains_title = false;
257
262
  let fm_contains_reading_time = false;
@@ -297,17 +302,12 @@
297
302
 
298
303
  // Title from heading if not in frontmatter
299
304
  if (!fm_contains_title) {
300
- const html_heading = hdoc.getFirstHTMLHeading(
301
- html_txt,
302
- h_tags_to_search,
303
- );
304
-
305
- if (html_heading?.[0]?.children?.[0]?.data) {
305
+ if (extracted.firstHeadingText) {
306
306
  fm_headers.push({
307
307
  id: "title",
308
- value: html_heading[0].children[0].data.trim(),
308
+ value: extracted.firstHeadingText,
309
309
  });
310
- doc_title = html_heading[0].children[0].data.trim();
310
+ doc_title = extracted.firstHeadingText;
311
311
  } else if (
312
312
  file_path.name !== "description_ext.md" &&
313
313
  file_path.name !== "article_ext.md" &&
@@ -320,24 +320,19 @@
320
320
  }
321
321
 
322
322
  // Description from first paragraph if not in frontmatter
323
- if (!fm_contains_description) {
324
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
325
- if (html_p_tag?.[0]?.children?.[0]?.data) {
326
- fm_headers.push({
327
- id: "description",
328
- value:
329
- `${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
330
- });
331
- }
323
+ if (!fm_contains_description && extracted.firstParagraphText) {
324
+ fm_headers.push({
325
+ id: "description",
326
+ value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
327
+ });
332
328
  }
333
329
 
334
330
  // Reading time from content if not in frontmatter
335
331
  if (!fm_contains_reading_time) {
336
- const read_time_mins = hdoc.get_html_read_time(html_txt);
337
- book_read_time += read_time_mins;
332
+ book_read_time += extracted.readTimeMins;
338
333
  fm_headers.push({
339
334
  id: "reading-time",
340
- value: read_time_mins,
335
+ value: extracted.readTimeMins,
341
336
  });
342
337
  }
343
338
  } else {
@@ -348,6 +343,12 @@
348
343
  // Check if we have a frontmatter comment
349
344
  html_fm = hdoc.getHTMLFrontmatterHeader(html_txt);
350
345
 
346
+ // Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
347
+ // Must run after getHTMLFrontmatterHeader (which reads the top-level comment)
348
+ // but before any per-field extraction; the resulting html replaces html_txt.
349
+ const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
350
+ html_txt = extracted.html;
351
+
351
352
  if (Object.keys(html_fm.fm_properties).length > 0) {
352
353
  existing_fm_headers = true;
353
354
 
@@ -374,9 +375,8 @@
374
375
 
375
376
  // Is reading-time in the fm headers?
376
377
  if (html_fm.fm_properties["reading-time"] === undefined) {
377
- const read_time_mins = hdoc.get_html_read_time(html_txt);
378
- book_read_time += read_time_mins;
379
- html_fm.fm_properties["reading-time"] = read_time_mins;
378
+ book_read_time += extracted.readTimeMins;
379
+ html_fm.fm_properties["reading-time"] = extracted.readTimeMins;
380
380
  }
381
381
 
382
382
  for (const key in html_fm.fm_properties) {
@@ -397,21 +397,13 @@
397
397
  file_path.name !== "article_ext.md" &&
398
398
  file_path.name !== "internal_ext.md"
399
399
  ) {
400
- // No frontmatter title found in properties - go get title from h tags in html
401
- const html_heading = hdoc.getFirstHTMLHeading(
402
- html_txt,
403
- h_tags_to_search,
404
- );
405
-
406
- if (html_heading?.[0]?.children?.[0]?.data) {
407
- // We've found a heading tag, add that as a title to the existing frontmatter properties
400
+ if (extracted.firstHeadingText) {
408
401
  fm_headers.push({
409
402
  id: "title",
410
- value: html_heading[0].children[0].data,
403
+ value: extracted.firstHeadingText,
411
404
  });
412
- doc_title = html_heading[0].children[0].data;
405
+ doc_title = extracted.firstHeadingText;
413
406
  } else {
414
- // No header tag, no frontmatter title, output a warning
415
407
  console.info(
416
408
  `[WARNING] No frontmatter title property, or ${h_tags_to_search.join(
417
409
  ", ",
@@ -426,12 +418,10 @@
426
418
  html_fm.fm_properties.description !== undefined
427
419
  ) {
428
420
  if (html_fm.fm_properties.description === "") {
429
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
430
- if (html_p_tag?.[0]?.children?.[0]?.data) {
421
+ if (extracted.firstParagraphText) {
431
422
  fm_headers.push({
432
423
  id: "description",
433
- value:
434
- `${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
424
+ value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
435
425
  });
436
426
  }
437
427
  } else {
@@ -440,30 +430,22 @@
440
430
  value: html_fm.fm_properties.description.trim(),
441
431
  });
442
432
  }
443
- } else {
444
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
445
- if (html_p_tag?.[0]?.children?.[0]?.data) {
446
- fm_headers.push({
447
- id: "description",
448
- value:
449
- `${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
450
- });
451
- }
433
+ } else if (extracted.firstParagraphText) {
434
+ fm_headers.push({
435
+ id: "description",
436
+ value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
437
+ });
452
438
  }
453
439
  } else {
454
440
  // We have no frontmatter headers, get and build one from the html headings
455
- const html_heading = hdoc.getFirstHTMLHeading(
456
- html_txt,
457
- h_tags_to_search,
458
- );
459
441
  let doc_title_local = "";
460
442
  // Add the title
461
- if (html_heading?.[0]?.children?.[0]?.data) {
443
+ if (extracted.firstHeadingText) {
462
444
  fm_headers.push({
463
445
  id: "title",
464
- value: html_heading[0].children[0].data,
446
+ value: extracted.firstHeadingText,
465
447
  });
466
- doc_title_local = html_heading[0].children[0].data;
448
+ doc_title_local = extracted.firstHeadingText;
467
449
  doc_title = doc_title_local;
468
450
  } else if (
469
451
  file_path.name !== "description_ext.md" &&
@@ -478,19 +460,16 @@
478
460
  }
479
461
 
480
462
  // Add the reading time
481
- const read_time_mins = hdoc.get_html_read_time(html_txt);
482
- book_read_time += read_time_mins;
463
+ book_read_time += extracted.readTimeMins;
483
464
  fm_headers.push({
484
465
  id: "reading-time",
485
- value: read_time_mins,
466
+ value: extracted.readTimeMins,
486
467
  });
487
468
 
488
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
489
- if (html_p_tag?.[0]?.children?.[0]?.data) {
469
+ if (extracted.firstParagraphText) {
490
470
  fm_headers.push({
491
471
  id: "description",
492
- value:
493
- `${doc_title_local}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
472
+ value: `${doc_title_local}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
494
473
  });
495
474
  }
496
475
  }
@@ -675,9 +654,6 @@
675
654
  }
676
655
  if (pdf_size > 0) pdf_created++;
677
656
 
678
- // Wrap h2 and h3 tags, plus content, in id'd divs
679
- html_txt = hdoc.wrapHContent(html_txt);
680
-
681
657
  if (inline_content) html_txt = `${fm_header_str}\n${html_txt}`;
682
658
  else html_txt = `${fm_header_str}\n${doc_header}\n${html_txt}`;
683
659
 
@@ -732,7 +708,7 @@
732
708
 
733
709
  const tidy_code_tags = (markdown, file) => {
734
710
  let clean_markdown = markdown;
735
- const json_to_tidy = clean_markdown.match(/```json[\r\n](\s|.)*?```/g);
711
+ const json_to_tidy = clean_markdown.match(/```json[\r\n][\s\S]*?```/g);
736
712
  if (json_to_tidy && json_to_tidy.length > 0) {
737
713
  for (let i = 0; i < json_to_tidy.length; i++) {
738
714
  if (json_to_tidy[i] !== "") {
@@ -754,7 +730,7 @@
754
730
  }
755
731
  }
756
732
 
757
- const xml_to_tidy = clean_markdown.match(/```xml[\r\n](\s|.)*?```/g);
733
+ const xml_to_tidy = clean_markdown.match(/```xml[\r\n][\s\S]*?```/g);
758
734
  if (xml_to_tidy && xml_to_tidy.length > 0) {
759
735
  for (let i = 0; i < xml_to_tidy.length; i++) {
760
736
  if (xml_to_tidy[i] !== "") {
@@ -1229,7 +1205,7 @@
1229
1205
  for (let i = 0; i < md_files.length; i++) {
1230
1206
  mdPromiseArray.push(md_files[i]);
1231
1207
  }
1232
- const chunkSize = 3;
1208
+ const chunkSize = 8;
1233
1209
  for (let i = 0; i < mdPromiseArray.length; i += chunkSize) {
1234
1210
  const chunk = mdPromiseArray.slice(i, i + chunkSize);
1235
1211
  // do whatever
@@ -1357,7 +1333,7 @@
1357
1333
  process.exit(1);
1358
1334
  }
1359
1335
  // Populate primary index tables
1360
- const index = await hdoc_build_db.populate_index(
1336
+ const index = hdoc_build_db.populate_index(
1361
1337
  db.db,
1362
1338
  doc_id,
1363
1339
  hdocbook_config,
package/hdoc-db.js CHANGED
@@ -1,4 +1,5 @@
1
1
  (() => {
2
+ const cheerio = require("cheerio");
2
3
  const path = require("node:path");
3
4
  const hdoc = require(path.join(__dirname, "hdoc-module.js"));
4
5
 
@@ -63,23 +64,44 @@
63
64
  sections: [],
64
65
  };
65
66
 
66
- // Get frontmatter properties
67
- const fm_headers = hdoc.getHTMLFrontmatterHeader(html_txt);
68
- response.fm_props = fm_headers.fm_properties;
67
+ // Single parse covers frontmatter extraction, full-text, and preview —
68
+ // previously three separate cheerio.load() calls.
69
+ const $ = cheerio.load(html_txt, { decodeEntities: false });
69
70
 
70
- // Convert HTML into plain text
71
- response.text = hdoc.html_to_text(html_txt);
71
+ // Extract frontmatter properties from the leading HTML comment
72
+ if ($._root?.children && Array.isArray($._root.children)) {
73
+ for (const child of $._root.children) {
74
+ if (child.type === "comment" && child.data?.startsWith("[[FRONTMATTER")) {
75
+ for (const line of child.data.split(/\r?\n/)) {
76
+ if (line.includes(":")) {
77
+ const parts = line.split(/:(.*)/s);
78
+ if (parts.length > 1) {
79
+ const key = parts[0].trim().toLowerCase();
80
+ let val = parts[1].trim();
81
+ if (/^".*"$/.test(val)) val = val.slice(1, -1);
82
+ if (key === "title") {
83
+ val = val.replace(
84
+ /&amp;|&lt;|&gt;|&quot;|&#39;|&apos;|&#(\d+);|&#x([0-9a-fA-F]+);/g,
85
+ (m, dec, hex) => dec ? String.fromCharCode(+dec) : hex ? String.fromCharCode(parseInt(hex, 16)) : ({ "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'", "&apos;": "'" })[m],
86
+ );
87
+ }
88
+ response.fm_props[key] = val;
89
+ }
90
+ }
91
+ }
92
+ break;
93
+ }
94
+ }
95
+ }
96
+
97
+ // Full-document plain text for search indexing
98
+ const text = $("body").text();
99
+
100
+ // Preview: first paragraph texts joined, then truncated
101
+ let preview = $("p").map((_i, el) => $(el).text()).get().join("\n");
102
+ preview = hdoc.truncate_string(preview, 200, true).replace(/(?:\r\n|\r|\n)/g, " ");
72
103
 
73
- // Convert HTML into preview text
74
- let preview = hdoc.html_to_text(html_txt, { baseElement: "p" });
75
- preview = hdoc
76
- .truncate_string(preview, 200, true)
77
- .replace(/(?:\r\n|\r|\n)/g, " ");
78
- response.sections.push({
79
- text: response.text,
80
- preview: preview,
81
- });
82
- //}
104
+ response.sections.push({ text, preview });
83
105
  return response;
84
106
  };
85
107
  })();
package/hdoc-module.js CHANGED
@@ -48,6 +48,8 @@
48
48
  }
49
49
  };
50
50
 
51
+ exports.fetchWithRetry = fetchWithRetry;
52
+
51
53
  exports.content_type_for_ext = (ext) => {
52
54
  switch (ext) {
53
55
  case ".z":
@@ -220,15 +222,9 @@
220
222
  // Looks for h1 tags first, then hX, hY, hZ in order
221
223
  exports.getFirstHTMLHeading = (html_body, h_to_search = ["h1"]) => {
222
224
  const $ = cheerio.load(html_body);
223
- for (let i = 0; i < h_to_search.length; i++) {
224
- const heading = $(h_to_search[i])
225
- .map(function (i) {
226
- return $(this);
227
- })
228
- .get();
229
- if (heading.length > 0) {
230
- return heading[0];
231
- }
225
+ for (const tag of h_to_search) {
226
+ const el = $(tag).first();
227
+ if (el.length > 0) return el;
232
228
  }
233
229
  return false;
234
230
  };
@@ -277,6 +273,58 @@
277
273
  return `<html><head></head><body>${result}</body></html>`;
278
274
  };
279
275
 
276
+ // Combined single-pass version of wrapHContent + getFirstHTMLHeading + get_html_read_time.
277
+ // Iterates body contents once to wrap h2/h3 divs AND extract the first matching heading text,
278
+ // first paragraph text, and reading-time estimate — avoiding 3 extra cheerio.load() calls.
279
+ exports.wrapAndExtract = (htmlContent, h_tags_to_search = ["h1"]) => {
280
+ const $ = cheerio.load(htmlContent, { decodeEntities: false });
281
+ let result = '';
282
+ let inH2 = false;
283
+ let inH3 = false;
284
+ let firstHeadingText = null;
285
+ let firstParagraphText = null;
286
+
287
+ $('body').contents().each(function() {
288
+ const tagName = this.type === 'tag' ? this.name?.toLowerCase() : null;
289
+ const text = tagName ? $(this).text().trim() : null;
290
+
291
+ if (firstHeadingText === null && tagName && h_tags_to_search.includes(tagName)) {
292
+ firstHeadingText = text;
293
+ }
294
+ if (firstParagraphText === null && tagName === 'p') {
295
+ firstParagraphText = text;
296
+ }
297
+
298
+ if (tagName === 'h2') {
299
+ if (inH3) { result += '</div>'; inH3 = false; }
300
+ if (inH2) { result += '</div>'; inH2 = false; }
301
+ result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
302
+ inH2 = true;
303
+ } else if (tagName === 'h3') {
304
+ if (inH3) { result += '</div>'; inH3 = false; }
305
+ result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
306
+ inH3 = true;
307
+ } else {
308
+ result += $.html(this);
309
+ }
310
+ });
311
+
312
+ if (inH3) result += '</div>';
313
+ if (inH2) result += '</div>';
314
+
315
+ // Word count re-uses the already-parsed DOM — no extra cheerio.load()
316
+ const bodyText = $("body").text();
317
+ const wordCount = bodyText.trim().split(/\s+/).filter(Boolean).length;
318
+ const readTimeMins = wordCount === 0 ? 0 : (Math.round(wordCount / 200) || 1);
319
+
320
+ return {
321
+ html: `<html><head></head><body>${result}</body></html>`,
322
+ firstHeadingText,
323
+ firstParagraphText,
324
+ readTimeMins,
325
+ };
326
+ };
327
+
280
328
  exports.getIDDivs = (html_body) => {
281
329
  const $ = cheerio.load(html_body, {
282
330
  decodeEntities: false,
package/hdoc-validate.js CHANGED
@@ -8,7 +8,6 @@ const { error } = require("node:console");
8
8
  const path = require("node:path");
9
9
  const hdoc = require(path.join(__dirname, "hdoc-module.js"));
10
10
  const translator = require("american-british-english-translator");
11
- const puppeteer = require("puppeteer");
12
11
 
13
12
  const spellcheck_options = {
14
13
  british: true,
@@ -26,10 +25,11 @@ const { error } = require("node:console");
26
25
  let private_repo = false;
27
26
  let redirects = {};
28
27
  let skip_link_file = '';
28
+ let _on_int_net_cached = null; // null = not yet checked; cached after first DNS lookup
29
29
  const exclude_h1_count = {};
30
30
  const exclude_spellcheck_output = [];
31
31
 
32
- const excludeLink = async (url) => {
32
+ const excludeLink = (url) => {
33
33
  if (exclude_links[url]) return true;
34
34
  for (let key in exclude_links) {
35
35
  if (Object.hasOwn(exclude_links, key)) {
@@ -465,18 +465,67 @@ const { error } = require("node:console");
465
465
  return returnPaths;
466
466
  }
467
467
 
468
- const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, browser, global_links_checked, output_links) => {
468
+ // Headers that mimic a real Chrome browser request sites doing bot detection
469
+ // check far more than just User-Agent (Accept, Sec-Fetch-*, client hints, etc.).
470
+ const _fetch_headers = {
471
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
472
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
473
+ 'Accept-Language': 'en-US,en;q=0.9',
474
+ 'Accept-Encoding': 'gzip, deflate, br',
475
+ 'Cache-Control': 'no-cache',
476
+ 'Pragma': 'no-cache',
477
+ 'Sec-Fetch-Dest': 'document',
478
+ 'Sec-Fetch-Mode': 'navigate',
479
+ 'Sec-Fetch-Site': 'none',
480
+ 'Sec-Fetch-User': '?1',
481
+ 'Sec-Ch-Ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
482
+ 'Sec-Ch-Ua-Mobile': '?0',
483
+ 'Sec-Ch-Ua-Platform': '"Windows"',
484
+ 'Upgrade-Insecure-Requests': '1',
485
+ };
486
+
487
+ // Checks a single external URL by sending a HEAD request, falling back to GET
488
+ // if the server returns 405 (Method Not Allowed) or 404 (some servers, e.g.
489
+ // marketplace.visualstudio.com, return 404 for HEAD even when the page exists).
490
+ // Retries up to 5 times on transient errors (5xx, 429, network failures).
491
+ // Returns the HTTP status code.
492
+ const fetchExternalLinkStatus = async (url) => {
493
+ const opts = { method: 'HEAD', headers: _fetch_headers, timeoutMs: 10000, redirect: 'follow' };
494
+ const resp = await hdoc.fetchWithRetry(url, opts);
495
+ if (resp.status === 404 || resp.status === 405) {
496
+ const getResp = await hdoc.fetchWithRetry(url, { ...opts, method: 'GET' });
497
+ return getResp.status;
498
+ }
499
+ return resp.status;
500
+ };
501
+
502
+ const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, global_links_checked, output_links) => {
469
503
  const markdown_paths = getMDPathFromHtmlPath(htmlFile);
470
504
  const markdown_content = fs.readFileSync(markdown_paths.markdownPath, 'utf8');
471
505
 
472
-
506
+ // Resolve the "are we on the internal network?" question once per process
507
+ // rather than once per internal.hornbill.com link.
508
+ const ensureIntNetCached = async () => {
509
+ if (_on_int_net_cached === null) {
510
+ try {
511
+ _on_int_net_cached = await checkHostExistsInDNS('docs-internal.hornbill.com');
512
+ } catch (_e) {
513
+ _on_int_net_cached = false;
514
+ }
515
+ }
516
+ return _on_int_net_cached;
517
+ };
518
+
519
+ // Collect external links that need an HTTP check so they can be run
520
+ // concurrently rather than one-at-a-time.
521
+ const externalChecks = [];
522
+
473
523
  for (let i = 0; i < links.length; i++) {
474
- // Validate that link is a valid URL first
475
524
  if (output_links) console.log(` - ${links[i]}`);
476
525
  if (exclude_links[links[i]]) continue;
477
526
  if (global_links_checked.includes(links[i])) continue;
478
527
  global_links_checked.push(links[i]);
479
-
528
+
480
529
  const valid_url = hdoc.valid_url(links[i]);
481
530
  if (!valid_url) {
482
531
  // Could be a relative path, check
@@ -509,7 +558,7 @@ const { error } = require("node:console");
509
558
  messages[htmlFile.relativePath].push(
510
559
  `Link is a properly formatted external URL: ${links[i]}`,
511
560
  );
512
-
561
+
513
562
  // Skip if it's the auto-generated edit url, as these could be part of a private repo which would return a 404
514
563
  if (
515
564
  hdocbook_config.publicSource !== undefined &&
@@ -524,38 +573,14 @@ const { error } = require("node:console");
524
573
  fs.appendFileSync(skip_link_file, `${links[i]}\n`);
525
574
  continue;
526
575
  }
527
-
576
+
528
577
  if (valid_url.protocol === "mailto:") {
529
578
  fs.appendFileSync(skip_link_file, `${links[i]}\n`);
530
579
  continue;
531
580
  }
532
581
 
533
- // Skip internal.hornbill.com link validation if run outside of the Hornbill network
534
- if (links[i].toLowerCase().includes("internal.hornbill.com")) {
535
- // DNS lookup internal docs endpoint
536
- const hostname = 'docs-internal.hornbill.com';
537
- let on_int_net = false;
538
- try {
539
- on_int_net = await checkHostExistsInDNS(hostname);
540
- } catch (e) {
541
- // Don't need to do anything here
542
- }
543
-
544
- if (!on_int_net) {
545
- messages[htmlFile.relativePath].push(
546
- `Outside of Hornbill network - skipping internal link validation for: ${links[i]}`,
547
- );
548
- fs.appendFileSync(skip_link_file, `${links[i]}\n`);
549
- continue;
550
- }
551
- messages[htmlFile.relativePath].push(
552
- `Inside of Hornbill network - performing internal link validation for: ${links[i]}`,
553
- );
554
- }
555
-
556
582
  // Skip if the link is excluded in the project config
557
- const skip_link = await excludeLink(links[i]);
558
- if (skip_link) {
583
+ if (excludeLink(links[i])) {
559
584
  messages[htmlFile.relativePath].push(
560
585
  `Skipping link validation for: ${links[i]}`,
561
586
  );
@@ -563,99 +588,76 @@ const { error } = require("node:console");
563
588
  }
564
589
 
565
590
  if (
566
- (links[i].toLowerCase().includes("docs.hornbill.com") ||
591
+ (links[i].toLowerCase().includes("docs.hornbill.com") ||
567
592
  links[i].toLowerCase().includes("docs-internal.hornbill.com")) &&
568
593
  !markdown_paths.relativePath.includes('/_inline/')
569
594
  ) {
570
595
  const error_message = processErrorMessage(`Hornbill Docs links should not be fully-qualified: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
571
- errors[htmlFile.relativePath].push( error_message );
596
+ errors[htmlFile.relativePath].push(error_message);
572
597
  continue;
573
598
  }
574
599
 
575
- if (
600
+ if (
576
601
  links[i].toLowerCase().includes("docs-internal.hornbill.com") &&
577
602
  markdown_paths.relativePath.includes('/_inline/') &&
578
603
  !private_repo
579
604
  ) {
580
605
  // Is the parent book in a public repo? If so, flag this as an error.
581
606
  const error_message = processErrorMessage(`Hornbill docs-internal links should not be used in public book inline content: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
582
- errors[htmlFile.relativePath].push( error_message );
607
+ errors[htmlFile.relativePath].push(error_message);
583
608
  continue;
584
609
  }
585
610
 
586
- // Use Puppeteer to validate link address works
587
- const page = await browser.newPage();
588
-
589
- try {
590
- // Set a user-agent to mimic a real browser
591
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
611
+ // Capture url in closure for the async check below
612
+ const url = links[i];
613
+ const isInternal = url.toLowerCase().includes("internal.hornbill.com");
614
+
615
+ externalChecks.push(async () => {
616
+ // For internal.hornbill.com links, check network reachability first (result cached)
617
+ if (isInternal) {
618
+ const on_int_net = await ensureIntNetCached();
619
+ if (!on_int_net) {
620
+ messages[htmlFile.relativePath].push(
621
+ `Outside of Hornbill network - skipping internal link validation for: ${url}`,
622
+ );
623
+ fs.appendFileSync(skip_link_file, `${url}\n`);
624
+ return;
625
+ }
626
+ messages[htmlFile.relativePath].push(
627
+ `Inside of Hornbill network - performing internal link validation for: ${url}`,
628
+ );
629
+ }
592
630
 
593
631
  try {
594
- let response = null;
595
-
596
- // Capture redirects and final response
597
- page.on('request', (request) => {
598
- if (request.isNavigationRequest() && request.redirectChain().length) {
599
- redirectChain = request.redirectChain().map((req) => req.url());
600
- }
601
- });
602
-
603
- // Capture the response
604
- page.on('response', (res) => {
605
- const chain = res.request().redirectChain();
606
- if (chain.length > 0) {
607
- redirectChain = chain.map((req) => req.url());
608
- lastRedirectStatus = res.status(); // Status of the last redirect
609
- }
610
- });
611
-
612
- // Try loading the URL
613
- response = await page.goto(links[i], { waitUntil: 'networkidle2', timeout: 10000 });
614
-
615
- if (response) {
616
- let status = response.status();
617
- const contentType = response.headers()['content-type'];
618
-
619
- // If it's a PDF switch to direct fetching
620
- if (contentType && contentType.includes('application/')) {
621
- status = await page.evaluate(async (url) => {
622
- const res = await fetch(url, { method: 'HEAD' });
623
- return res.status;
624
- }, links[i]);
625
- }
626
- if ((status < 200 || status > 299) && status !== 304) {
627
- if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && links[i].includes(".hornbill.com")) {
628
- // STEVEG - do nothing here, as it always returns a 403 for Hornbill sites when accessing through GitHub Actions
629
- // Works totally fine locally or in hdocpub, still trying to work out what's causing this in GitHub
630
- } else {
631
- throw `Unexpected Status Returned: ${status}`;
632
- }
632
+ const status = await fetchExternalLinkStatus(url);
633
+ if ((status < 200 || status > 299) && status !== 304) {
634
+ if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && url.includes(".hornbill.com")) {
635
+ // Always returns 403 for Hornbill sites through GitHub Actions — not a real error
633
636
  } else {
634
- fs.appendFileSync(skip_link_file, `${links[i]}\n`);
637
+ throw `Unexpected Status Returned: ${status}`;
635
638
  }
636
639
  } else {
637
- throw `No response from: ${links[i]}`;
640
+ fs.appendFileSync(skip_link_file, `${url}\n`);
638
641
  }
639
- } catch (error) {
640
- throw error;
641
- }
642
- } catch (e) {
643
- let error_message;
644
- if (e instanceof AggregateError) {
645
- error_message = processErrorMessage(`Issue with external link [${links[i]}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, links[i]);
646
- } else {
647
- error_message = processErrorMessage(`Issue with external link [${links[i]}]: ${e}`, markdown_paths.relativePath, markdown_content, links[i]);
642
+ } catch (e) {
643
+ let error_message;
644
+ if (e instanceof AggregateError) {
645
+ error_message = processErrorMessage(`Issue with external link [${url}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, url);
646
+ } else {
647
+ error_message = processErrorMessage(`Issue with external link [${url}]: ${e}`, markdown_paths.relativePath, markdown_content, url);
648
+ }
649
+ if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
650
+ warnings[htmlFile.relativePath].push(error_message);
651
+ else
652
+ errors[htmlFile.relativePath].push(error_message);
648
653
  }
649
- if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
650
- warnings[htmlFile.relativePath].push(error_message);
651
- else
652
- errors[htmlFile.relativePath].push(error_message);
653
-
654
- }
655
- // Close the headless browser tab
656
- page.close();
654
+ });
657
655
  }
658
656
  }
657
+
658
+ // Run all external HTTP checks concurrently — fetch is lightweight enough
659
+ // that uncapped concurrency is fine for the link counts seen in practice.
660
+ await Promise.all(externalChecks.map(fn => fn()));
659
661
  };
660
662
 
661
663
  const checkHostExistsInDNS = async (hostname) => {
@@ -1069,8 +1071,7 @@ const { error } = require("node:console");
1069
1071
 
1070
1072
 
1071
1073
  const global_links_checked = [];
1072
- const validateBrowser = await puppeteer.launch({ args: ['--no-sandbox'] });
1073
-
1074
+
1074
1075
  for (const key in html_to_validate) {
1075
1076
  const file = html_to_validate[key];
1076
1077
  // Check for British spellings in static HTML content
@@ -1095,7 +1096,7 @@ const { error } = require("node:console");
1095
1096
  messages[file.relativePath].push("No links found in file");
1096
1097
  } else {
1097
1098
  console.log(`\r\nChecking ${links.href.length} Links in ${file.relativePath}`);
1098
- await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, validateBrowser, global_links_checked, output_links);
1099
+ await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, global_links_checked, output_links);
1099
1100
  }
1100
1101
  if (links.img.length === 0) {
1101
1102
  messages[file.relativePath].push("No images found in file");
@@ -1107,9 +1108,6 @@ const { error } = require("node:console");
1107
1108
  await checkTags(file);
1108
1109
  }
1109
1110
 
1110
- // Close the Chromium browser instance
1111
- await validateBrowser.close();
1112
-
1113
1111
  if (gen_exclude) console.log(JSON.stringify(excl_output, null, 2));
1114
1112
 
1115
1113
  if (verbose) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "hdoc-tools",
3
- "version": "0.47.2",
3
+ "version": "0.47.4",
4
4
  "description": "Hornbill HDocBook Development Support Tool",
5
5
  "main": "hdoc.js",
6
6
  "bin": {