hdoc-tools 0.47.2 → 0.47.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -35,7 +35,7 @@ Returns statistics regarding the book you are working on:
35
35
  - Number of Markdown Files in the Book
36
36
  - Number of Static HTML Files in the Book
37
37
 
38
- If the -v switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
38
+ If the `-v` switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
39
39
 
40
40
  The book statistics do not include counts for any externally hosted content injected into the book content using the [[INCLUDE]] tags.
41
41
 
@@ -43,25 +43,25 @@ The book statistics do not include counts for any externally hosted content inje
43
43
 
44
44
  Performs a local build of the book, validates the links and static content are present and correct and outputs as a ZIP file.
45
45
 
46
- If the -v switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
46
+ If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
47
47
 
48
- Use the --set-version argument to set the version number of the built book.
48
+ Use the `--set-version` argument to set the version number of the built book.
49
49
 
50
- Use the --no-color argument to remove any color control characters from the output.
50
+ Use the `--no-color` argument to remove any color control characters from the output.
51
51
 
52
- Use the '--no-links' argument to skip link output to CLI during validation.
52
+ Use the `--no-links` argument to skip link output to CLI during validation.
53
53
 
54
54
  ### validate
55
55
 
56
56
  Performs a minimum local build of the book, then validates the links and static content are present and correct.
57
57
 
58
- If the -v switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
58
+ If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
59
59
 
60
- Use the --set-version argument to set the version number of the built book.
60
+ Use the `--set-version` argument to set the version number of the built book.
61
61
 
62
- Use the --no-color argument to remove any color control characters from the output.
62
+ Use the `--no-color` argument to remove any color control characters from the output.
63
63
 
64
- Use the '--no-links' argument to skip link output to CLI during validation.
64
+ Use the `--no-links` argument to skip link output to CLI during validation.
65
65
 
66
66
  ### serve
67
67
 
@@ -72,4 +72,4 @@ command `hdoc serve` and in a local browser go to the URL `http://localhost:3000
72
72
 
73
73
  ## Installation
74
74
 
75
- npm install hdoc-tools -g
75
+ > `npm install hdoc-tools -g`
package/hdoc-build-db.js CHANGED
@@ -87,41 +87,43 @@
87
87
  return response;
88
88
  };
89
89
 
90
- exports.populate_redirects = (db, redirect_records, verbose = false) => {
90
+ exports.populate_redirects = (db, redirect_records, _verbose = false) => {
91
91
  const response = {
92
92
  success: true,
93
93
  errors: [],
94
94
  index_success_count: 0,
95
95
  };
96
96
 
97
- for (let i = 0; i < redirect_records.length; i++) {
98
- const index_vals = [
99
- redirect_records[i].url,
100
- redirect_records[i].location ? redirect_records[i].location : "",
101
- redirect_records[i].code,
102
- ];
103
- const index_response = hdoc_index.insert_record(
104
- db,
105
- "hdoc_redirects",
106
- db_schema.hdoc_redirects,
107
- index_vals,
108
- );
109
- if (!index_response.success) {
110
- response.success = false;
111
- response.errors.push(
112
- `Redirect record creation failed - ${redirect_records[i].url}: ${index_response.error}`,
113
- );
114
- } else {
115
- response.index_success_count++;
97
+ // Prepare once, insert all in one transaction
98
+ const cols = db_schema.hdoc_redirects.map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
99
+ const stmt = db.prepare(`INSERT INTO hdoc_redirects (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
100
+
101
+ const run_all = db.transaction(() => {
102
+ for (const record of redirect_records) {
103
+ try {
104
+ stmt.run(record.url, record.location ? record.location : "", record.code);
105
+ response.index_success_count++;
106
+ } catch (e) {
107
+ response.success = false;
108
+ response.errors.push(`Redirect record creation failed - ${record.url}: ${e}`);
109
+ }
116
110
  }
111
+ });
112
+
113
+ try {
114
+ run_all();
115
+ } catch (e) {
116
+ response.success = false;
117
+ response.errors.push(`Redirect index transaction failed: ${e}`);
117
118
  }
119
+
118
120
  console.log(
119
121
  `\nRedirect Index Build Complete: ${response.index_success_count} document records created.`,
120
122
  );
121
123
  return response;
122
124
  };
123
125
 
124
- exports.populate_index = async (
126
+ exports.populate_index = (
125
127
  db,
126
128
  doc_id,
127
129
  book_config,
@@ -136,124 +138,109 @@
136
138
 
137
139
  if (!book_config.tags) book_config.tags = [];
138
140
 
139
- const indexPromises = [];
140
- for (let i = 0; i < index_records.length; i++) {
141
- indexPromises.push(index_records[i]);
142
- }
143
- let curr_file = "";
144
- await Promise.all(
145
- indexPromises.map(async (file) => {
141
+ // Build a prepared statement from a schema entry once, reusing it for every row.
142
+ // Previously insert_record() called db.prepare() on every single insert.
143
+ const make_stmt = (table) => {
144
+ const cols = db_schema[table].map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
145
+ return db.prepare(`INSERT INTO ${table} (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
146
+ };
147
+ const stmt_index = make_stmt("hdoc_index");
148
+ const stmt_meta = make_stmt("hdoc_meta");
149
+ const stmt_contrib = make_stmt("hdoc_contributors");
150
+
151
+ // A single transaction batches all disk flushes into one — critical for
152
+ // FTS5 which otherwise re-indexes on every individual insert.
153
+ const run_all = db.transaction(() => {
154
+ let curr_file = "";
155
+ for (const file of index_records) {
146
156
  let index_path_name = file.relative_path.replaceAll("\\", "/");
147
157
  if (
148
158
  index_path_name.endsWith("/index.md") ||
149
159
  index_path_name.endsWith("/index.html") ||
150
160
  index_path_name.endsWith("/index.htm")
151
161
  ) {
152
- index_path_name = index_path_name.substring(
153
- 0,
154
- index_path_name.lastIndexOf("/"),
155
- );
162
+ index_path_name = index_path_name.substring(0, index_path_name.lastIndexOf("/"));
156
163
  }
157
164
  index_path_name = `/${index_path_name.replace(path.extname(file.relative_path), "")}`;
158
165
 
159
- let index_response = {
160
- success: true,
161
- row_id: 0,
162
- };
163
- let index_content_path = index_path_name;
164
- if (file.index_html.id !== null)
165
- index_content_path += `#${file.index_html.id}`;
166
+ let inserted_row_id = null;
167
+ const index_content_path = file.index_html.id !== null
168
+ ? `${index_path_name}#${file.index_html.id}`
169
+ : index_path_name;
170
+
166
171
  if (!file.inline) {
167
- const index_vals = [
168
- index_content_path,
169
- doc_id,
170
- book_config.audience.join(","),
171
- book_config.tags.join(","),
172
- file.index_html.fm_props.title,
173
- file.index_html.text,
174
- file.index_html.preview,
175
- book_config.productFamily,
176
- file.md5,
177
- file.lastmod,
178
- file.status,
179
- file.keywords,
180
- ];
181
- index_response = hdoc_index.insert_record(
182
- db,
183
- "hdoc_index",
184
- db_schema.hdoc_index,
185
- index_vals,
186
- );
172
+ try {
173
+ const info = stmt_index.run(
174
+ index_content_path,
175
+ doc_id,
176
+ book_config.audience.join(","),
177
+ book_config.tags.join(","),
178
+ file.index_html.fm_props.title,
179
+ file.index_html.text,
180
+ file.index_html.preview,
181
+ book_config.productFamily,
182
+ file.md5,
183
+ file.lastmod,
184
+ file.status,
185
+ file.keywords,
186
+ );
187
+ inserted_row_id = info.lastInsertRowid;
188
+ } catch (e) {
189
+ console.error(`Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${e}`);
190
+ continue;
191
+ }
187
192
  }
188
- if (!index_response.success) {
189
- console.error(
190
- `Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${index_response.error}`,
191
- );
192
- } else {
193
- if (curr_file === index_path_name) return;
194
- curr_file = index_path_name;
195
- // Now add metadata
196
- const meta_vals = [
193
+
194
+ if (curr_file === index_path_name) continue;
195
+ curr_file = index_path_name;
196
+
197
+ try {
198
+ const meta_info = stmt_meta.run(
197
199
  index_path_name,
198
200
  doc_id,
199
201
  file.metadata.contributor_count,
200
202
  file.metadata.edit_url,
201
203
  file.metadata.last_commit,
202
204
  file.pdf_size,
203
- ];
204
- const meta_response = await hdoc_index.insert_record(
205
- db,
206
- "hdoc_meta",
207
- db_schema.hdoc_meta,
208
- meta_vals,
209
205
  );
210
- if (!meta_response.success) {
211
- console.error(
212
- `Index metadata record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${meta_response.error}`,
206
+ if (verbose) {
207
+ console.log(`Inserted index record ${inserted_row_id}: ${doc_id} - ${file.index_html.fm_props.title}`);
208
+ console.log(`Inserted index metadata record for index ID: ${meta_info.lastInsertRowid}`);
209
+ }
210
+ } catch (e) {
211
+ console.error(`Index metadata record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
212
+ continue;
213
+ }
214
+
215
+ for (const contrib of file.contributors) {
216
+ try {
217
+ const cont_info = stmt_contrib.run(
218
+ index_path_name,
219
+ doc_id,
220
+ contrib.login,
221
+ contrib.name,
222
+ contrib.avatar_url,
223
+ contrib.html_url,
213
224
  );
214
- } else {
215
225
  if (verbose) {
216
- console.log(
217
- `Inserted index record ${index_response.row_id}: ${doc_id} - ${file.index_html.fm_props.title}`,
218
- );
219
- console.log(
220
- `Inserted index metadata record for index ID: ${meta_response.row_id}`,
221
- );
222
- }
223
-
224
- // Now add contributor records
225
- for (let j = 0; j < file.contributors.length; j++) {
226
- const contrib_vals = [
227
- index_path_name,
228
- doc_id,
229
- file.contributors[j].login,
230
- file.contributors[j].name,
231
- file.contributors[j].avatar_url,
232
- file.contributors[j].html_url,
233
- ];
234
- const cont_response = await hdoc_index.insert_record(
235
- db,
236
- "hdoc_contributors",
237
- db_schema.hdoc_contributors,
238
- contrib_vals,
239
- );
240
- if (!cont_response.success) {
241
- console.error(
242
- `Index document contributor record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${cont_response.error}`,
243
- );
244
- continue;
245
- }
246
- if (verbose) {
247
- console.log(
248
- `Inserted document contributor record ${cont_response.row_id}`,
249
- );
250
- }
226
+ console.log(`Inserted document contributor record ${cont_info.lastInsertRowid}`);
251
227
  }
252
- response.index_success_count++;
228
+ } catch (e) {
229
+ console.error(`Index document contributor record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
253
230
  }
254
231
  }
255
- }),
256
- );
232
+
233
+ response.index_success_count++;
234
+ }
235
+ });
236
+
237
+ try {
238
+ run_all();
239
+ } catch (e) {
240
+ response.error = e.message;
241
+ console.error(`Index build transaction failed: ${e}`);
242
+ return response;
243
+ }
257
244
 
258
245
  response.success = true;
259
246
  console.log(
package/hdoc-build.js CHANGED
@@ -252,6 +252,11 @@
252
252
  // Render markdown into HTML
253
253
  html_txt = md.render(md_txt);
254
254
 
255
+ // Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
256
+ // Replaces separate wrapHContent + getFirstHTMLHeading + get_html_read_time calls.
257
+ const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
258
+ html_txt = extracted.html;
259
+
255
260
  // Parse frontmatter properties from the YAML block
256
261
  let fm_contains_title = false;
257
262
  let fm_contains_reading_time = false;
@@ -297,17 +302,12 @@
297
302
 
298
303
  // Title from heading if not in frontmatter
299
304
  if (!fm_contains_title) {
300
- const html_heading = hdoc.getFirstHTMLHeading(
301
- html_txt,
302
- h_tags_to_search,
303
- );
304
-
305
- if (html_heading?.[0]?.children?.[0]?.data) {
305
+ if (extracted.firstHeadingText) {
306
306
  fm_headers.push({
307
307
  id: "title",
308
- value: html_heading[0].children[0].data.trim(),
308
+ value: extracted.firstHeadingText,
309
309
  });
310
- doc_title = html_heading[0].children[0].data.trim();
310
+ doc_title = extracted.firstHeadingText;
311
311
  } else if (
312
312
  file_path.name !== "description_ext.md" &&
313
313
  file_path.name !== "article_ext.md" &&
@@ -320,24 +320,19 @@
320
320
  }
321
321
 
322
322
  // Description from first paragraph if not in frontmatter
323
- if (!fm_contains_description) {
324
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
325
- if (html_p_tag?.[0]?.children?.[0]?.data) {
326
- fm_headers.push({
327
- id: "description",
328
- value:
329
- `${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
330
- });
331
- }
323
+ if (!fm_contains_description && extracted.firstParagraphText) {
324
+ fm_headers.push({
325
+ id: "description",
326
+ value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
327
+ });
332
328
  }
333
329
 
334
330
  // Reading time from content if not in frontmatter
335
331
  if (!fm_contains_reading_time) {
336
- const read_time_mins = hdoc.get_html_read_time(html_txt);
337
- book_read_time += read_time_mins;
332
+ book_read_time += extracted.readTimeMins;
338
333
  fm_headers.push({
339
334
  id: "reading-time",
340
- value: read_time_mins,
335
+ value: extracted.readTimeMins,
341
336
  });
342
337
  }
343
338
  } else {
@@ -348,6 +343,12 @@
348
343
  // Check if we have a frontmatter comment
349
344
  html_fm = hdoc.getHTMLFrontmatterHeader(html_txt);
350
345
 
346
+ // Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
347
+ // Must run after getHTMLFrontmatterHeader (which reads the top-level comment)
348
+ // but before any per-field extraction; the resulting html replaces html_txt.
349
+ const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
350
+ html_txt = extracted.html;
351
+
351
352
  if (Object.keys(html_fm.fm_properties).length > 0) {
352
353
  existing_fm_headers = true;
353
354
 
@@ -374,9 +375,8 @@
374
375
 
375
376
  // Is reading-time in the fm headers?
376
377
  if (html_fm.fm_properties["reading-time"] === undefined) {
377
- const read_time_mins = hdoc.get_html_read_time(html_txt);
378
- book_read_time += read_time_mins;
379
- html_fm.fm_properties["reading-time"] = read_time_mins;
378
+ book_read_time += extracted.readTimeMins;
379
+ html_fm.fm_properties["reading-time"] = extracted.readTimeMins;
380
380
  }
381
381
 
382
382
  for (const key in html_fm.fm_properties) {
@@ -397,21 +397,13 @@
397
397
  file_path.name !== "article_ext.md" &&
398
398
  file_path.name !== "internal_ext.md"
399
399
  ) {
400
- // No frontmatter title found in properties - go get title from h tags in html
401
- const html_heading = hdoc.getFirstHTMLHeading(
402
- html_txt,
403
- h_tags_to_search,
404
- );
405
-
406
- if (html_heading?.[0]?.children?.[0]?.data) {
407
- // We've found a heading tag, add that as a title to the existing frontmatter properties
400
+ if (extracted.firstHeadingText) {
408
401
  fm_headers.push({
409
402
  id: "title",
410
- value: html_heading[0].children[0].data,
403
+ value: extracted.firstHeadingText,
411
404
  });
412
- doc_title = html_heading[0].children[0].data;
405
+ doc_title = extracted.firstHeadingText;
413
406
  } else {
414
- // No header tag, no frontmatter title, output a warning
415
407
  console.info(
416
408
  `[WARNING] No frontmatter title property, or ${h_tags_to_search.join(
417
409
  ", ",
@@ -426,12 +418,10 @@
426
418
  html_fm.fm_properties.description !== undefined
427
419
  ) {
428
420
  if (html_fm.fm_properties.description === "") {
429
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
430
- if (html_p_tag?.[0]?.children?.[0]?.data) {
421
+ if (extracted.firstParagraphText) {
431
422
  fm_headers.push({
432
423
  id: "description",
433
- value:
434
- `${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
424
+ value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
435
425
  });
436
426
  }
437
427
  } else {
@@ -440,30 +430,22 @@
440
430
  value: html_fm.fm_properties.description.trim(),
441
431
  });
442
432
  }
443
- } else {
444
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
445
- if (html_p_tag?.[0]?.children?.[0]?.data) {
446
- fm_headers.push({
447
- id: "description",
448
- value:
449
- `${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
450
- });
451
- }
433
+ } else if (extracted.firstParagraphText) {
434
+ fm_headers.push({
435
+ id: "description",
436
+ value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
437
+ });
452
438
  }
453
439
  } else {
454
440
  // We have no frontmatter headers, get and build one from the html headings
455
- const html_heading = hdoc.getFirstHTMLHeading(
456
- html_txt,
457
- h_tags_to_search,
458
- );
459
441
  let doc_title_local = "";
460
442
  // Add the title
461
- if (html_heading?.[0]?.children?.[0]?.data) {
443
+ if (extracted.firstHeadingText) {
462
444
  fm_headers.push({
463
445
  id: "title",
464
- value: html_heading[0].children[0].data,
446
+ value: extracted.firstHeadingText,
465
447
  });
466
- doc_title_local = html_heading[0].children[0].data;
448
+ doc_title_local = extracted.firstHeadingText;
467
449
  doc_title = doc_title_local;
468
450
  } else if (
469
451
  file_path.name !== "description_ext.md" &&
@@ -478,19 +460,16 @@
478
460
  }
479
461
 
480
462
  // Add the reading time
481
- const read_time_mins = hdoc.get_html_read_time(html_txt);
482
- book_read_time += read_time_mins;
463
+ book_read_time += extracted.readTimeMins;
483
464
  fm_headers.push({
484
465
  id: "reading-time",
485
- value: read_time_mins,
466
+ value: extracted.readTimeMins,
486
467
  });
487
468
 
488
- const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
489
- if (html_p_tag?.[0]?.children?.[0]?.data) {
469
+ if (extracted.firstParagraphText) {
490
470
  fm_headers.push({
491
471
  id: "description",
492
- value:
493
- `${doc_title_local}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
472
+ value: `${doc_title_local}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
494
473
  });
495
474
  }
496
475
  }
@@ -675,9 +654,6 @@
675
654
  }
676
655
  if (pdf_size > 0) pdf_created++;
677
656
 
678
- // Wrap h2 and h3 tags, plus content, in id'd divs
679
- html_txt = hdoc.wrapHContent(html_txt);
680
-
681
657
  if (inline_content) html_txt = `${fm_header_str}\n${html_txt}`;
682
658
  else html_txt = `${fm_header_str}\n${doc_header}\n${html_txt}`;
683
659
 
@@ -732,7 +708,7 @@
732
708
 
733
709
  const tidy_code_tags = (markdown, file) => {
734
710
  let clean_markdown = markdown;
735
- const json_to_tidy = clean_markdown.match(/```json[\r\n](\s|.)*?```/g);
711
+ const json_to_tidy = clean_markdown.match(/```json[\r\n][\s\S]*?```/g);
736
712
  if (json_to_tidy && json_to_tidy.length > 0) {
737
713
  for (let i = 0; i < json_to_tidy.length; i++) {
738
714
  if (json_to_tidy[i] !== "") {
@@ -754,7 +730,7 @@
754
730
  }
755
731
  }
756
732
 
757
- const xml_to_tidy = clean_markdown.match(/```xml[\r\n](\s|.)*?```/g);
733
+ const xml_to_tidy = clean_markdown.match(/```xml[\r\n][\s\S]*?```/g);
758
734
  if (xml_to_tidy && xml_to_tidy.length > 0) {
759
735
  for (let i = 0; i < xml_to_tidy.length; i++) {
760
736
  if (xml_to_tidy[i] !== "") {
@@ -1229,7 +1205,7 @@
1229
1205
  for (let i = 0; i < md_files.length; i++) {
1230
1206
  mdPromiseArray.push(md_files[i]);
1231
1207
  }
1232
- const chunkSize = 3;
1208
+ const chunkSize = 8;
1233
1209
  for (let i = 0; i < mdPromiseArray.length; i += chunkSize) {
1234
1210
  const chunk = mdPromiseArray.slice(i, i + chunkSize);
1235
1211
  // do whatever
@@ -1357,7 +1333,7 @@
1357
1333
  process.exit(1);
1358
1334
  }
1359
1335
  // Populate primary index tables
1360
- const index = await hdoc_build_db.populate_index(
1336
+ const index = hdoc_build_db.populate_index(
1361
1337
  db.db,
1362
1338
  doc_id,
1363
1339
  hdocbook_config,
package/hdoc-db.js CHANGED
@@ -1,4 +1,5 @@
1
1
  (() => {
2
+ const cheerio = require("cheerio");
2
3
  const path = require("node:path");
3
4
  const hdoc = require(path.join(__dirname, "hdoc-module.js"));
4
5
 
@@ -63,23 +64,44 @@
63
64
  sections: [],
64
65
  };
65
66
 
66
- // Get frontmatter properties
67
- const fm_headers = hdoc.getHTMLFrontmatterHeader(html_txt);
68
- response.fm_props = fm_headers.fm_properties;
67
+ // Single parse covers frontmatter extraction, full-text, and preview —
68
+ // previously three separate cheerio.load() calls.
69
+ const $ = cheerio.load(html_txt, { decodeEntities: false });
69
70
 
70
- // Convert HTML into plain text
71
- response.text = hdoc.html_to_text(html_txt);
71
+ // Extract frontmatter properties from the leading HTML comment
72
+ if ($._root?.children && Array.isArray($._root.children)) {
73
+ for (const child of $._root.children) {
74
+ if (child.type === "comment" && child.data?.startsWith("[[FRONTMATTER")) {
75
+ for (const line of child.data.split(/\r?\n/)) {
76
+ if (line.includes(":")) {
77
+ const parts = line.split(/:(.*)/s);
78
+ if (parts.length > 1) {
79
+ const key = parts[0].trim().toLowerCase();
80
+ let val = parts[1].trim();
81
+ if (/^".*"$/.test(val)) val = val.slice(1, -1);
82
+ if (key === "title") {
83
+ val = val.replace(
84
+ /&amp;|&lt;|&gt;|&quot;|&#39;|&apos;|&#(\d+);|&#x([0-9a-fA-F]+);/g,
85
+ (m, dec, hex) => dec ? String.fromCharCode(+dec) : hex ? String.fromCharCode(parseInt(hex, 16)) : ({ "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'", "&apos;": "'" })[m],
86
+ );
87
+ }
88
+ response.fm_props[key] = val;
89
+ }
90
+ }
91
+ }
92
+ break;
93
+ }
94
+ }
95
+ }
96
+
97
+ // Full-document plain text for search indexing
98
+ const text = $("body").text();
99
+
100
+ // Preview: first paragraph texts joined, then truncated
101
+ let preview = $("p").map((_i, el) => $(el).text()).get().join("\n");
102
+ preview = hdoc.truncate_string(preview, 200, true).replace(/(?:\r\n|\r|\n)/g, " ");
72
103
 
73
- // Convert HTML into preview text
74
- let preview = hdoc.html_to_text(html_txt, { baseElement: "p" });
75
- preview = hdoc
76
- .truncate_string(preview, 200, true)
77
- .replace(/(?:\r\n|\r|\n)/g, " ");
78
- response.sections.push({
79
- text: response.text,
80
- preview: preview,
81
- });
82
- //}
104
+ response.sections.push({ text, preview });
83
105
  return response;
84
106
  };
85
107
  })();
package/hdoc-module.js CHANGED
@@ -220,15 +220,9 @@
220
220
  // Looks for h1 tags first, then hX, hY, hZ in order
221
221
  exports.getFirstHTMLHeading = (html_body, h_to_search = ["h1"]) => {
222
222
  const $ = cheerio.load(html_body);
223
- for (let i = 0; i < h_to_search.length; i++) {
224
- const heading = $(h_to_search[i])
225
- .map(function (i) {
226
- return $(this);
227
- })
228
- .get();
229
- if (heading.length > 0) {
230
- return heading[0];
231
- }
223
+ for (const tag of h_to_search) {
224
+ const el = $(tag).first();
225
+ if (el.length > 0) return el;
232
226
  }
233
227
  return false;
234
228
  };
@@ -277,6 +271,58 @@
277
271
  return `<html><head></head><body>${result}</body></html>`;
278
272
  };
279
273
 
274
+ // Combined single-pass version of wrapHContent + getFirstHTMLHeading + get_html_read_time.
275
+ // Iterates body contents once to wrap h2/h3 divs AND extract the first matching heading text,
276
+ // first paragraph text, and reading-time estimate — avoiding 3 extra cheerio.load() calls.
277
+ exports.wrapAndExtract = (htmlContent, h_tags_to_search = ["h1"]) => {
278
+ const $ = cheerio.load(htmlContent, { decodeEntities: false });
279
+ let result = '';
280
+ let inH2 = false;
281
+ let inH3 = false;
282
+ let firstHeadingText = null;
283
+ let firstParagraphText = null;
284
+
285
+ $('body').contents().each(function() {
286
+ const tagName = this.type === 'tag' ? this.name?.toLowerCase() : null;
287
+ const text = tagName ? $(this).text().trim() : null;
288
+
289
+ if (firstHeadingText === null && tagName && h_tags_to_search.includes(tagName)) {
290
+ firstHeadingText = text;
291
+ }
292
+ if (firstParagraphText === null && tagName === 'p') {
293
+ firstParagraphText = text;
294
+ }
295
+
296
+ if (tagName === 'h2') {
297
+ if (inH3) { result += '</div>'; inH3 = false; }
298
+ if (inH2) { result += '</div>'; inH2 = false; }
299
+ result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
300
+ inH2 = true;
301
+ } else if (tagName === 'h3') {
302
+ if (inH3) { result += '</div>'; inH3 = false; }
303
+ result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
304
+ inH3 = true;
305
+ } else {
306
+ result += $.html(this);
307
+ }
308
+ });
309
+
310
+ if (inH3) result += '</div>';
311
+ if (inH2) result += '</div>';
312
+
313
+ // Word count re-uses the already-parsed DOM — no extra cheerio.load()
314
+ const bodyText = $("body").text();
315
+ const wordCount = bodyText.trim().split(/\s+/).filter(Boolean).length;
316
+ const readTimeMins = wordCount === 0 ? 0 : (Math.round(wordCount / 200) || 1);
317
+
318
+ return {
319
+ html: `<html><head></head><body>${result}</body></html>`,
320
+ firstHeadingText,
321
+ firstParagraphText,
322
+ readTimeMins,
323
+ };
324
+ };
325
+
280
326
  exports.getIDDivs = (html_body) => {
281
327
  const $ = cheerio.load(html_body, {
282
328
  decodeEntities: false,
package/hdoc-validate.js CHANGED
@@ -8,7 +8,6 @@ const { error } = require("node:console");
8
8
  const path = require("node:path");
9
9
  const hdoc = require(path.join(__dirname, "hdoc-module.js"));
10
10
  const translator = require("american-british-english-translator");
11
- const puppeteer = require("puppeteer");
12
11
 
13
12
  const spellcheck_options = {
14
13
  british: true,
@@ -26,10 +25,11 @@ const { error } = require("node:console");
26
25
  let private_repo = false;
27
26
  let redirects = {};
28
27
  let skip_link_file = '';
28
+ let _on_int_net_cached = null; // null = not yet checked; cached after first DNS lookup
29
29
  const exclude_h1_count = {};
30
30
  const exclude_spellcheck_output = [];
31
31
 
32
- const excludeLink = async (url) => {
32
+ const excludeLink = (url) => {
33
33
  if (exclude_links[url]) return true;
34
34
  for (let key in exclude_links) {
35
35
  if (Object.hasOwn(exclude_links, key)) {
@@ -465,18 +465,46 @@ const { error } = require("node:console");
465
465
  return returnPaths;
466
466
  }
467
467
 
468
- const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, browser, global_links_checked, output_links) => {
468
+ const _fetch_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' };
469
+
470
+ // Checks a single external URL by sending a HEAD request (falling back to GET
471
+ // if the server returns 405 Method Not Allowed). Returns the HTTP status code.
472
+ const fetchExternalLinkStatus = async (url) => {
473
+ const resp = await fetch(url, { method: 'HEAD', headers: _fetch_headers, signal: AbortSignal.timeout(10000), redirect: 'follow' });
474
+ if (resp.status === 405) {
475
+ const getResp = await fetch(url, { method: 'GET', headers: _fetch_headers, signal: AbortSignal.timeout(10000), redirect: 'follow' });
476
+ return getResp.status;
477
+ }
478
+ return resp.status;
479
+ };
480
+
481
+ const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, global_links_checked, output_links) => {
469
482
  const markdown_paths = getMDPathFromHtmlPath(htmlFile);
470
483
  const markdown_content = fs.readFileSync(markdown_paths.markdownPath, 'utf8');
471
484
 
472
-
485
+ // Resolve the "are we on the internal network?" question once per process
486
+ // rather than once per internal.hornbill.com link.
487
+ const ensureIntNetCached = async () => {
488
+ if (_on_int_net_cached === null) {
489
+ try {
490
+ _on_int_net_cached = await checkHostExistsInDNS('docs-internal.hornbill.com');
491
+ } catch (_e) {
492
+ _on_int_net_cached = false;
493
+ }
494
+ }
495
+ return _on_int_net_cached;
496
+ };
497
+
498
+ // Collect external links that need an HTTP check so they can be run
499
+ // concurrently rather than one-at-a-time.
500
+ const externalChecks = [];
501
+
473
502
  for (let i = 0; i < links.length; i++) {
474
- // Validate that link is a valid URL first
475
503
  if (output_links) console.log(` - ${links[i]}`);
476
504
  if (exclude_links[links[i]]) continue;
477
505
  if (global_links_checked.includes(links[i])) continue;
478
506
  global_links_checked.push(links[i]);
479
-
507
+
480
508
  const valid_url = hdoc.valid_url(links[i]);
481
509
  if (!valid_url) {
482
510
  // Could be a relative path, check
@@ -509,7 +537,7 @@ const { error } = require("node:console");
509
537
  messages[htmlFile.relativePath].push(
510
538
  `Link is a properly formatted external URL: ${links[i]}`,
511
539
  );
512
-
540
+
513
541
  // Skip if it's the auto-generated edit url, as these could be part of a private repo which would return a 404
514
542
  if (
515
543
  hdocbook_config.publicSource !== undefined &&
@@ -524,38 +552,14 @@ const { error } = require("node:console");
524
552
  fs.appendFileSync(skip_link_file, `${links[i]}\n`);
525
553
  continue;
526
554
  }
527
-
555
+
528
556
  if (valid_url.protocol === "mailto:") {
529
557
  fs.appendFileSync(skip_link_file, `${links[i]}\n`);
530
558
  continue;
531
559
  }
532
560
 
533
- // Skip internal.hornbill.com link validation if run outside of the Hornbill network
534
- if (links[i].toLowerCase().includes("internal.hornbill.com")) {
535
- // DNS lookup internal docs endpoint
536
- const hostname = 'docs-internal.hornbill.com';
537
- let on_int_net = false;
538
- try {
539
- on_int_net = await checkHostExistsInDNS(hostname);
540
- } catch (e) {
541
- // Don't need to do anything here
542
- }
543
-
544
- if (!on_int_net) {
545
- messages[htmlFile.relativePath].push(
546
- `Outside of Hornbill network - skipping internal link validation for: ${links[i]}`,
547
- );
548
- fs.appendFileSync(skip_link_file, `${links[i]}\n`);
549
- continue;
550
- }
551
- messages[htmlFile.relativePath].push(
552
- `Inside of Hornbill network - performing internal link validation for: ${links[i]}`,
553
- );
554
- }
555
-
556
561
  // Skip if the link is excluded in the project config
557
- const skip_link = await excludeLink(links[i]);
558
- if (skip_link) {
562
+ if (excludeLink(links[i])) {
559
563
  messages[htmlFile.relativePath].push(
560
564
  `Skipping link validation for: ${links[i]}`,
561
565
  );
@@ -563,99 +567,76 @@ const { error } = require("node:console");
563
567
  }
564
568
 
565
569
  if (
566
- (links[i].toLowerCase().includes("docs.hornbill.com") ||
570
+ (links[i].toLowerCase().includes("docs.hornbill.com") ||
567
571
  links[i].toLowerCase().includes("docs-internal.hornbill.com")) &&
568
572
  !markdown_paths.relativePath.includes('/_inline/')
569
573
  ) {
570
574
  const error_message = processErrorMessage(`Hornbill Docs links should not be fully-qualified: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
571
- errors[htmlFile.relativePath].push( error_message );
575
+ errors[htmlFile.relativePath].push(error_message);
572
576
  continue;
573
577
  }
574
578
 
575
- if (
579
+ if (
576
580
  links[i].toLowerCase().includes("docs-internal.hornbill.com") &&
577
581
  markdown_paths.relativePath.includes('/_inline/') &&
578
582
  !private_repo
579
583
  ) {
580
584
  // Is the parent book in a public repo? If so, flag this as an error.
581
585
  const error_message = processErrorMessage(`Hornbill docs-internal links should not be used in public book inline content: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
582
- errors[htmlFile.relativePath].push( error_message );
586
+ errors[htmlFile.relativePath].push(error_message);
583
587
  continue;
584
588
  }
585
589
 
586
- // Use Puppeteer to validate link address works
587
- const page = await browser.newPage();
588
-
589
- try {
590
- // Set a user-agent to mimic a real browser
591
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
590
+ // Capture url in closure for the async check below
591
+ const url = links[i];
592
+ const isInternal = url.toLowerCase().includes("internal.hornbill.com");
593
+
594
+ externalChecks.push(async () => {
595
+ // For internal.hornbill.com links, check network reachability first (result cached)
596
+ if (isInternal) {
597
+ const on_int_net = await ensureIntNetCached();
598
+ if (!on_int_net) {
599
+ messages[htmlFile.relativePath].push(
600
+ `Outside of Hornbill network - skipping internal link validation for: ${url}`,
601
+ );
602
+ fs.appendFileSync(skip_link_file, `${url}\n`);
603
+ return;
604
+ }
605
+ messages[htmlFile.relativePath].push(
606
+ `Inside of Hornbill network - performing internal link validation for: ${url}`,
607
+ );
608
+ }
592
609
 
593
610
  try {
594
- let response = null;
595
-
596
- // Capture redirects and final response
597
- page.on('request', (request) => {
598
- if (request.isNavigationRequest() && request.redirectChain().length) {
599
- redirectChain = request.redirectChain().map((req) => req.url());
600
- }
601
- });
602
-
603
- // Capture the response
604
- page.on('response', (res) => {
605
- const chain = res.request().redirectChain();
606
- if (chain.length > 0) {
607
- redirectChain = chain.map((req) => req.url());
608
- lastRedirectStatus = res.status(); // Status of the last redirect
609
- }
610
- });
611
-
612
- // Try loading the URL
613
- response = await page.goto(links[i], { waitUntil: 'networkidle2', timeout: 10000 });
614
-
615
- if (response) {
616
- let status = response.status();
617
- const contentType = response.headers()['content-type'];
618
-
619
- // If it's a PDF switch to direct fetching
620
- if (contentType && contentType.includes('application/')) {
621
- status = await page.evaluate(async (url) => {
622
- const res = await fetch(url, { method: 'HEAD' });
623
- return res.status;
624
- }, links[i]);
625
- }
626
- if ((status < 200 || status > 299) && status !== 304) {
627
- if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && links[i].includes(".hornbill.com")) {
628
- // STEVEG - do nothing here, as it always returns a 403 for Hornbill sites when accessing through GitHub Actions
629
- // Works totally fine locally or in hdocpub, still trying to work out what's causing this in GitHub
630
- } else {
631
- throw `Unexpected Status Returned: ${status}`;
632
- }
611
+ const status = await fetchExternalLinkStatus(url);
612
+ if ((status < 200 || status > 299) && status !== 304) {
613
+ if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && url.includes(".hornbill.com")) {
614
+ // Always returns 403 for Hornbill sites through GitHub Actions — not a real error
633
615
  } else {
634
- fs.appendFileSync(skip_link_file, `${links[i]}\n`);
616
+ throw `Unexpected Status Returned: ${status}`;
635
617
  }
636
618
  } else {
637
- throw `No response from: ${links[i]}`;
619
+ fs.appendFileSync(skip_link_file, `${url}\n`);
638
620
  }
639
- } catch (error) {
640
- throw error;
641
- }
642
- } catch (e) {
643
- let error_message;
644
- if (e instanceof AggregateError) {
645
- error_message = processErrorMessage(`Issue with external link [${links[i]}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, links[i]);
646
- } else {
647
- error_message = processErrorMessage(`Issue with external link [${links[i]}]: ${e}`, markdown_paths.relativePath, markdown_content, links[i]);
621
+ } catch (e) {
622
+ let error_message;
623
+ if (e instanceof AggregateError) {
624
+ error_message = processErrorMessage(`Issue with external link [${url}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, url);
625
+ } else {
626
+ error_message = processErrorMessage(`Issue with external link [${url}]: ${e}`, markdown_paths.relativePath, markdown_content, url);
627
+ }
628
+ if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
629
+ warnings[htmlFile.relativePath].push(error_message);
630
+ else
631
+ errors[htmlFile.relativePath].push(error_message);
648
632
  }
649
- if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
650
- warnings[htmlFile.relativePath].push(error_message);
651
- else
652
- errors[htmlFile.relativePath].push(error_message);
653
-
654
- }
655
- // Close the headless browser tab
656
- page.close();
633
+ });
657
634
  }
658
635
  }
636
+
637
+ // Run all external HTTP checks concurrently — fetch is lightweight enough
638
+ // that uncapped concurrency is fine for the link counts seen in practice.
639
+ await Promise.all(externalChecks.map(fn => fn()));
659
640
  };
660
641
 
661
642
  const checkHostExistsInDNS = async (hostname) => {
@@ -1069,8 +1050,7 @@ const { error } = require("node:console");
1069
1050
 
1070
1051
 
1071
1052
  const global_links_checked = [];
1072
- const validateBrowser = await puppeteer.launch({ args: ['--no-sandbox'] });
1073
-
1053
+
1074
1054
  for (const key in html_to_validate) {
1075
1055
  const file = html_to_validate[key];
1076
1056
  // Check for British spellings in static HTML content
@@ -1095,7 +1075,7 @@ const { error } = require("node:console");
1095
1075
  messages[file.relativePath].push("No links found in file");
1096
1076
  } else {
1097
1077
  console.log(`\r\nChecking ${links.href.length} Links in ${file.relativePath}`);
1098
- await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, validateBrowser, global_links_checked, output_links);
1078
+ await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, global_links_checked, output_links);
1099
1079
  }
1100
1080
  if (links.img.length === 0) {
1101
1081
  messages[file.relativePath].push("No images found in file");
@@ -1107,9 +1087,6 @@ const { error } = require("node:console");
1107
1087
  await checkTags(file);
1108
1088
  }
1109
1089
 
1110
- // Close the Chromium browser instance
1111
- await validateBrowser.close();
1112
-
1113
1090
  if (gen_exclude) console.log(JSON.stringify(excl_output, null, 2));
1114
1091
 
1115
1092
  if (verbose) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "hdoc-tools",
3
- "version": "0.47.2",
3
+ "version": "0.47.3",
4
4
  "description": "Hornbill HDocBook Development Support Tool",
5
5
  "main": "hdoc.js",
6
6
  "bin": {