npm - hdoc-tools - Versions diffs - 0.47.2 → 0.47.4 - Mend

hdoc-tools 0.47.2 → 0.47.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (7) hide show

package/README.md CHANGED Viewed

@@ -35,7 +35,7 @@ Returns statistics regarding the book you are working on:
 - Number of Markdown Files in the Book
 - Number of Static HTML Files in the Book
-If the -v switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
+If the `-v` switch is provided, then more verbose output is output, which includes a list of each MD and HTML file found, the file sizes, and file-specific word count.
 The book statistics do not include counts for any externally hosted content injected into the book content using the [[INCLUDE]] tags.
@@ -43,25 +43,25 @@ The book statistics do not include counts for any externally hosted content inje
 Performs a local build of the book, validates the links and static content are present and correct and outputs as a ZIP file.
-If the -v switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
+If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
-Use the --set-version argument to set the version number of the built book.
+Use the `--set-version` argument to set the version number of the built book.
-Use the --no-color argument to remove any color control characters from the output.
+Use the `--no-color` argument to remove any color control characters from the output.
-Use the '--no-links' argument to skip link output to CLI during validation.
+Use the `--no-links` argument to skip link output to CLI during validation.
 ### validate
 Performs a minimum local build of the book, then validates the links and static content are present and correct.
-If the -v switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
+If the `-v` switch is provided, then a more verbose output is provided, which includes a list of all HTML files created and checked for embedded links and static content.
-Use the --set-version argument to set the version number of the built book.
+Use the `--set-version` argument to set the version number of the built book.
-Use the --no-color argument to remove any color control characters from the output.
+Use the `--no-color` argument to remove any color control characters from the output.
-Use the '--no-links' argument to skip link output to CLI during validation.
+Use the `--no-links` argument to skip link output to CLI during validation.
 ### serve
@@ -72,4 +72,4 @@ command `hdoc serve` and in a local browser go to the URL `http://localhost:3000
 ## Installation
-  npm install hdoc-tools -g
+> `npm install hdoc-tools -g`

package/hdoc-build-db.js CHANGED Viewed

@@ -87,41 +87,43 @@
 		return response;
 	};
-	exports.populate_redirects = (db, redirect_records, verbose = false) => {
+	exports.populate_redirects = (db, redirect_records, _verbose = false) => {
 		const response = {
 			success: true,
 			errors: [],
 			index_success_count: 0,
 		};
-		for (let i = 0; i < redirect_records.length; i++) {
-			const index_vals = [
-				redirect_records[i].url,
-				redirect_records[i].location ? redirect_records[i].location : "",
-				redirect_records[i].code,
-			];
-			const index_response = hdoc_index.insert_record(
-				db,
-				"hdoc_redirects",
-				db_schema.hdoc_redirects,
-				index_vals,
-			);
-			if (!index_response.success) {
-				response.success = false;
-				response.errors.push(
-					`Redirect record creation failed - ${redirect_records[i].url}: ${index_response.error}`,
-				);
-			} else {
-				response.index_success_count++;
+		// Prepare once, insert all in one transaction
+		const cols = db_schema.hdoc_redirects.map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
+		const stmt = db.prepare(`INSERT INTO hdoc_redirects (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
+		const run_all = db.transaction(() => {
+			for (const record of redirect_records) {
+				try {
+					stmt.run(record.url, record.location ? record.location : "", record.code);
+					response.index_success_count++;
+				} catch (e) {
+					response.success = false;
+					response.errors.push(`Redirect record creation failed - ${record.url}: ${e}`);
+				}
 			}
+		});
+		try {
+			run_all();
+		} catch (e) {
+			response.success = false;
+			response.errors.push(`Redirect index transaction failed: ${e}`);
 		}
 		console.log(
 			`\nRedirect Index Build Complete: ${response.index_success_count} document records created.`,
 		);
 		return response;
 	};
-	exports.populate_index = async (
+	exports.populate_index = (
 		db,
 		doc_id,
 		book_config,
@@ -136,124 +138,109 @@
 		if (!book_config.tags) book_config.tags = [];
-		const indexPromises = [];
-		for (let i = 0; i < index_records.length; i++) {
-			indexPromises.push(index_records[i]);
-		}
-		let curr_file = "";
-		await Promise.all(
-			indexPromises.map(async (file) => {
+		// Build a prepared statement from a schema entry once, reusing it for every row.
+		// Previously insert_record() called db.prepare() on every single insert.
+		const make_stmt = (table) => {
+			const cols = db_schema[table].map(c => c.replace(/\b(UNINDEXED|INTEGER)\b/g, "").trim());
+			return db.prepare(`INSERT INTO ${table} (${cols.join(", ")}) VALUES (${cols.map(() => "?").join(", ")})`);
+		};
+		const stmt_index = make_stmt("hdoc_index");
+		const stmt_meta = make_stmt("hdoc_meta");
+		const stmt_contrib = make_stmt("hdoc_contributors");
+		// A single transaction batches all disk flushes into one — critical for
+		// FTS5 which otherwise re-indexes on every individual insert.
+		const run_all = db.transaction(() => {
+			let curr_file = "";
+			for (const file of index_records) {
 				let index_path_name = file.relative_path.replaceAll("\\", "/");
 				if (
 					index_path_name.endsWith("/index.md") ||
 					index_path_name.endsWith("/index.html") ||
 					index_path_name.endsWith("/index.htm")
 				) {
-					index_path_name = index_path_name.substring(
-						0,
-						index_path_name.lastIndexOf("/"),
-					);
+					index_path_name = index_path_name.substring(0, index_path_name.lastIndexOf("/"));
 				}
 				index_path_name = `/${index_path_name.replace(path.extname(file.relative_path), "")}`;
-				let index_response = {
-					success: true,
-					row_id: 0,
-				};
-				let index_content_path = index_path_name;
-				if (file.index_html.id !== null)
-					index_content_path += `#${file.index_html.id}`;
+				let inserted_row_id = null;
+				const index_content_path = file.index_html.id !== null
+					? `${index_path_name}#${file.index_html.id}`
+					: index_path_name;
 				if (!file.inline) {
-					const index_vals = [
-						index_content_path,
-						doc_id,
-						book_config.audience.join(","),
-						book_config.tags.join(","),
-						file.index_html.fm_props.title,
-						file.index_html.text,
-						file.index_html.preview,
-						book_config.productFamily,
-						file.md5,
-						file.lastmod,
-						file.status,
-						file.keywords,
-					];
-					index_response = hdoc_index.insert_record(
-						db,
-						"hdoc_index",
-						db_schema.hdoc_index,
-						index_vals,
-					);
+					try {
+						const info = stmt_index.run(
+							index_content_path,
+							doc_id,
+							book_config.audience.join(","),
+							book_config.tags.join(","),
+							file.index_html.fm_props.title,
+							file.index_html.text,
+							file.index_html.preview,
+							book_config.productFamily,
+							file.md5,
+							file.lastmod,
+							file.status,
+							file.keywords,
+						);
+						inserted_row_id = info.lastInsertRowid;
+					} catch (e) {
+						console.error(`Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${e}`);
+						continue;
+					}
 				}
-				if (!index_response.success) {
-					console.error(
-						`Index record creation failed - ${doc_id}/${file.index_html.fm_props.title}: ${index_response.error}`,
-					);
-				} else {
-					if (curr_file === index_path_name) return;
-					curr_file = index_path_name;
-					// Now add metadata
-					const meta_vals = [
+				if (curr_file === index_path_name) continue;
+				curr_file = index_path_name;
+				try {
+					const meta_info = stmt_meta.run(
 						index_path_name,
 						doc_id,
 						file.metadata.contributor_count,
 						file.metadata.edit_url,
 						file.metadata.last_commit,
 						file.pdf_size,
-					];
-					const meta_response = await hdoc_index.insert_record(
-						db,
-						"hdoc_meta",
-						db_schema.hdoc_meta,
-						meta_vals,
 					);
-					if (!meta_response.success) {
-						console.error(
-							`Index metadata record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${meta_response.error}`,
+					if (verbose) {
+						console.log(`Inserted index record ${inserted_row_id}: ${doc_id} - ${file.index_html.fm_props.title}`);
+						console.log(`Inserted index metadata record for index ID: ${meta_info.lastInsertRowid}`);
+					}
+				} catch (e) {
+					console.error(`Index metadata record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
+					continue;
+				}
+				for (const contrib of file.contributors) {
+					try {
+						const cont_info = stmt_contrib.run(
+							index_path_name,
+							doc_id,
+							contrib.login,
+							contrib.name,
+							contrib.avatar_url,
+							contrib.html_url,
 						);
-					} else {
 						if (verbose) {
-							console.log(
-								`Inserted index record ${index_response.row_id}: ${doc_id} - ${file.index_html.fm_props.title}`,
-							);
-							console.log(
-								`Inserted index metadata record for index ID: ${meta_response.row_id}`,
-							);
-						}
-						// Now add contributor records
-						for (let j = 0; j < file.contributors.length; j++) {
-							const contrib_vals = [
-								index_path_name,
-								doc_id,
-								file.contributors[j].login,
-								file.contributors[j].name,
-								file.contributors[j].avatar_url,
-								file.contributors[j].html_url,
-							];
-							const cont_response = await hdoc_index.insert_record(
-								db,
-								"hdoc_contributors",
-								db_schema.hdoc_contributors,
-								contrib_vals,
-							);
-							if (!cont_response.success) {
-								console.error(
-									`Index document contributor record creation failed - ${doc_id}/${index_response.row_id}/${file.index_html.fm_props.title}: ${cont_response.error}`,
-								);
-								continue;
-							}
-							if (verbose) {
-								console.log(
-									`Inserted document contributor record ${cont_response.row_id}`,
-								);
-							}
+							console.log(`Inserted document contributor record ${cont_info.lastInsertRowid}`);
 						}
-						response.index_success_count++;
+					} catch (e) {
+						console.error(`Index document contributor record creation failed - ${doc_id}/${inserted_row_id}/${file.index_html.fm_props.title}: ${e}`);
 					}
 				}
-			}),
-		);
+				response.index_success_count++;
+			}
+		});
+		try {
+			run_all();
+		} catch (e) {
+			response.error = e.message;
+			console.error(`Index build transaction failed: ${e}`);
+			return response;
+		}
 		response.success = true;
 		console.log(

package/hdoc-build.js CHANGED Viewed

@@ -252,6 +252,11 @@
 			// Render markdown into HTML
 			html_txt = md.render(md_txt);
+			// Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
+			// Replaces separate wrapHContent + getFirstHTMLHeading + get_html_read_time calls.
+			const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
+			html_txt = extracted.html;
 			// Parse frontmatter properties from the YAML block
 			let fm_contains_title = false;
 			let fm_contains_reading_time = false;
@@ -297,17 +302,12 @@
 			// Title from heading if not in frontmatter
 			if (!fm_contains_title) {
-				const html_heading = hdoc.getFirstHTMLHeading(
-					html_txt,
-					h_tags_to_search,
-				);
-				if (html_heading?.[0]?.children?.[0]?.data) {
+				if (extracted.firstHeadingText) {
 					fm_headers.push({
 						id: "title",
-						value: html_heading[0].children[0].data.trim(),
+						value: extracted.firstHeadingText,
 					});
-					doc_title = html_heading[0].children[0].data.trim();
+					doc_title = extracted.firstHeadingText;
 				} else if (
 					file_path.name !== "description_ext.md" &&
 					file_path.name !== "article_ext.md" &&
@@ -320,24 +320,19 @@
 			}
 			// Description from first paragraph if not in frontmatter
-			if (!fm_contains_description) {
-				const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
-				if (html_p_tag?.[0]?.children?.[0]?.data) {
-					fm_headers.push({
-						id: "description",
-						value:
-							`${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
-					});
-				}
+			if (!fm_contains_description && extracted.firstParagraphText) {
+				fm_headers.push({
+					id: "description",
+					value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
+				});
 			}
 			// Reading time from content if not in frontmatter
 			if (!fm_contains_reading_time) {
-				const read_time_mins = hdoc.get_html_read_time(html_txt);
-				book_read_time += read_time_mins;
+				book_read_time += extracted.readTimeMins;
 				fm_headers.push({
 					id: "reading-time",
-					value: read_time_mins,
+					value: extracted.readTimeMins,
 				});
 			}
 		} else {
@@ -348,6 +343,12 @@
 			// Check if we have a frontmatter comment
 			html_fm = hdoc.getHTMLFrontmatterHeader(html_txt);
+			// Single pass: wrap h2/h3 divs + extract heading, paragraph, read-time.
+			// Must run after getHTMLFrontmatterHeader (which reads the top-level comment)
+			// but before any per-field extraction; the resulting html replaces html_txt.
+			const extracted = hdoc.wrapAndExtract(html_txt, h_tags_to_search);
+			html_txt = extracted.html;
 			if (Object.keys(html_fm.fm_properties).length > 0) {
 				existing_fm_headers = true;
@@ -374,9 +375,8 @@
 				// Is reading-time in the fm headers?
 				if (html_fm.fm_properties["reading-time"] === undefined) {
-					const read_time_mins = hdoc.get_html_read_time(html_txt);
-					book_read_time += read_time_mins;
-					html_fm.fm_properties["reading-time"] = read_time_mins;
+					book_read_time += extracted.readTimeMins;
+					html_fm.fm_properties["reading-time"] = extracted.readTimeMins;
 				}
 				for (const key in html_fm.fm_properties) {
@@ -397,21 +397,13 @@
 					file_path.name !== "article_ext.md" &&
 					file_path.name !== "internal_ext.md"
 				) {
-					// No frontmatter title found in properties - go get title from h tags in html
-					const html_heading = hdoc.getFirstHTMLHeading(
-						html_txt,
-						h_tags_to_search,
-					);
-					if (html_heading?.[0]?.children?.[0]?.data) {
-						// We've found a heading tag, add that as a title to the existing frontmatter properties
+					if (extracted.firstHeadingText) {
 						fm_headers.push({
 							id: "title",
-							value: html_heading[0].children[0].data,
+							value: extracted.firstHeadingText,
 						});
-						doc_title = html_heading[0].children[0].data;
+						doc_title = extracted.firstHeadingText;
 					} else {
-						// No header tag, no frontmatter title, output a warning
 						console.info(
 							`[WARNING] No frontmatter title property, or ${h_tags_to_search.join(
 								", ",
@@ -426,12 +418,10 @@
 					html_fm.fm_properties.description !== undefined
 				) {
 					if (html_fm.fm_properties.description === "") {
-						const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
-						if (html_p_tag?.[0]?.children?.[0]?.data) {
+						if (extracted.firstParagraphText) {
 							fm_headers.push({
 								id: "description",
-								value:
-									`${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
+								value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
 							});
 						}
 					} else {
@@ -440,30 +430,22 @@
 							value: html_fm.fm_properties.description.trim(),
 						});
 					}
-				} else {
-					const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
-					if (html_p_tag?.[0]?.children?.[0]?.data) {
-						fm_headers.push({
-							id: "description",
-							value:
-								`${doc_title}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
-						});
-					}
+				} else if (extracted.firstParagraphText) {
+					fm_headers.push({
+						id: "description",
+						value: `${doc_title}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
+					});
 				}
 			} else {
 				// We have no frontmatter headers, get and build one from the html headings
-				const html_heading = hdoc.getFirstHTMLHeading(
-					html_txt,
-					h_tags_to_search,
-				);
 				let doc_title_local = "";
 				// Add the title
-				if (html_heading?.[0]?.children?.[0]?.data) {
+				if (extracted.firstHeadingText) {
 					fm_headers.push({
 						id: "title",
-						value: html_heading[0].children[0].data,
+						value: extracted.firstHeadingText,
 					});
-					doc_title_local = html_heading[0].children[0].data;
+					doc_title_local = extracted.firstHeadingText;
 					doc_title = doc_title_local;
 				} else if (
 					file_path.name !== "description_ext.md" &&
@@ -478,19 +460,16 @@
 				}
 				// Add the reading time
-				const read_time_mins = hdoc.get_html_read_time(html_txt);
-				book_read_time += read_time_mins;
+				book_read_time += extracted.readTimeMins;
 				fm_headers.push({
 					id: "reading-time",
-					value: read_time_mins,
+					value: extracted.readTimeMins,
 				});
-				const html_p_tag = hdoc.getFirstHTMLHeading(html_txt, ["p"]);
-				if (html_p_tag?.[0]?.children?.[0]?.data) {
+				if (extracted.firstParagraphText) {
 					fm_headers.push({
 						id: "description",
-						value:
-							`${doc_title_local}: ${html_p_tag[0].children[0].data.split(".")[0]}.`.trim(),
+						value: `${doc_title_local}: ${extracted.firstParagraphText.split(".")[0]}.`.trim(),
 					});
 				}
 			}
@@ -675,9 +654,6 @@
 		}
 		if (pdf_size > 0) pdf_created++;
-		// Wrap h2 and h3 tags, plus content, in id'd divs
-		html_txt = hdoc.wrapHContent(html_txt);
 		if (inline_content) html_txt = `${fm_header_str}\n${html_txt}`;
 		else html_txt = `${fm_header_str}\n${doc_header}\n${html_txt}`;
@@ -732,7 +708,7 @@
 	const tidy_code_tags = (markdown, file) => {
 		let clean_markdown = markdown;
-		const json_to_tidy = clean_markdown.match(/```json[\r\n](\s|.)*?```/g);
+		const json_to_tidy = clean_markdown.match(/```json[\r\n][\s\S]*?```/g);
 		if (json_to_tidy && json_to_tidy.length > 0) {
 			for (let i = 0; i < json_to_tidy.length; i++) {
 				if (json_to_tidy[i] !== "") {
@@ -754,7 +730,7 @@
 			}
 		}
-		const xml_to_tidy = clean_markdown.match(/```xml[\r\n](\s|.)*?```/g);
+		const xml_to_tidy = clean_markdown.match(/```xml[\r\n][\s\S]*?```/g);
 		if (xml_to_tidy && xml_to_tidy.length > 0) {
 			for (let i = 0; i < xml_to_tidy.length; i++) {
 				if (xml_to_tidy[i] !== "") {
@@ -1229,7 +1205,7 @@
 		for (let i = 0; i < md_files.length; i++) {
 			mdPromiseArray.push(md_files[i]);
 		}
-		const chunkSize = 3;
+		const chunkSize = 8;
 		for (let i = 0; i < mdPromiseArray.length; i += chunkSize) {
 			const chunk = mdPromiseArray.slice(i, i + chunkSize);
 			// do whatever
@@ -1357,7 +1333,7 @@
 			process.exit(1);
 		}
 		// Populate primary index tables
-		const index = await hdoc_build_db.populate_index(
+		const index = hdoc_build_db.populate_index(
 			db.db,
 			doc_id,
 			hdocbook_config,

package/hdoc-db.js CHANGED Viewed

@@ -1,4 +1,5 @@
 (() => {
+	const cheerio = require("cheerio");
 	const path = require("node:path");
 	const hdoc = require(path.join(__dirname, "hdoc-module.js"));
@@ -63,23 +64,44 @@
 			sections: [],
 		};
-		// Get frontmatter properties
-		const fm_headers = hdoc.getHTMLFrontmatterHeader(html_txt);
-		response.fm_props = fm_headers.fm_properties;
+		// Single parse covers frontmatter extraction, full-text, and preview —
+		// previously three separate cheerio.load() calls.
+		const $ = cheerio.load(html_txt, { decodeEntities: false });
-		// Convert HTML into plain text
-		response.text = hdoc.html_to_text(html_txt);
+		// Extract frontmatter properties from the leading HTML comment
+		if ($._root?.children && Array.isArray($._root.children)) {
+			for (const child of $._root.children) {
+				if (child.type === "comment" && child.data?.startsWith("[[FRONTMATTER")) {
+					for (const line of child.data.split(/\r?\n/)) {
+						if (line.includes(":")) {
+							const parts = line.split(/:(.*)/s);
+							if (parts.length > 1) {
+								const key = parts[0].trim().toLowerCase();
+								let val = parts[1].trim();
+								if (/^".*"$/.test(val)) val = val.slice(1, -1);
+								if (key === "title") {
+									val = val.replace(
+										/&amp;|&lt;|&gt;|&quot;|&#39;|&apos;|&#(\d+);|&#x([0-9a-fA-F]+);/g,
+										(m, dec, hex) => dec ? String.fromCharCode(+dec) : hex ? String.fromCharCode(parseInt(hex, 16)) : ({ "&amp;": "&", "&lt;": "<", "&gt;": ">", "&quot;": '"', "&#39;": "'", "&apos;": "'" })[m],
+									);
+								}
+								response.fm_props[key] = val;
+							}
+						}
+					}
+					break;
+				}
+			}
+		}
+		// Full-document plain text for search indexing
+		const text = $("body").text();
+		// Preview: first paragraph texts joined, then truncated
+		let preview = $("p").map((_i, el) => $(el).text()).get().join("\n");
+		preview = hdoc.truncate_string(preview, 200, true).replace(/(?:\r\n|\r|\n)/g, " ");
-		// Convert HTML into preview text
-		let preview = hdoc.html_to_text(html_txt, { baseElement: "p" });
-		preview = hdoc
-			.truncate_string(preview, 200, true)
-			.replace(/(?:\r\n|\r|\n)/g, " ");
-		response.sections.push({
-			text: response.text,
-			preview: preview,
-		});
-		//}
+		response.sections.push({ text, preview });
 		return response;
 	};
 })();

package/hdoc-module.js CHANGED Viewed

@@ -48,6 +48,8 @@
 		}
 	};
+	exports.fetchWithRetry = fetchWithRetry;
 	exports.content_type_for_ext = (ext) => {
 		switch (ext) {
 			case ".z":
@@ -220,15 +222,9 @@
 	// Looks for h1 tags first, then hX, hY, hZ in order
 	exports.getFirstHTMLHeading = (html_body, h_to_search = ["h1"]) => {
 		const $ = cheerio.load(html_body);
-		for (let i = 0; i < h_to_search.length; i++) {
-			const heading = $(h_to_search[i])
-				.map(function (i) {
-					return $(this);
-				})
-				.get();
-			if (heading.length > 0) {
-				return heading[0];
-			}
+		for (const tag of h_to_search) {
+			const el = $(tag).first();
+			if (el.length > 0) return el;
 		}
 		return false;
 	};
@@ -277,6 +273,58 @@
 		return `<html><head></head><body>${result}</body></html>`;
 	};
+	// Combined single-pass version of wrapHContent + getFirstHTMLHeading + get_html_read_time.
+	// Iterates body contents once to wrap h2/h3 divs AND extract the first matching heading text,
+	// first paragraph text, and reading-time estimate — avoiding 3 extra cheerio.load() calls.
+	exports.wrapAndExtract = (htmlContent, h_tags_to_search = ["h1"]) => {
+		const $ = cheerio.load(htmlContent, { decodeEntities: false });
+		let result = '';
+		let inH2 = false;
+		let inH3 = false;
+		let firstHeadingText = null;
+		let firstParagraphText = null;
+		$('body').contents().each(function() {
+			const tagName = this.type === 'tag' ? this.name?.toLowerCase() : null;
+			const text = tagName ? $(this).text().trim() : null;
+			if (firstHeadingText === null && tagName && h_tags_to_search.includes(tagName)) {
+				firstHeadingText = text;
+			}
+			if (firstParagraphText === null && tagName === 'p') {
+				firstParagraphText = text;
+			}
+			if (tagName === 'h2') {
+				if (inH3) { result += '</div>'; inH3 = false; }
+				if (inH2) { result += '</div>'; inH2 = false; }
+				result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
+				inH2 = true;
+			} else if (tagName === 'h3') {
+				if (inH3) { result += '</div>'; inH3 = false; }
+				result += `<div id="${makeAnchorIdFriendly(text)}">${$.html(this)}`;
+				inH3 = true;
+			} else {
+				result += $.html(this);
+			}
+		});
+		if (inH3) result += '</div>';
+		if (inH2) result += '</div>';
+		// Word count re-uses the already-parsed DOM — no extra cheerio.load()
+		const bodyText = $("body").text();
+		const wordCount = bodyText.trim().split(/\s+/).filter(Boolean).length;
+		const readTimeMins = wordCount === 0 ? 0 : (Math.round(wordCount / 200) || 1);
+		return {
+			html: `<html><head></head><body>${result}</body></html>`,
+			firstHeadingText,
+			firstParagraphText,
+			readTimeMins,
+		};
+	};
 	exports.getIDDivs = (html_body) => {
 		const $ = cheerio.load(html_body, {
 			decodeEntities: false,

package/hdoc-validate.js CHANGED Viewed

@@ -8,7 +8,6 @@ const { error } = require("node:console");
 	const path = require("node:path");
 	const hdoc = require(path.join(__dirname, "hdoc-module.js"));
 	const translator = require("american-british-english-translator");
-	const puppeteer = require("puppeteer");
 	const spellcheck_options = {
 		british: true,
@@ -26,10 +25,11 @@ const { error } = require("node:console");
 	let private_repo = false;
 	let redirects = {};
 	let skip_link_file = '';
+	let _on_int_net_cached = null; // null = not yet checked; cached after first DNS lookup
 	const exclude_h1_count = {};
 	const exclude_spellcheck_output = [];
-	const excludeLink = async (url) => {
+	const excludeLink = (url) => {
 		if (exclude_links[url]) return true;
 		for (let key in exclude_links) {
 			if (Object.hasOwn(exclude_links, key)) {
@@ -465,18 +465,67 @@ const { error } = require("node:console");
 		return returnPaths;
 	}
-	const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, browser, global_links_checked, output_links) => {
+	// Headers that mimic a real Chrome browser request — sites doing bot detection
+	// check far more than just User-Agent (Accept, Sec-Fetch-*, client hints, etc.).
+	const _fetch_headers = {
+		'User-Agent':      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+		'Accept':          'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+		'Accept-Language': 'en-US,en;q=0.9',
+		'Accept-Encoding': 'gzip, deflate, br',
+		'Cache-Control':   'no-cache',
+		'Pragma':          'no-cache',
+		'Sec-Fetch-Dest':  'document',
+		'Sec-Fetch-Mode':  'navigate',
+		'Sec-Fetch-Site':  'none',
+		'Sec-Fetch-User':  '?1',
+		'Sec-Ch-Ua':       '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+		'Sec-Ch-Ua-Mobile':   '?0',
+		'Sec-Ch-Ua-Platform': '"Windows"',
+		'Upgrade-Insecure-Requests': '1',
+	};
+	// Checks a single external URL by sending a HEAD request, falling back to GET
+	// if the server returns 405 (Method Not Allowed) or 404 (some servers, e.g.
+	// marketplace.visualstudio.com, return 404 for HEAD even when the page exists).
+	// Retries up to 5 times on transient errors (5xx, 429, network failures).
+	// Returns the HTTP status code.
+	const fetchExternalLinkStatus = async (url) => {
+		const opts = { method: 'HEAD', headers: _fetch_headers, timeoutMs: 10000, redirect: 'follow' };
+		const resp = await hdoc.fetchWithRetry(url, opts);
+		if (resp.status === 404 || resp.status === 405) {
+			const getResp = await hdoc.fetchWithRetry(url, { ...opts, method: 'GET' });
+			return getResp.status;
+		}
+		return resp.status;
+	};
+	const checkLinks = async (source_path, htmlFile, links, hdocbook_config, hdocbook_project, global_links_checked, output_links) => {
 		const markdown_paths = getMDPathFromHtmlPath(htmlFile);
 		const markdown_content = fs.readFileSync(markdown_paths.markdownPath, 'utf8');
+		// Resolve the "are we on the internal network?" question once per process
+		// rather than once per internal.hornbill.com link.
+		const ensureIntNetCached = async () => {
+			if (_on_int_net_cached === null) {
+				try {
+					_on_int_net_cached = await checkHostExistsInDNS('docs-internal.hornbill.com');
+				} catch (_e) {
+					_on_int_net_cached = false;
+				}
+			}
+			return _on_int_net_cached;
+		};
+		// Collect external links that need an HTTP check so they can be run
+		// concurrently rather than one-at-a-time.
+		const externalChecks = [];
 		for (let i = 0; i < links.length; i++) {
-			// Validate that link is a valid URL first
 			if (output_links) console.log(` - ${links[i]}`);
 			if (exclude_links[links[i]]) continue;
 			if (global_links_checked.includes(links[i])) continue;
 			global_links_checked.push(links[i]);
 			const valid_url = hdoc.valid_url(links[i]);
 			if (!valid_url) {
 				// Could be a relative path, check
@@ -509,7 +558,7 @@ const { error } = require("node:console");
 				messages[htmlFile.relativePath].push(
 					`Link is a properly formatted external URL: ${links[i]}`,
 				);
 				// Skip if it's the auto-generated edit url, as these could be part of a private repo which would return a 404
 				if (
 					hdocbook_config.publicSource !== undefined &&
@@ -524,38 +573,14 @@ const { error } = require("node:console");
 					fs.appendFileSync(skip_link_file, `${links[i]}\n`);
 					continue;
 				}
 				if (valid_url.protocol === "mailto:") {
 					fs.appendFileSync(skip_link_file, `${links[i]}\n`);
 					continue;
 				}
-				// Skip internal.hornbill.com link validation if run outside of the Hornbill network
-				if (links[i].toLowerCase().includes("internal.hornbill.com")) {
-					// DNS lookup internal docs endpoint
-					const hostname = 'docs-internal.hornbill.com';
-					let on_int_net = false;
-					try {
-						on_int_net = await checkHostExistsInDNS(hostname);
-					} catch (e) {
-						// Don't need to do anything here
-					}
-					if (!on_int_net) {
-						messages[htmlFile.relativePath].push(
-							`Outside of Hornbill network - skipping internal link validation for: ${links[i]}`,
-						);
-						fs.appendFileSync(skip_link_file, `${links[i]}\n`);
-						continue;
-					}
-					messages[htmlFile.relativePath].push(
-						`Inside of Hornbill network - performing internal link validation for: ${links[i]}`,
-					);
-				}
 				// Skip if the link is excluded in the project config
-				const skip_link = await excludeLink(links[i]);
-				if (skip_link) {
+				if (excludeLink(links[i])) {
 					messages[htmlFile.relativePath].push(
 						`Skipping link validation for: ${links[i]}`,
 					);
@@ -563,99 +588,76 @@ const { error } = require("node:console");
 				}
 				if (
-					(links[i].toLowerCase().includes("docs.hornbill.com") ||
+					(links[i].toLowerCase().includes("docs.hornbill.com") ||
 					links[i].toLowerCase().includes("docs-internal.hornbill.com")) &&
 					!markdown_paths.relativePath.includes('/_inline/')
 				) {
 					const error_message = processErrorMessage(`Hornbill Docs links should not be fully-qualified: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
-					errors[htmlFile.relativePath].push( error_message );
+					errors[htmlFile.relativePath].push(error_message);
 					continue;
 				}
-				if (
+				if (
 					links[i].toLowerCase().includes("docs-internal.hornbill.com") &&
 					markdown_paths.relativePath.includes('/_inline/') &&
 					!private_repo
 				) {
 					// Is the parent book in a public repo? If so, flag this as an error.
 					const error_message = processErrorMessage(`Hornbill docs-internal links should not be used in public book inline content: ${links[i]}`, markdown_paths.relativePath, markdown_content, links[i]);
-					errors[htmlFile.relativePath].push( error_message );
+					errors[htmlFile.relativePath].push(error_message);
 					continue;
 				}
-				// Use Puppeteer to validate link address works
-				const page = await browser.newPage();
-				try {
-					// Set a user-agent to mimic a real browser
-					await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36');
+				// Capture url in closure for the async check below
+				const url = links[i];
+				const isInternal = url.toLowerCase().includes("internal.hornbill.com");
+				externalChecks.push(async () => {
+					// For internal.hornbill.com links, check network reachability first (result cached)
+					if (isInternal) {
+						const on_int_net = await ensureIntNetCached();
+						if (!on_int_net) {
+							messages[htmlFile.relativePath].push(
+								`Outside of Hornbill network - skipping internal link validation for: ${url}`,
+							);
+							fs.appendFileSync(skip_link_file, `${url}\n`);
+							return;
+						}
+						messages[htmlFile.relativePath].push(
+							`Inside of Hornbill network - performing internal link validation for: ${url}`,
+						);
+					}
 					try {
-						let response = null;
-						// Capture redirects and final response
-						page.on('request', (request) => {
-							if (request.isNavigationRequest() && request.redirectChain().length) {
-								redirectChain = request.redirectChain().map((req) => req.url());
-							}
-						});
-						// Capture the response
-						page.on('response', (res) => {
-							const chain = res.request().redirectChain();
-							if (chain.length > 0) {
-								redirectChain = chain.map((req) => req.url());
-								lastRedirectStatus = res.status(); // Status of the last redirect
-							}
-						});
-						// Try loading the URL
-						response = await page.goto(links[i], { waitUntil: 'networkidle2', timeout: 10000 });
-						if (response) {
-							let status = response.status();
-							const contentType = response.headers()['content-type'];
-							// If it's a PDF switch to direct fetching
-							if (contentType && contentType.includes('application/')) {
-								status = await page.evaluate(async (url) => {
-									const res = await fetch(url, { method: 'HEAD' });
-									return res.status;
-								}, links[i]);
-							}
-							if ((status < 200 || status > 299) && status !== 304) {
-								if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && links[i].includes(".hornbill.com")) {
-									// STEVEG - do nothing here, as it always returns a 403 for Hornbill sites when accessing through GitHub Actions
-									// Works totally fine locally or in hdocpub, still trying to work out what's causing this in GitHub
-								} else {
-									throw `Unexpected Status Returned: ${status}`;
-								}
+						const status = await fetchExternalLinkStatus(url);
+						if ((status < 200 || status > 299) && status !== 304) {
+							if (process.env.GITHUB_ACTIONS === 'true' && status === 403 && url.includes(".hornbill.com")) {
+								// Always returns 403 for Hornbill sites through GitHub Actions — not a real error
 							} else {
-								fs.appendFileSync(skip_link_file, `${links[i]}\n`);
+								throw `Unexpected Status Returned: ${status}`;
 							}
 						} else {
-							throw `No response from: ${links[i]}`;
+							fs.appendFileSync(skip_link_file, `${url}\n`);
 						}
-					} catch (error) {
-						throw error;
-					}
-				} catch (e) {
-					let error_message;
-					if (e instanceof AggregateError) {
-						error_message = processErrorMessage(`Issue with external link [${links[i]}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, links[i]);
-					} else {
-						error_message = processErrorMessage(`Issue with external link [${links[i]}]: ${e}`, markdown_paths.relativePath, markdown_content, links[i]);
+					} catch (e) {
+						let error_message;
+						if (e instanceof AggregateError) {
+							error_message = processErrorMessage(`Issue with external link [${url}]: ${e.message} - ${JSON.stringify(e.errors)}`, markdown_paths.relativePath, markdown_content, url);
+						} else {
+							error_message = processErrorMessage(`Issue with external link [${url}]: ${e}`, markdown_paths.relativePath, markdown_content, url);
+						}
+						if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
+							warnings[htmlFile.relativePath].push(error_message);
+						else
+							errors[htmlFile.relativePath].push(error_message);
 					}
-					if (hdocbook_project.validation.external_link_warnings || process.env.GITHUB_ACTIONS === 'true')
-						warnings[htmlFile.relativePath].push(error_message);
-					else
-						errors[htmlFile.relativePath].push(error_message);
-				}
-				// Close the headless browser tab
-				page.close();
+				});
 			}
 		}
+		// Run all external HTTP checks concurrently — fetch is lightweight enough
+		// that uncapped concurrency is fine for the link counts seen in practice.
+		await Promise.all(externalChecks.map(fn => fn()));
 	};
 	const checkHostExistsInDNS = async (hostname) => {
@@ -1069,8 +1071,7 @@ const { error } = require("node:console");
 		const global_links_checked = [];
-		const validateBrowser = await puppeteer.launch({ args: ['--no-sandbox'] });
 		for (const key in html_to_validate) {
 			const file = html_to_validate[key];
 			// Check for British spellings in static HTML content
@@ -1095,7 +1096,7 @@ const { error } = require("node:console");
 				messages[file.relativePath].push("No links found in file");
 			} else {
 				console.log(`\r\nChecking ${links.href.length} Links in ${file.relativePath}`);
-				await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, validateBrowser, global_links_checked, output_links);
+				await checkLinks(source_path, file, links.href, hdocbook_config, hdocbook_project, global_links_checked, output_links);
 			}
 			if (links.img.length === 0) {
 				messages[file.relativePath].push("No images found in file");
@@ -1107,9 +1108,6 @@ const { error } = require("node:console");
 			await checkTags(file);
 		}
-		// Close the Chromium browser instance
-		await validateBrowser.close();
 		if (gen_exclude) console.log(JSON.stringify(excl_output, null, 2));
 		if (verbose) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "hdoc-tools",
-	"version": "0.47.2",
+	"version": "0.47.4",
 	"description": "Hornbill HDocBook Development Support Tool",
 	"main": "hdoc.js",
 	"bin": {