markdown_link_checker_sc 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +18 -0
  3. package/index.js +448 -0
  4. package/package.json +27 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Hamish Willee
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # markdown link checker sc
2
+
3
+ ALPHA - Mostly just attempting better handling of internal links.
4
+
5
+ Markdown link checker in node.
6
+ Better handling of internal link checking.
7
+
8
+
9
+ The current version only checks internal (relative) links; absolute URLs are collected but not verified.
10
+
11
+ ```
12
+ Usage: index [options]
13
+
14
+ Options:
15
+ -d, --directory [directory] The directory to search for markdown and html files (default: is current directory)
16
+ -h, --headingAnchorSlugify [value] Slugify approach for turning markdown headings into heading anchors. Currently support vuepress only and always (default: "vuepress")
17
+ --help display help for command
18
+ ```
package/index.js ADDED
@@ -0,0 +1,448 @@
1
+ #!/usr/bin/env node
2
+
3
+ const fs = require("fs");
4
+ const path = require("path");
5
+ const { program } = require("commander");
6
+
7
/**
 * Converts the optional value of a boolean-like CLI flag into a real boolean.
 *
 * Commander hands the raw text to this parser, so without it `-t false`
 * would store the truthy string "false" and the option could never be
 * switched off.
 *
 * @param {string} value - Raw option argument from the command line.
 * @returns {boolean} false for "false", "0", "no" or "off" (any case), otherwise true.
 */
const parseBooleanOption = (value) =>
  !["false", "0", "no", "off"].includes(String(value).trim().toLowerCase());

// Command-line interface definition. Defaults: current working directory,
// "vuepress" slugification, and the .html -> .md fallback enabled.
program
  .option(
    "-d, --directory [directory]",
    "The directory to search for markdown and html files",
    process.cwd()
  )
  .option(
    "-h, --headingAnchorSlugify [value]",
    "Slugify approach for turning markdown headings into heading anchors. Currently support vuepress only and always",
    "vuepress"
  )
  .option(
    "-t, --tryMarkdownforHTML [value]",
    "Try a markdown file extension check if a link to HTML fails.",
    parseBooleanOption,
    true
  )
  .parse(process.argv);

// Parsed option values, read by the rest of the script.
const options = program.opts();
26
+
27
/**
 * Tells whether a path names a markdown file.
 *
 * @param {string} file - File name or path.
 * @returns {boolean} true when the extension is ".md" (case-insensitive).
 */
const isMarkdown = (file) => {
  const extension = path.extname(file);
  return extension.toLowerCase() === ".md";
};
28
/**
 * Tells whether a path names an HTML file.
 *
 * @param {string} file - File name or path.
 * @returns {boolean} true when the extension is ".html" (case-insensitive).
 */
const isHtml = (file) => {
  const extension = path.extname(file);
  return extension.toLowerCase() === ".html";
};
29
/**
 * Replaces every run of whitespace in a string with a single delimiter.
 *
 * @param {string} str - Text to transform.
 * @param {boolean} underscore - Truthy to use "_" as the delimiter, otherwise "-".
 * @returns {string} The text with whitespace runs replaced.
 */
const replaceDelimiter = (str, underscore) => {
  const delimiter = underscore ? "_" : "-";
  return str.replace(/\s+/g, delimiter);
};
31
+
32
/**
 * Builds a heading anchor from heading text: the text is lower-cased, every
 * run of characters other than ASCII letters and digits (slashes, spaces,
 * underscores, punctuation, ...) becomes a single hyphen, and hyphens at
 * either end are dropped.
 *
 * @param {string} str - Heading text.
 * @returns {string} The slug; empty when the text has no ASCII letters or digits.
 */
function slugifyVuepress(str) {
  const lowered = str.toLowerCase();
  const hyphenated = lowered.replace(/[^A-Za-z0-9]+/g, "-");
  return hyphenated.replace(/^-+|-+$/g, "");
}
45
+
46
/**
 * Turns a markdown ATX heading line ("# Title", "## Title", ...) into its
 * heading anchor.
 *
 * @param {string} line - One line of a markdown file.
 * @param {string} slugifyApproach - Currently ignored; the vuepress slugifier is always used.
 * @returns {string|null} The slug of the heading text, or null when the line is not a heading.
 */
const processHeading = (line, slugifyApproach) => {
  const headingMatch = line.match(/^#+\s+(.+)$/);
  if (!headingMatch) {
    return null;
  }
  const headingText = headingMatch[1];
  return slugifyVuepress(headingText);
};
54
+
55
/**
 * Extracts every link found on one line and appends it to the matching
 * caller-owned list. Handles markdown links and images (`[t](url)`,
 * `![t](url)`, optionally with a quoted title), vuepress plugin links
 * (`@[youtube](id)`) and HTML `<a>` / `<img>` tags.
 *
 * Each link is stored as `{ linkText, linkUrl, linkAnchor }`, where `linkUrl`
 * is the target before the first "#" and `linkAnchor` is the part after it
 * (null when absent or empty).
 *
 * Classification, applied identically to markdown and HTML links:
 *  - `http://`, `https://` or protocol-relative `//` -> absolute list
 *    (image list for images);
 *  - any other URI scheme (`mailto:`, `ftp:`, `tel:` ...) and non-URL vuepress
 *    plugin targets -> `unHandledLinkTypes`;
 *  - image syntax/tag, or a target ending in a known image extension ->
 *    `relativeImageLinks`;
 *  - everything else -> `relativeLinks`.
 *
 * @param {string} line - One line of the file being scanned.
 * @param {object[]} relativeLinks - Receives relative page links.
 * @param {object[]} relativeImageLinks - Receives relative image links.
 * @param {object[]} absoluteLinks - Receives absolute page links.
 * @param {object[]} absoluteImageLinks - Receives absolute image links.
 * @param {object[]} [unHandledLinkTypes] - Receives links that are not checked.
 * @returns {object} The same five lists, keyed by parameter name.
 */
const processMarkdownLink = (
  line,
  relativeLinks,
  relativeImageLinks,
  absoluteLinks,
  absoluteImageLinks,
  unHandledLinkTypes = []
) => {
  const imageExtensions = [".png", ".jpg", ".jpeg", ".gif", ".webp"];
  // A real scheme test, so relative files such as "http_api.md" or
  // "mailto_setup.md" are no longer mistaken for URLs.
  const absoluteUrlRegex = /^(?:https?:)?\/\//i;
  // Two or more scheme characters, so a Windows drive letter is not a scheme.
  const otherSchemeRegex = /^[a-z][a-z0-9+.-]+:/i;

  // Builds the stored link object from the raw target and its text.
  const buildLink = (linkText, target) => {
    const [url, anchor] = target.split("#");
    return { linkText, linkUrl: url.trim(), linkAnchor: anchor ? anchor : null };
  };

  // Appends the link to the list chosen by the classification rules above.
  const addLink = (link, isImage, isPluginLink) => {
    const { linkUrl } = link;
    if (absoluteUrlRegex.test(linkUrl)) {
      (isImage ? absoluteImageLinks : absoluteLinks).push(link);
    } else if (isPluginLink || otherSchemeRegex.test(linkUrl)) {
      // Not going to handle these (yet).
      unHandledLinkTypes.push(link);
    } else if (
      isImage ||
      imageExtensions.some((ext) => linkUrl.toLowerCase().endsWith(ext))
    ) {
      // Also catches an image file linked with plain link syntax.
      relativeImageLinks.push(link);
    } else {
      relativeLinks.push(link);
    }
  };

  // Group 1: "!" (image) or "@" (vuepress plugin), group 2: text, group 3: target.
  // The optional trailing group skips a quoted link title.
  const markdownLinkRegex =
    /([!@]?)\[([^\]]+)\]\((\S+?)(?:\s+(?:"[^"]*"|'[^']*'))?\)/g;

  for (const match of line.matchAll(markdownLinkRegex)) {
    const link = buildLink(match[2], match[3]);
    addLink(link, match[1] === "!", match[1] === "@");
  }

  // Group 1: tag name, group 2: raw attribute text, group 3: inner text of a
  // simple <a>text</a>. Attributes are parsed separately so that tags without
  // a title or closing tag (typically <img>) are still found.
  const htmlTagRegex = /<(a|img)(?=[\s>\/])([^>]*)>(?:([^<]+)<\/\1>)?/gi;
  const targetAttributeRegex = /(?:^|\s)(?:href|src)\s*=\s*(["'])(.+?)\1/i;
  const titleAttributeRegex = /(?:^|\s)title\s*=\s*(["'])(.+?)\1/i;

  for (const match of line.matchAll(htmlTagRegex)) {
    const attributes = match[2];
    const target = attributes.match(targetAttributeRegex);
    if (!target) {
      continue; // e.g. <a id="anchor"> has nothing to check
    }
    const title = attributes.match(titleAttributeRegex);
    const linkText = (title && title[2]) || match[3] || "";
    const link = buildLink(linkText, target[2]);
    addLink(link, match[1].toLowerCase() === "img", false);
  }

  return {
    relativeLinks,
    absoluteLinks,
    absoluteImageLinks,
    relativeImageLinks,
    unHandledLinkTypes,
  };
};
149
+
150
/**
 * Reads one markdown/HTML file and collects everything needed to check its
 * links: heading anchors, HTML `id` anchors and the links it contains.
 *
 * @param {string} file - Path of the file to read (UTF-8).
 * @param {string} slugifyApproach - Passed through to processHeading.
 * @returns {Promise<object|null>} An object with `page_file`,
 *   `anchors_auto_headings`, `anchors_tag_ids`, the link lists filled by
 *   processMarkdownLink and an empty `allErrors` list; null when the file
 *   could not be processed (the error is logged, not thrown).
 */
const processFile = async (file, slugifyApproach) => {
  try {
    const contents = await fs.promises.readFile(file, "utf8");
    const anchors = [];
    const htmlAnchors = [];
    const relativeLinks = [];
    const absoluteLinks = [];
    const absoluteImageLinks = [];
    const relativeImageLinks = [];
    const unHandledLinkTypes = [];
    const allErrors = [];

    for (const line of contents.split(/\r?\n/)) {
      const heading = processHeading(line, slugifyApproach);
      if (heading) {
        anchors.push(heading);
      }

      // Appends to the lists passed in; the return value is not needed here.
      processMarkdownLink(
        line,
        relativeLinks,
        relativeImageLinks,
        absoluteLinks,
        absoluteImageLinks,
        unHandledLinkTypes
      );
    }

    // Any opening tag with an `id` attribute. The attribute must be preceded
    // by whitespace so that `data-id=` or `valid=` are not mistaken for it,
    // and tag names may contain digits so `<h2 id="...">` is included.
    const idAttributeRegex = /<[a-z][^>]*?\sid\s*=\s*(["'])(.*?)\1/gi;
    for (const match of contents.matchAll(idAttributeRegex)) {
      const id = match[2];
      if (id) {
        htmlAnchors.push(id);
      }
    }

    return {
      page_file: file,
      anchors_auto_headings: anchors,
      anchors_tag_ids: htmlAnchors,
      relativeLinks,
      absoluteLinks,
      absoluteImageLinks,
      relativeImageLinks,
      unHandledLinkTypes,
      allErrors,
    };
  } catch (err) {
    // Best effort: report the problem and let the caller skip this file.
    console.error(`Error processing file ${file}: ${err.message}`);
    console.error(err);
    return null;
  }
};
220
+
221
/**
 * Walks a directory tree and processes every markdown and HTML file in it.
 * Entries are handled one at a time, descending into sub-directories as they
 * are met.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} slugifyApproach - Passed through to processFile.
 * @returns {Promise<object[]>} The non-null processFile results, in traversal order.
 */
const processDirectory = async (dir, slugifyApproach) => {
  const entries = await fs.promises.readdir(dir, { withFileTypes: true });
  const collected = [];
  for (const entry of entries) {
    const entryPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      const nested = await processDirectory(entryPath, slugifyApproach);
      collected.push(...nested);
      continue;
    }
    if (!isMarkdown(entryPath) && !isHtml(entryPath)) {
      continue;
    }
    const pageInfo = await processFile(entryPath, slugifyApproach);
    if (pageInfo) {
      collected.push(pageInfo);
    }
  }
  return collected;
};
238
+
239
/**
 * Checks every relative link of every processed page and records problems in
 * `results.allErrors` (created on the array when missing).
 *
 * Error types recorded:
 *  - "InternalLocalMissingAnchor": `#anchor` link with no matching heading or
 *    tag id on the same page;
 *  - "InternalLinkToHTML": link to `x.html` where only `x.md` was processed
 *    (only looked for when `options.tryMarkdownforHTML` is truthy);
 *  - "InternalLinkMissingFile": link target is not one of the processed pages;
 *  - "InternalMissingAnchor": target page exists but lacks the anchor.
 *
 * @param {object[]} results - Page objects as returned by processFile; the
 *   array itself is mutated by adding/filling its `allErrors` property.
 * @returns {void}
 */
function processRelativeLinks(results) {
  if (!results.allErrors) {
    results["allErrors"] = [];
  }

  // Index pages by absolute path once. Both sides of the lookup go through
  // path.resolve so pages are still found when the scanned directory was
  // given as a relative path. The first page wins on duplicates.
  const pagesByAbsolutePath = new Map();
  for (const page of results) {
    if (page && typeof page.page_file === "string") {
      const absolutePath = path.resolve(page.page_file);
      if (!pagesByAbsolutePath.has(absolutePath)) {
        pagesByAbsolutePath.set(absolutePath, page);
      }
    }
  }

  // True when the page exposes the anchor as a heading slug or a tag id.
  const pageHasAnchor = (page, anchor) =>
    page.anchors_auto_headings.includes(anchor) ||
    page.anchors_tag_ids.includes(anchor);

  for (const page of results) {
    for (const link of page.relativeLinks) {
      if (link.linkUrl === "") {
        // Page-local link: only the anchor needs to exist on this page.
        if (!pageHasAnchor(page, link.linkAnchor)) {
          results.allErrors.push({
            type: "InternalLocalMissingAnchor",
            page: `${page.page_file}`,
            linkAnchor: `${link.linkAnchor}`,
            linkText: `${link.linkText}`,
          });
        }
        continue;
      }

      // Relative link to another page: resolve it against the linking page.
      const linkAbsoluteFilePath = path.resolve(
        path.dirname(page.page_file),
        link.linkUrl
      );
      let linkedFile = pagesByAbsolutePath.get(linkAbsoluteFilePath) || null;

      if (
        !linkedFile &&
        options.tryMarkdownforHTML &&
        linkAbsoluteFilePath.endsWith(".html")
      ) {
        // Possibly an extension mistake (.html instead of .md). Only the
        // trailing ".html" is swapped, wherever else ".html" may appear.
        const markdownAbsoluteFilePath = `${linkAbsoluteFilePath.slice(
          0,
          -".html".length
        )}.md`;
        const markdownPage =
          pagesByAbsolutePath.get(markdownAbsoluteFilePath) || null;
        if (markdownPage) {
          results.allErrors.push({
            type: "InternalLinkToHTML",
            page: `${page.page_file}`,
            linkUrl: `${link.linkUrl}`,
            linkText: `${link.linkText}`,
            linkUrlFilePath: `${linkAbsoluteFilePath}`,
          });
          // Keep checking the anchor against the markdown page.
          linkedFile = markdownPage;
        }
      }

      if (!linkedFile) {
        // Not found as .html or .md.
        results.allErrors.push({
          type: "InternalLinkMissingFile",
          page: `${page.page_file}`,
          linkUrl: `${link.linkUrl}`,
          linkText: `${link.linkText}`,
          linkUrlFilePath: `${linkAbsoluteFilePath}`,
        });
        continue;
      }

      if (!link.linkAnchor || pageHasAnchor(linkedFile, link.linkAnchor)) {
        continue; // no anchor to verify, or the anchor exists
      }

      // The page exists but the anchor does not.
      results.allErrors.push({
        type: "InternalMissingAnchor",
        page: `${page.page_file}`,
        linkAnchor: `${link.linkAnchor}`,
        linkUrl: `${link.linkUrl}`,
        linkText: `${link.linkText}`,
        linkUrlFilePath: `${linkAbsoluteFilePath}`,
      });
    }
  }
}
367
+
368
/**
 * Prints the collected link errors to stdout, grouped by page. Pages appear
 * in the order their first error was recorded; within a page, errors are
 * sorted by type. Each page path is printed on its own line, followed by one
 * line per error.
 *
 * @param {object} results - Object (or array) carrying an `allErrors` list of
 *   error records, each with at least `type` and `page`.
 * @returns {void}
 */
function outputResults(results) {
  // A Map keeps insertion order and is safe for any page name.
  const errorsByPage = new Map();
  for (const error of results.allErrors || []) {
    if (!errorsByPage.has(error.page)) {
      errorsByPage.set(error.page, []);
    }
    errorsByPage.get(error.page).push(error);
  }

  for (const [page, pageErrors] of errorsByPage) {
    // Sort once per page, after grouping is complete.
    pageErrors.sort((a, b) => a.type.localeCompare(b.type));

    console.log(`${page}`);
    for (const error of pageErrors) {
      switch (error.type) {
        case "InternalLinkMissingFile":
          console.log(` ${error.type}: ${error.linkUrl}`);
          break;
        case "InternalLocalMissingAnchor":
          // Anchor missing on the page that contains the link.
          console.log(
            ` ${error.type}: #${error.linkAnchor} (not found in current file)`
          );
          break;
        case "InternalMissingAnchor":
          // Anchor missing in a linked file that does exist.
          console.log(
            ` ${error.type}: #${error.linkAnchor} not found in ${error.linkUrlFilePath}`
          );
          break;
        case "InternalLinkToHTML":
          console.log(` ${error.type}: ${error.linkUrl} (should be ".md"?)`);
          break;
        default:
          // Unknown error type: dump the raw record.
          console.log(error);
      }
    }
  }
}
431
+
432
// Entry point: scan the configured directory, check the relative links of
// every page found, then print the errors grouped by page.
(async () => {
  try {
    const results = await processDirectory(
      options.directory,
      options.headingAnchorSlugify
    );

    processRelativeLinks(results);
    outputResults(results);
  } catch (err) {
    // e.g. the directory does not exist or cannot be read. Report it and
    // signal failure instead of leaving an unhandled promise rejection.
    console.error(`Error checking links in ${options.directory}: ${err.message}`);
    process.exitCode = 1;
  }
})();
446
+
447
+ //OpenQuestions
448
+ // Handle page link to #itself
package/package.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "name": "markdown_link_checker_sc",
3
+ "version": "0.0.1",
4
+ "description": "Markdown Link Checker",
5
+ "main": "index.js",
6
+ "scripts": {
7
+ "test": "test"
8
+ },
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "git+https://github.com/hamishwillee/markdown_link_checker_sc.git"
12
+ },
13
+ "keywords": [
14
+ "link",
15
+ "checker",
16
+ "markdown"
17
+ ],
18
+ "author": "Hamish Willee",
19
+ "license": "MIT",
20
+ "bugs": {
21
+ "url": "https://github.com/hamishwillee/markdown_link_checker_sc/issues"
22
+ },
23
+ "homepage": "https://github.com/hamishwillee/markdown_link_checker_sc#readme",
24
+ "dependencies": {
25
+ "commander": "^10.0.0"
26
+ }
27
+ }