w3c-validate-html 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +4 -11
  2. package/index.js +82 -6
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -124,21 +124,14 @@ jobs:
124
124
  html-validate:
125
125
  runs-on: ubuntu-latest
126
126
  steps:
127
- - uses: actions/checkout@v4
128
127
  - uses: actions/setup-node@v4
129
128
  with:
130
129
  node-version: 18
131
130
 
132
- - run: npm ci
133
- - run: npm start &
134
-
135
- - run: |
136
- for i in {1..30}; do
137
- curl -fsS http://localhost:8080 >/dev/null && break
138
- sleep 1
139
- done
140
-
141
- - run: npx w3c-validate-html --url http://localhost:8080 --depth 3 --concurrency 4 --errors-only --json > html-report.json
131
+ - name: validate url
132
+ env:
133
+ TARGET_URL: https://example.com
134
+ run: npx w3c-validate-html --target "$TARGET_URL" --depth 2 --errors-only --json > html-report.json
142
135
 
143
136
  - uses: actions/upload-artifact@v4
144
137
  with:
package/index.js CHANGED
@@ -154,14 +154,49 @@ function toList(v) {
154
154
  */
155
155
  function toSafeName(href) {
156
156
  var s = String(href || '');
157
+ var out = '';
158
+ var i;
159
+ var ch;
160
+
157
161
  s = s.replace(/^https?:\/\//i, '');
158
- s = s.replace(/[?#].*$/, '');
159
162
  s = s.replace(/\/+/g, '/');
160
- s = s.replace(/[^a-z0-9/._-]+/gi, '_');
161
- s = s.replace(/\//g, '_');
162
- if (!s) { s = 'index.html'; }
163
- if (!/\.html?$/i.test(s)) { s += '.html'; }
164
- return s;
163
+
164
+ /* convert url chars to filename safe chars without stripping query */
165
+ for (i = 0; i < s.length; i++) {
166
+ ch = s.charAt(i);
167
+
168
+ /* keep common safe chars */
169
+ if (/[a-z0-9]/i.test(ch) || ch === '/' || ch === '.' || ch === '_' || ch === '-') {
170
+ out += ch;
171
+ }
172
+ /* map separators to readable tokens */
173
+ else if (ch === '?') {
174
+ out += '__q__';
175
+ }
176
+ else if (ch === '&') {
177
+ out += '__and__';
178
+ }
179
+ else if (ch === '=') {
180
+ out += '__eq__';
181
+ }
182
+ else if (ch === '#') {
183
+ out += '__hash__';
184
+ }
185
+ /* everything else becomes underscore */
186
+ else {
187
+ out += '_';
188
+ }
189
+ }
190
+
191
+ out = out.replace(/\/+/g, '/');
192
+ out = out.replace(/_+/g, '_');
193
+ out = out.replace(/\//g, '_');
194
+ out = out.replace(/^_+|_+$/g, '');
195
+
196
+ if (!out) { out = 'index.html'; }
197
+ if (!/\.html?$/i.test(out)) { out += '.html'; }
198
+
199
+ return out;
165
200
  }
166
201
 
167
202
  /**
@@ -389,6 +424,15 @@ function isCrawlable(href, cfg, origin) {
389
424
  return false;
390
425
  }
391
426
 
427
+ // skip common non-HTML file types (e.g., pdf, zip, docx, etc)
428
+ if (/\.(pdf|zip|docx?|xlsx?|pptx?|jpg|jpeg|png|gif|svg|mp3|mp4|avi|mov|wmv|exe|dmg|tar|gz|rar|7z)(\?|#|$)/i.test(href)) {
429
+ return false;
430
+ }
431
+ // skip links that look like downloads
432
+ if (/download(=|\b|\/|\.)|attachment(=|\b|\/|\.)|file(=|\b|\/|\.)/i.test(href)) {
433
+ return false;
434
+ }
435
+
392
436
  if (cfg && cfg.sameOrigin) {
393
437
  try {
394
438
  if (new URL(href).origin !== origin) {
@@ -432,6 +476,12 @@ async function fetchHtml(pageUrl, cfg) {
432
476
  throw new Error('request failed ' + res.status + ' ' + pageUrl);
433
477
  }
434
478
 
479
+ // Only process HTML or XHTML
480
+ var contentType = res.headers.get('content-type') || '';
481
+ if (!/text\/html|application\/xhtml\+xml/i.test(contentType)) {
482
+ return null;
483
+ }
484
+
435
485
  var finalUrl = (res.url && String(res.url)) ? String(res.url) : pageUrl;
436
486
  var html = await res.text();
437
487
 
@@ -560,7 +610,19 @@ async function asyncPool(items, concurrency, worker) {
560
610
  * @returns {Promise<{url:string,ok:boolean,errors:Array,warnings:Array,finalUrl:string,links:Array}>} - Result
561
611
  */
562
612
  async function validateOneUrl(pageUrl, cfg, tmpDir) {
613
+
563
614
  var fetched = await fetchHtml(pageUrl, cfg);
615
+ if (!fetched) {
616
+ // Not HTML, skip crawling and validation
617
+ return {
618
+ url: pageUrl,
619
+ finalUrl: pageUrl,
620
+ ok: true,
621
+ errors: [],
622
+ warnings: [],
623
+ links: []
624
+ };
625
+ }
564
626
  var finalUrl = fetched.finalUrl;
565
627
  var html = fetched.html;
566
628
 
@@ -973,11 +1035,25 @@ if (require.main === module) {
973
1035
  userAgent: argv['user-agent']
974
1036
  };
975
1037
 
1038
+ const startTime = Date.now();
976
1039
  validate(target, cfg).then(function (summary) {
977
1040
  if (argv.json) {
978
1041
  try { console.log(JSON.stringify(summary)); }
979
1042
  catch (e) { console.error('{"error":"failed to stringify results"}'); }
980
1043
  }
1044
+
1045
+ // Jasmine-style summary (simplified)
1046
+ const total = summary.passed + summary.failed;
1047
+ const duration = ((Date.now() - startTime) / 1000).toFixed(3);
1048
+ console.log('\nSummary:');
1049
+ if (summary.failed === 0) {
1050
+ console.log('\nšŸ‘Š Passed');
1051
+ } else {
1052
+ console.log('\nāŒ Failed');
1053
+ }
1054
+ console.log('Pages: ' + summary.passed + ' of ' + total);
1055
+ console.log('Errors: ' + summary.failed);
1056
+ console.log('Finished in ' + duration + ' seconds');
981
1057
  process.exit(summary.failed > 0 ? 1 : 0);
982
1058
  })
983
1059
  .catch(function (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "w3c-validate-html",
3
- "version": "1.0.1",
3
+ "version": "1.1.0",
4
4
  "description": "Validate HTML offline using the official W3C vnu.jar",
5
5
  "type": "commonjs",
6
6
  "main": "index.js",