@nuasite/checks 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/dist/types/check-runner.d.ts +16 -0
  2. package/dist/types/check-runner.d.ts.map +1 -0
  3. package/dist/types/checks/accessibility/aria-landmarks-check.d.ts +3 -0
  4. package/dist/types/checks/accessibility/aria-landmarks-check.d.ts.map +1 -0
  5. package/dist/types/checks/accessibility/form-label-check.d.ts +3 -0
  6. package/dist/types/checks/accessibility/form-label-check.d.ts.map +1 -0
  7. package/dist/types/checks/accessibility/index.d.ts +6 -0
  8. package/dist/types/checks/accessibility/index.d.ts.map +1 -0
  9. package/dist/types/checks/accessibility/lang-attribute-check.d.ts +3 -0
  10. package/dist/types/checks/accessibility/lang-attribute-check.d.ts.map +1 -0
  11. package/dist/types/checks/accessibility/link-text-check.d.ts +3 -0
  12. package/dist/types/checks/accessibility/link-text-check.d.ts.map +1 -0
  13. package/dist/types/checks/accessibility/tabindex-check.d.ts +3 -0
  14. package/dist/types/checks/accessibility/tabindex-check.d.ts.map +1 -0
  15. package/dist/types/checks/geo/agents-md-check.d.ts.map +1 -0
  16. package/dist/types/checks/geo/content-quality-check.d.ts +4 -0
  17. package/dist/types/checks/geo/content-quality-check.d.ts.map +1 -0
  18. package/dist/types/checks/geo/index.d.ts +3 -0
  19. package/dist/types/checks/geo/index.d.ts.map +1 -0
  20. package/dist/types/checks/geo/llms-txt-check.d.ts +3 -0
  21. package/dist/types/checks/geo/llms-txt-check.d.ts.map +1 -0
  22. package/dist/types/checks/performance/html-size-check.d.ts +3 -0
  23. package/dist/types/checks/performance/html-size-check.d.ts.map +1 -0
  24. package/dist/types/checks/performance/image-optimization-check.d.ts +4 -0
  25. package/dist/types/checks/performance/image-optimization-check.d.ts.map +1 -0
  26. package/dist/types/checks/performance/index.d.ts +7 -0
  27. package/dist/types/checks/performance/index.d.ts.map +1 -0
  28. package/dist/types/checks/performance/inline-size-check.d.ts +3 -0
  29. package/dist/types/checks/performance/inline-size-check.d.ts.map +1 -0
  30. package/dist/types/checks/performance/lazy-loading-check.d.ts +3 -0
  31. package/dist/types/checks/performance/lazy-loading-check.d.ts.map +1 -0
  32. package/dist/types/checks/performance/render-blocking-check.d.ts +3 -0
  33. package/dist/types/checks/performance/render-blocking-check.d.ts.map +1 -0
  34. package/dist/types/checks/performance/total-requests-check.d.ts +3 -0
  35. package/dist/types/checks/performance/total-requests-check.d.ts.map +1 -0
  36. package/dist/types/checks/seo/broken-internal-links-check.d.ts +3 -0
  37. package/dist/types/checks/seo/broken-internal-links-check.d.ts.map +1 -0
  38. package/dist/types/checks/seo/canonical-check.d.ts +5 -0
  39. package/dist/types/checks/seo/canonical-check.d.ts.map +1 -0
  40. package/dist/types/checks/seo/description-check.d.ts +4 -0
  41. package/dist/types/checks/seo/description-check.d.ts.map +1 -0
  42. package/dist/types/checks/seo/heading-hierarchy-check.d.ts +5 -0
  43. package/dist/types/checks/seo/heading-hierarchy-check.d.ts.map +1 -0
  44. package/dist/types/checks/seo/image-alt-check.d.ts +3 -0
  45. package/dist/types/checks/seo/image-alt-check.d.ts.map +1 -0
  46. package/dist/types/checks/seo/image-alt-quality-check.d.ts +3 -0
  47. package/dist/types/checks/seo/image-alt-quality-check.d.ts.map +1 -0
  48. package/dist/types/checks/seo/index.d.ts +15 -0
  49. package/dist/types/checks/seo/index.d.ts.map +1 -0
  50. package/dist/types/checks/seo/json-ld-check.d.ts +3 -0
  51. package/dist/types/checks/seo/json-ld-check.d.ts.map +1 -0
  52. package/dist/types/checks/seo/meta-duplicates-check.d.ts +3 -0
  53. package/dist/types/checks/seo/meta-duplicates-check.d.ts.map +1 -0
  54. package/dist/types/checks/seo/noindex-check.d.ts +3 -0
  55. package/dist/types/checks/seo/noindex-check.d.ts.map +1 -0
  56. package/dist/types/checks/seo/open-graph-check.d.ts +5 -0
  57. package/dist/types/checks/seo/open-graph-check.d.ts.map +1 -0
  58. package/dist/types/checks/seo/sitemap-robots-check.d.ts +4 -0
  59. package/dist/types/checks/seo/sitemap-robots-check.d.ts.map +1 -0
  60. package/dist/types/checks/seo/title-check.d.ts +5 -0
  61. package/dist/types/checks/seo/title-check.d.ts.map +1 -0
  62. package/dist/types/checks/seo/twitter-card-check.d.ts +3 -0
  63. package/dist/types/checks/seo/twitter-card-check.d.ts.map +1 -0
  64. package/dist/types/checks/seo/viewport-check.d.ts +3 -0
  65. package/dist/types/checks/seo/viewport-check.d.ts.map +1 -0
  66. package/dist/types/checks-integration.d.ts +4 -0
  67. package/dist/types/checks-integration.d.ts.map +1 -0
  68. package/dist/types/config.d.ts +3 -0
  69. package/dist/types/config.d.ts.map +1 -0
  70. package/dist/types/html-analyzer.d.ts +11 -0
  71. package/dist/types/html-analyzer.d.ts.map +1 -0
  72. package/dist/types/i18n/poor-texts.d.ts +14 -0
  73. package/dist/types/i18n/poor-texts.d.ts.map +1 -0
  74. package/dist/types/index.d.ts +4 -0
  75. package/dist/types/index.d.ts.map +1 -0
  76. package/dist/types/register.d.ts +4 -0
  77. package/dist/types/register.d.ts.map +1 -0
  78. package/dist/types/report.d.ts +5 -0
  79. package/dist/types/report.d.ts.map +1 -0
  80. package/dist/types/tsconfig.tsbuildinfo +1 -0
  81. package/dist/types/types.d.ts +220 -0
  82. package/dist/types/types.d.ts.map +1 -0
  83. package/package.json +49 -0
  84. package/src/check-runner.ts +92 -0
  85. package/src/checks/accessibility/aria-landmarks-check.ts +23 -0
  86. package/src/checks/accessibility/form-label-check.ts +29 -0
  87. package/src/checks/accessibility/index.ts +5 -0
  88. package/src/checks/accessibility/lang-attribute-check.ts +22 -0
  89. package/src/checks/accessibility/link-text-check.ts +30 -0
  90. package/src/checks/accessibility/tabindex-check.ts +28 -0
  91. package/src/checks/geo/content-quality-check.ts +48 -0
  92. package/src/checks/geo/index.ts +2 -0
  93. package/src/checks/geo/llms-txt-check.ts +37 -0
  94. package/src/checks/performance/html-size-check.ts +26 -0
  95. package/src/checks/performance/image-optimization-check.ts +82 -0
  96. package/src/checks/performance/index.ts +6 -0
  97. package/src/checks/performance/inline-size-check.ts +29 -0
  98. package/src/checks/performance/lazy-loading-check.ts +29 -0
  99. package/src/checks/performance/render-blocking-check.ts +27 -0
  100. package/src/checks/performance/total-requests-check.ts +27 -0
  101. package/src/checks/seo/broken-internal-links-check.ts +49 -0
  102. package/src/checks/seo/canonical-check.ts +77 -0
  103. package/src/checks/seo/description-check.ts +55 -0
  104. package/src/checks/seo/heading-hierarchy-check.ts +76 -0
  105. package/src/checks/seo/image-alt-check.ts +28 -0
  106. package/src/checks/seo/image-alt-quality-check.ts +31 -0
  107. package/src/checks/seo/index.ts +14 -0
  108. package/src/checks/seo/json-ld-check.ts +26 -0
  109. package/src/checks/seo/meta-duplicates-check.ts +44 -0
  110. package/src/checks/seo/noindex-check.ts +22 -0
  111. package/src/checks/seo/open-graph-check.ts +55 -0
  112. package/src/checks/seo/sitemap-robots-check.ts +55 -0
  113. package/src/checks/seo/title-check.ts +63 -0
  114. package/src/checks/seo/twitter-card-check.ts +22 -0
  115. package/src/checks/seo/viewport-check.ts +22 -0
  116. package/src/checks-integration.ts +126 -0
  117. package/src/config.ts +27 -0
  118. package/src/html-analyzer.ts +325 -0
  119. package/src/i18n/poor-texts.ts +66 -0
  120. package/src/index.ts +22 -0
  121. package/src/register.ts +110 -0
  122. package/src/report.ts +78 -0
  123. package/src/tsconfig.json +6 -0
  124. package/src/types.ts +244 -0
@@ -0,0 +1,55 @@
1
+ import type { Check, CheckIssue, PageCheckContext } from '../../types'
2
+
3
/**
 * Builds the page-level SEO check that reports pages without an `og:title` meta tag.
 *
 * @returns A non-essential, warning-level page check with id `seo/og-title`.
 */
export function createOgTitleCheck(): Check {
  const run = (ctx: PageCheckContext): CheckIssue[] => {
    const hasOgTitle = Boolean(ctx.pageData.openGraph.title)
    if (hasOgTitle) return []
    return [{
      message: 'Page is missing og:title meta tag',
      suggestion: 'Add <meta property="og:title" content="..."> inside <head>',
    }]
  }

  return {
    kind: 'page',
    id: 'seo/og-title',
    name: 'Open Graph Title',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: 'Pages should have an og:title meta tag',
    essential: false,
    run,
  }
}
20
+
21
/**
 * Builds the page-level SEO check that reports pages without an `og:description` meta tag.
 *
 * @returns A non-essential, warning-level page check with id `seo/og-description`.
 */
export function createOgDescriptionCheck(): Check {
  return {
    kind: 'page',
    id: 'seo/og-description',
    name: 'Open Graph Description',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: 'Pages should have an og:description meta tag',
    essential: false,
    run(ctx: PageCheckContext): CheckIssue[] {
      const issues: CheckIssue[] = []
      const ogDescription = ctx.pageData.openGraph.description
      if (!ogDescription) {
        issues.push({
          message: 'Page is missing og:description meta tag',
          suggestion: 'Add <meta property="og:description" content="..."> inside <head>',
        })
      }
      return issues
    },
  }
}
38
+
39
/**
 * Builds the page-level SEO check that reports pages without an `og:image` meta tag.
 *
 * @returns A non-essential, warning-level page check with id `seo/og-image`.
 */
export function createOgImageCheck(): Check {
  const check: Check = {
    kind: 'page',
    id: 'seo/og-image',
    name: 'Open Graph Image',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: 'Pages should have an og:image meta tag',
    essential: false,
    run(ctx: PageCheckContext): CheckIssue[] {
      // A truthy entry means the tag was present with non-empty content.
      return ctx.pageData.openGraph.image
        ? []
        : [{ message: 'Page is missing og:image meta tag', suggestion: 'Add <meta property="og:image" content="..."> inside <head>' }]
    },
  }
  return check
}
@@ -0,0 +1,55 @@
1
+ import fs from 'node:fs/promises'
2
+ import path from 'node:path'
3
+ import type { SiteCheck, SiteCheckContext, SiteCheckIssue } from '../../types'
4
+
5
/**
 * Builds the site-level SEO check that verifies a `robots.txt` file exists in the build output.
 *
 * Any failure to access the file (missing, unreadable, ...) is reported as "missing".
 *
 * @returns A non-essential, warning-level site check with id `seo/robots-txt`.
 */
export function createRobotsTxtCheck(): SiteCheck {
  async function run(ctx: SiteCheckContext): Promise<SiteCheckIssue[]> {
    let present = false
    try {
      await fs.access(path.join(ctx.distDir, 'robots.txt'))
      present = true
    } catch {
      // Treated as "not present"; reported below.
    }
    if (present) return []

    return [{
      message: 'Site is missing a robots.txt file',
      suggestion: 'Add a robots.txt file to the public directory or use an Astro integration to generate one',
      pagePath: '/robots.txt',
    }]
  }

  return {
    kind: 'site',
    id: 'seo/robots-txt',
    name: 'robots.txt Present',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: 'Site should have a robots.txt file',
    essential: false,
    run,
  }
}
28
+
29
/**
 * Builds the site-level SEO check that verifies a sitemap exists in the build output.
 *
 * The check passes as soon as one of `sitemap-index.xml`, `sitemap-0.xml` or `sitemap.xml`
 * is accessible in the dist directory; the candidates are probed in that order.
 *
 * @returns A non-essential, warning-level site check with id `seo/sitemap-xml`.
 */
export function createSitemapXmlCheck(): SiteCheck {
  /** Resolves to true when `fileName` can be accessed inside the context's dist directory. */
  const isAccessible = async (ctx: SiteCheckContext, fileName: string): Promise<boolean> => {
    try {
      await fs.access(path.join(ctx.distDir, fileName))
      return true
    } catch {
      return false
    }
  }

  return {
    kind: 'site',
    id: 'seo/sitemap-xml',
    name: 'Sitemap Present',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: 'Site should have a sitemap',
    essential: false,
    async run(ctx: SiteCheckContext): Promise<SiteCheckIssue[]> {
      const candidateNames = ['sitemap-index.xml', 'sitemap-0.xml', 'sitemap.xml']
      for (const candidate of candidateNames) {
        if (await isAccessible(ctx, candidate)) return []
      }
      return [{
        message: 'Site is missing a sitemap',
        suggestion: 'Add @astrojs/sitemap to generate a sitemap automatically',
        pagePath: '/sitemap-index.xml',
      }]
    },
  }
}
@@ -0,0 +1,63 @@
1
+ import type { Check, CheckIssue, PageCheckContext } from '../../types'
2
+
3
/**
 * Builds the page-level SEO check that reports pages with no extracted `<title>` element.
 *
 * @returns An essential, error-level page check with id `seo/title-missing`.
 */
export function createTitleMissingCheck(): Check {
  const run = (ctx: PageCheckContext): CheckIssue[] => {
    if (ctx.pageData.title) return []
    return [{
      message: 'Page is missing a <title> element',
      suggestion: 'Add a <title> tag inside <head>',
    }]
  }

  return {
    kind: 'page',
    id: 'seo/title-missing',
    name: 'Title Present',
    domain: 'seo',
    defaultSeverity: 'error',
    description: 'Every page must have a <title> element',
    essential: true,
    run,
  }
}
20
+
21
/**
 * Builds the page-level SEO check that reports a `<title>` element whose text is empty.
 *
 * Pages without any title produce no issue here.
 *
 * @returns An essential, error-level page check with id `seo/title-empty`.
 */
export function createTitleEmptyCheck(): Check {
  return {
    kind: 'page',
    id: 'seo/title-empty',
    name: 'Title Not Empty',
    domain: 'seo',
    defaultSeverity: 'error',
    description: 'Page title must not be empty',
    essential: true,
    run(ctx: PageCheckContext): CheckIssue[] {
      const title = ctx.pageData.title
      if (!title) return []
      if (title.content.length !== 0) return []
      return [{
        message: 'Page title is empty',
        suggestion: 'Add meaningful text to the <title> element',
        line: title.line,
      }]
    },
  }
}
38
+
39
/**
 * Builds the page-level SEO check that reports titles longer than `maxLength`.
 *
 * A title of exactly `maxLength` characters passes; length is measured with `String.length`.
 *
 * @param maxLength Largest accepted title length.
 * @returns An essential, warning-level page check with id `seo/title-length`.
 */
export function createTitleLengthCheck(maxLength: number): Check {
  const run = (ctx: PageCheckContext): CheckIssue[] => {
    const title = ctx.pageData.title
    if (!title) return []

    const titleText = title.content
    const withinLimit = !(titleText.length > maxLength)
    if (withinLimit) return []

    const issue: CheckIssue = {
      message: `Title is ${titleText.length} characters (max: ${maxLength})`,
      suggestion: `Shorten the title to under ${maxLength} characters`,
      line: title.line,
      actual: titleText,
      expected: `<= ${maxLength} characters`,
    }
    return [issue]
  }

  return {
    kind: 'page',
    id: 'seo/title-length',
    name: 'Title Length',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: `Title should be under ${maxLength} characters`,
    essential: true,
    run,
  }
}
@@ -0,0 +1,22 @@
1
+ import type { Check, CheckIssue, PageCheckContext } from '../../types'
2
+
3
/**
 * Builds the page-level SEO check that reports pages without a `twitter:card` meta tag.
 *
 * @returns A non-essential, info-level page check with id `seo/twitter-card`.
 */
export function createTwitterCardCheck(): Check {
  const run = (ctx: PageCheckContext): CheckIssue[] => {
    const hasCard = Boolean(ctx.pageData.twitterCard.card)
    return hasCard
      ? []
      : [{
        message: 'Page is missing twitter:card meta tag',
        suggestion: 'Add <meta name="twitter:card" content="summary_large_image"> inside <head>',
      }]
  }

  return {
    kind: 'page',
    id: 'seo/twitter-card',
    name: 'Twitter Card',
    domain: 'seo',
    defaultSeverity: 'info',
    description: 'Pages should have a twitter:card meta tag for rich social sharing',
    essential: false,
    run,
  }
}
@@ -0,0 +1,22 @@
1
+ import type { Check, CheckIssue, PageCheckContext } from '../../types'
2
+
3
/**
 * Builds the page-level check that reports pages whose extracted data has `hasViewport` unset.
 *
 * @returns An essential, warning-level page check with id `seo/viewport-missing`.
 */
export function createViewportMissingCheck(): Check {
  return {
    kind: 'page',
    id: 'seo/viewport-missing',
    name: 'Viewport Meta Tag',
    domain: 'seo',
    defaultSeverity: 'warning',
    description: 'Pages should have a viewport meta tag for mobile responsiveness',
    essential: true,
    run(ctx: PageCheckContext): CheckIssue[] {
      if (ctx.pageData.hasViewport) return []
      const issue: CheckIssue = {
        message: 'Page is missing a viewport meta tag',
        suggestion: 'Add <meta name="viewport" content="width=device-width, initial-scale=1"> inside <head>',
      }
      return [issue]
    },
  }
}
@@ -0,0 +1,126 @@
1
+ import type { AstroIntegration } from 'astro'
2
+ import fs from 'node:fs/promises'
3
+ import path from 'node:path'
4
+ import { fileURLToPath } from 'node:url'
5
+ import { CheckRunner } from './check-runner'
6
+ import { resolveChecksOptions } from './config'
7
+ import { analyzeHtml } from './html-analyzer'
8
+ import { registerAllChecks } from './register'
9
+ import { logReport, writeJsonReport } from './report'
10
+ import type { CheckResult, ChecksOptions, ExtractedPageData } from './types'
11
+
12
/**
 * Locate and read the built HTML file for a page.
 *
 * Two layouts are probed in order: `<pathname>/index.html` (directory style) and
 * `<pathname>.html` (file style, with one trailing slash removed from the pathname).
 * A candidate that cannot be read for any reason is skipped.
 *
 * @param distDir Absolute path of the build output directory.
 * @param pathname Page pathname relative to `distDir`.
 * @returns The file content and the path it was read from, or `undefined` when neither candidate is readable.
 */
async function readPageHtml(distDir: string, pathname: string): Promise<{ html: string; filePath: string } | undefined> {
  const directoryStyle = path.join(distDir, pathname, 'index.html')
  const fileStyle = path.join(distDir, `${pathname.replace(/\/$/, '')}.html`)

  for (const filePath of [directoryStyle, fileStyle]) {
    let html: string
    try {
      html = await fs.readFile(filePath, 'utf8')
    } catch {
      // Unreadable candidate: fall through to the next layout.
      continue
    }
    return { html, filePath }
  }
  return undefined
}
31
+
32
/**
 * Astro integration that audits the built site once `astro build` finishes.
 *
 * On `astro:config:done` it records the configured site URL and project root.
 * On `astro:build:done` it:
 *  1. registers the built-in checks plus any custom checks from the options,
 *  2. analyzes every built page in parallel and runs the page-level checks,
 *  3. runs the site-level checks over the collected page data,
 *  4. logs the report and, when `reportJson` is set, writes it as JSON,
 *  5. throws to fail the build according to `failOnError` / `failOnWarning`.
 *
 * @param options User-supplied options; resolved through `resolveChecksOptions`.
 * @returns The integration object to add to the Astro config.
 * @throws Error from the build hook when the configured failure threshold is met.
 */
export const checks = (options: ChecksOptions = {}): AstroIntegration => {
  const resolved = resolveChecksOptions(options)
  const isCI = Boolean(process.env.CI)
  let siteUrl: string | undefined
  let projectRoot: string | undefined

  return {
    name: '@nuasite/checks',
    hooks: {
      'astro:config:done': ({ config }) => {
        siteUrl = config.site
        projectRoot = fileURLToPath(config.root)
      },
      'astro:build:done': async ({ dir, pages, logger }) => {
        const distDir = fileURLToPath(dir)
        const runner = new CheckRunner(resolved, isCI)

        // Built-in checks first, then user-provided ones routed by their kind.
        registerAllChecks(runner, resolved)
        for (const custom of resolved.customChecks) {
          if (custom.kind !== 'site') {
            runner.registerCheck(custom)
          } else {
            runner.registerSiteCheck(custom)
          }
        }

        // Analyze one page and run its checks; resolves to null when no HTML output exists.
        const auditPage = async (page: (typeof pages)[number]) => {
          const pagePath = `/${page.pathname}`.replace(/\/+/g, '/')
          const source = await readPageHtml(distDir, page.pathname)
          if (!source) {
            logger.warn(`Skipping ${page.pathname}; no HTML output found.`)
            return null
          }

          const { root, pageData } = analyzeHtml(source.html)
          const results = await runner.runPageChecks({
            pagePath,
            filePath: source.filePath,
            distDir,
            html: source.html,
            root,
            pageData,
          })
          return { pagePath, pageData, results }
        }

        // All pages are processed concurrently.
        const audited = await Promise.all(pages.map((page) => auditPage(page)))

        const allResults: CheckResult[] = []
        const pagesData = new Map<string, ExtractedPageData>()
        for (const entry of audited) {
          if (entry === null) continue
          pagesData.set(entry.pagePath, entry.pageData)
          allResults.push(...entry.results)
        }

        // Site-level checks see the data of every analyzed page.
        const siteResults = await runner.runSiteChecks({
          distDir,
          projectRoot: projectRoot ?? process.cwd(),
          pages: pagesData,
          siteUrl,
        })
        allResults.push(...siteResults)

        const report = runner.generateReport(allResults, pagesData.size)
        logReport(report, logger)

        if (resolved.reportJson) {
          const reportPath = await writeJsonReport(report, distDir, resolved.reportJson)
          logger.info(`JSON report written to ${reportPath}`)
        }

        // Failing happens last so the report is always logged and written first.
        if (resolved.failOnError && report.errors.length > 0) {
          throw new Error(
            `@nuasite/checks: ${report.errors.length} error(s) found. Set failOnError: false to continue.`,
          )
        }
        if (resolved.failOnWarning) {
          const hasFindings = report.errors.length > 0 || report.warnings.length > 0
          if (hasFindings) {
            throw new Error(
              `@nuasite/checks: ${report.errors.length} error(s) and ${report.warnings.length} warning(s) found.`,
            )
          }
        }
      },
    },
  }
}
package/src/config.ts ADDED
@@ -0,0 +1,27 @@
1
+ import type { ChecksOptions, ResolvedChecksOptions } from './types'
2
+
3
/**
 * Normalize a "boolean or options object" setting.
 *
 * @param value `false` disables the feature, `true`/`undefined` selects the defaults, an object is used as-is.
 * @param defaultOptions Options returned when the feature is enabled without explicit options.
 * @returns `false` when disabled, otherwise the effective options object.
 */
function resolveOption<T extends object>(value: boolean | T | undefined, defaultOptions: T = {} as T): T | false {
  const useDefaults = value === undefined || value === true
  if (useDefaults) return defaultOptions
  return value === false ? false : value
}
8
+
9
/**
 * Fill in defaults for every integration option.
 *
 * Domain toggles (`seo`, `geo`, `performance`, `accessibility`) go through `resolveOption`.
 * `reportJson: true` maps to the default file name `checks-report.json`, a string is kept
 * as the file name, and anything else disables the JSON report.
 *
 * @param options Options as supplied by the user.
 * @returns The fully populated options object.
 */
export function resolveChecksOptions(options: ChecksOptions = {}): ResolvedChecksOptions {
  let reportJson: string | false = false
  if (options.reportJson === true) {
    reportJson = 'checks-report.json'
  } else if (typeof options.reportJson === 'string') {
    reportJson = options.reportJson
  }

  const seo = resolveOption(options.seo)
  const geo = resolveOption(options.geo)
  const performance = resolveOption(options.performance)
  const accessibility = resolveOption(options.accessibility)

  return {
    mode: options.mode ?? 'auto',
    seo,
    geo,
    performance,
    accessibility,
    ai: options.ai || false,
    failOnError: options.failOnError ?? true,
    failOnWarning: options.failOnWarning ?? false,
    overrides: options.overrides ?? {},
    customChecks: options.customChecks ?? [],
    reportJson,
  }
}
@@ -0,0 +1,325 @@
1
+ import { type HTMLElement as ParsedHTMLElement, parse } from 'node-html-parser'
2
+ import type {
3
+ ExtractedFormData,
4
+ ExtractedPageData,
5
+ HeadingData,
6
+ ImageData,
7
+ JsonLdData,
8
+ LinkData,
9
+ MetaTagData,
10
+ ScriptData,
11
+ StylesheetData,
12
+ } from './types'
13
+
14
/**
 * Parse HTML and extract all data needed by checks.
 * Parses once per page — all checks receive the same ExtractedPageData.
 *
 * Extraction is split by scope:
 *  - head-scoped data (title, meta tags, canonical, stylesheets) needs a `<head>` element;
 *  - body-scoped data (headings, links, forms) needs a `<body>` element;
 *  - document-wide data (JSON-LD, scripts, inline style size, images) is searched from the
 *    parse root and is therefore collected even for fragments without `<head>`/`<body>`.
 *
 * @param html Raw HTML of one built page.
 * @returns The parsed root element together with the extracted page data.
 */
export function analyzeHtml(html: string): { root: ParsedHTMLElement; pageData: ExtractedPageData } {
  const root = parse(html, {
    lowerCaseTagName: false,
    comment: true,
    blockTextElements: {
      script: true,
      noscript: true,
      style: true,
      pre: true,
    },
  })

  const head = root.querySelector('head')
  const body = root.querySelector('body')
  const htmlElement = root.querySelector('html')
  const lineIndex = buildLineIndex(html)

  // Text length of <main> when present, otherwise of the whole <body>; used by content quality checks.
  const bodyTextLength = (body?.querySelector('main')?.textContent ?? body?.textContent ?? '').trim().length

  const pageData: ExtractedPageData = {
    metaTags: [],
    openGraph: {},
    twitterCard: {},
    jsonLd: [],
    headings: [],
    images: [],
    links: [],
    scripts: [],
    stylesheets: [],
    forms: [],
    htmlLang: htmlElement?.getAttribute('lang') || undefined,
    htmlSize: Buffer.byteLength(html, 'utf8'),
    bodyTextLength,
    hasViewport: false,
    hasNoindex: false,
    inlineScriptBytes: 0,
    inlineStyleBytes: 0,
  }

  // Head-scoped data.
  if (head) {
    pageData.title = extractTitle(head, html, lineIndex)
    pageData.metaTags = extractMetaTags(head, html, lineIndex)
    categorizeMetaTags(pageData)
    pageData.canonical = extractCanonical(head, html, lineIndex)
    pageData.stylesheets = extractStylesheets(head, html, lineIndex)
  }

  // Document-wide data: these extractors query the parse root, so they must not depend on
  // <head> or <body> being present (otherwise fragment pages report no scripts or images).
  pageData.jsonLd = extractJsonLd(root, html, lineIndex)
  pageData.scripts = extractScripts(root, html, lineIndex)
  pageData.images = extractImages(root, html, lineIndex)

  pageData.inlineScriptBytes = pageData.scripts
    .filter(s => s.isInline)
    .reduce((sum, s) => sum + s.size, 0)

  for (const style of root.querySelectorAll('style')) {
    const content = style.textContent ?? ''
    if (content) pageData.inlineStyleBytes += Buffer.byteLength(content, 'utf8')
  }

  // Body-scoped data.
  if (body) {
    pageData.headings = extractHeadings(body, html, lineIndex)
    pageData.links = extractLinks(body, html, lineIndex)
    pageData.forms = extractForms(body, html, lineIndex)
  }

  return { root, pageData }
}
87
+
88
+ // ── Line index for efficient offset → line conversion ──────────────────────────
89
+
90
/**
 * Precompute the start offset of every line for O(log n) offset-to-line lookups.
 *
 * @param html Text to index.
 * @returns Ascending offsets; entry 0 is always 0 and each later entry is the position just after a `\n`.
 */
function buildLineIndex(html: string): number[] {
  const lineStarts = [0]
  let newlineAt = html.indexOf('\n')
  while (newlineAt !== -1) {
    lineStarts.push(newlineAt + 1)
    newlineAt = html.indexOf('\n', newlineAt + 1)
  }
  return lineStarts
}
98
+
99
/**
 * Binary search to convert a character offset to a 1-based line number.
 *
 * @param lineIndex Ascending line start offsets as produced by `buildLineIndex`.
 * @param offset Character offset into the indexed text.
 * @returns The number of line starts that are `<= offset`, i.e. the 1-based line containing the offset.
 */
function offsetToLine(lineIndex: number[], offset: number): number {
  // Half-open search for the first line start that lies beyond `offset`.
  let low = 0
  let high = lineIndex.length
  while (low < high) {
    const middle = (low + high) >>> 1
    if (lineIndex[middle]! <= offset) {
      low = middle + 1
    } else {
      high = middle
    }
  }
  // `low` counts the line starts at or before the offset, which is the 1-based line number.
  return low
}
110
+
111
/**
 * Advancing line finder — tracks position to handle duplicate elements correctly.
 *
 * The returned function searches forward from just past its previous hit, so identical
 * markup queried repeatedly resolves to successive occurrences. When nothing is found
 * ahead of the cursor it falls back to the first occurrence in the text (without moving
 * the cursor), and finally to line 1.
 *
 * @param html Text to search in.
 * @param lineIndex Line start offsets for `html`.
 * @returns A function mapping a search string to a 1-based line number.
 */
function createLineFinder(html: string, lineIndex: number[]) {
  let cursor = 0
  return (search: string): number => {
    const ahead = html.indexOf(search, cursor)
    if (ahead === -1) {
      // Nothing ahead of the cursor: use the first occurrence anywhere, else default to line 1.
      const anywhere = html.indexOf(search)
      return anywhere === -1 ? 1 : offsetToLine(lineIndex, anywhere)
    }
    cursor = ahead + 1
    return offsetToLine(lineIndex, ahead)
  }
}
129
+
130
/**
 * Read the `<title>` element of the document head.
 *
 * @returns The trimmed title text (possibly empty) with the line of the first `<title`
 * occurrence in the source, or `undefined` when the head has no title element.
 */
function extractTitle(head: ParsedHTMLElement, html: string, lineIndex: number[]): ExtractedPageData['title'] {
  const titleEl = head.querySelector('title')
  if (titleEl) {
    const locate = createLineFinder(html, lineIndex)
    const text = titleEl.textContent?.trim() || ''
    return { content: text, line: locate('<title') }
  }
  return undefined
}
137
+
138
/**
 * Collect the `<meta>` tags of the head that carry content and an identifier.
 *
 * Tags without `content`, or with neither `name` nor `property`, are ignored.
 * Lines are located via the first 60 characters of each tag's serialized markup.
 */
function extractMetaTags(head: ParsedHTMLElement, html: string, lineIndex: number[]): MetaTagData[] {
  const locate = createLineFinder(html, lineIndex)
  const collected: MetaTagData[] = []

  head.querySelectorAll('meta').forEach((metaEl) => {
    const name = metaEl.getAttribute('name')
    const property = metaEl.getAttribute('property')
    const content = metaEl.getAttribute('content')

    if (!content) return
    if (!name && !property) return

    collected.push({
      name: name || undefined,
      property: property || undefined,
      content,
      line: locate(metaEl.toString().substring(0, 60)),
    })
  })

  return collected
}
155
+
156
/**
 * Derive the categorized fields of `pageData` from its already extracted `metaTags`.
 *
 * Mutates `pageData` in place:
 *  - `metaDescription` from `name="description"` (the last such tag wins),
 *  - `hasViewport` from `name="viewport"`,
 *  - `hasNoindex` from `name="robots"` whose content contains `noindex` (any letter case),
 *  - `openGraph[key]` for every `property="og:<key>"`,
 *  - `twitterCard[key]` for every `twitter:<key>` found in `name` (or in `property` when `name` is absent).
 *
 * Standard metadata names are compared ASCII case-insensitively, as HTML requires, so
 * e.g. `<meta name="Viewport">` or `<meta name="ROBOTS">` are recognized.
 */
function categorizeMetaTags(pageData: ExtractedPageData): void {
  for (const meta of pageData.metaTags) {
    const { content, line } = meta
    // Only standard metadata names are case-folded; og:/twitter: keys are kept as authored.
    const standardName = meta.name?.toLowerCase()

    if (standardName === 'description') {
      pageData.metaDescription = { content, line }
    }
    if (standardName === 'viewport') {
      pageData.hasViewport = true
    }
    if (standardName === 'robots' && content.toLowerCase().includes('noindex')) {
      pageData.hasNoindex = true
    }

    if (meta.property?.startsWith('og:')) {
      const key = meta.property.replace('og:', '')
      pageData.openGraph[key] = { content, line }
    }

    const socialId = meta.name ?? meta.property ?? ''
    if (socialId.startsWith('twitter:')) {
      const key = socialId.replace('twitter:', '')
      pageData.twitterCard[key] = { content, line }
    }
  }
}
177
+
178
/**
 * Read the canonical link of the document head.
 *
 * @returns The `href` of the first `link[rel="canonical"]` with the line of the first
 * `rel="canonical"` occurrence in the source, or `undefined` when the link or its
 * `href` is absent.
 */
function extractCanonical(head: ParsedHTMLElement, html: string, lineIndex: number[]): ExtractedPageData['canonical'] {
  const href = head.querySelector('link[rel="canonical"]')?.getAttribute('href')
  if (!href) return undefined

  const locate = createLineFinder(html, lineIndex)
  const line = locate('rel="canonical"')
  return { href, line }
}
186
+
187
/**
 * Collect every non-empty `<script type="application/ld+json">` block in the document.
 *
 * Each entry records the raw text, whether it parsed as JSON (`valid`), the parse error
 * message when it did not, a display type taken from the top-level `@type`, and the
 * source line of the block.
 *
 * - `valid` reflects JSON syntax only: any text `JSON.parse` accepts (including `null`
 *   or other primitives) is valid, with type `'Unknown'` when no `@type` is available.
 * - A string `@type` is used as-is; an array of types is joined with `', '`.
 */
function extractJsonLd(root: ParsedHTMLElement, html: string, lineIndex: number[]): JsonLdData[] {
  const entries: JsonLdData[] = []
  const findLine = createLineFinder(html, lineIndex)

  // Turn the top-level @type of parsed JSON into a display string.
  const describeType = (data: unknown): string => {
    if (typeof data !== 'object' || data === null) return 'Unknown'
    const declared = (data as Record<string, unknown>)['@type']
    if (typeof declared === 'string' && declared) return declared
    if (Array.isArray(declared)) {
      const names = declared.filter((item): item is string => typeof item === 'string' && item.length > 0)
      if (names.length > 0) return names.join(', ')
    }
    return 'Unknown'
  }

  for (const script of root.querySelectorAll('script[type="application/ld+json"]')) {
    // Advance the finder for every matching element — including empty ones that are
    // skipped below — so later blocks are not attributed to an earlier block's line.
    const line = findLine('application/ld+json')
    const raw = script.textContent?.trim() || ''
    if (!raw) continue

    let data: unknown
    try {
      data = JSON.parse(raw)
    } catch (e) {
      entries.push({
        type: 'Unknown',
        raw,
        valid: false,
        error: e instanceof Error ? e.message : String(e),
        line,
      })
      continue
    }

    // Only JSON.parse sits inside the try: reading @type can no longer be mistaken for a syntax error.
    entries.push({ type: describeType(data), raw, valid: true, line })
  }
  return entries
}
209
+
210
/**
 * Collect all `h1`–`h6` elements of the body in document order.
 *
 * The level is parsed from the tag name; elements whose tag does not yield a number are
 * skipped. Lines are located via the first 40 characters of the serialized element.
 */
function extractHeadings(body: ParsedHTMLElement, html: string, lineIndex: number[]): HeadingData[] {
  const locate = createLineFinder(html, lineIndex)
  const found: HeadingData[] = []

  body.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach((headingEl) => {
    const tagName = headingEl.tagName?.toLowerCase() || ''
    const level = parseInt(tagName.replace('h', ''), 10)
    if (Number.isNaN(level)) return

    const text = headingEl.textContent?.trim() || ''
    const line = locate(headingEl.toString().substring(0, 40))
    found.push({ level, text, line })
  })

  return found
}
225
+
226
/**
 * Collect every `<img>` in the document in document order.
 *
 * `alt` stays `undefined` only when the attribute is absent (an empty `alt=""` is kept),
 * while a missing `src` becomes `''` and an empty `loading` becomes `undefined`.
 */
function extractImages(root: ParsedHTMLElement, html: string, lineIndex: number[]): ImageData[] {
  const locate = createLineFinder(html, lineIndex)
  return root.querySelectorAll('img').map((imgEl): ImageData => {
    const src = imgEl.getAttribute('src') || ''
    const alt = imgEl.getAttribute('alt') ?? undefined
    const loading = imgEl.getAttribute('loading') || undefined
    const line = locate(imgEl.toString().substring(0, 60))
    return { src, alt, loading, line }
  })
}
239
+
240
/**
 * Collect every `<a>` element of the body in document order.
 *
 * A missing `href` becomes `''`, the link text is trimmed, and an empty `rel` becomes
 * `undefined`. Lines are located via the first 60 characters of the serialized element.
 */
function extractLinks(body: ParsedHTMLElement, html: string, lineIndex: number[]): LinkData[] {
  const locate = createLineFinder(html, lineIndex)
  return body.querySelectorAll('a').map((anchor): LinkData => {
    const href = anchor.getAttribute('href') || ''
    const text = anchor.textContent?.trim() || ''
    const rel = anchor.getAttribute('rel') || undefined
    const line = locate(anchor.toString().substring(0, 60))
    return { href, text, rel, line }
  })
}
253
+
254
/**
 * Collect every `<script>` in the document in document order.
 *
 * A script counts as inline when it has no `src` and its text is not blank; only inline
 * scripts get a non-zero `size` (UTF-8 byte length of the untrimmed text).
 */
function extractScripts(root: ParsedHTMLElement, html: string, lineIndex: number[]): ScriptData[] {
  const locate = createLineFinder(html, lineIndex)
  return root.querySelectorAll('script').map((scriptEl): ScriptData => {
    const src = scriptEl.getAttribute('src') || undefined
    const code = scriptEl.textContent ?? ''
    const isInline = src === undefined && code.trim().length > 0
    const size = isInline ? Buffer.byteLength(code, 'utf8') : 0

    return {
      src,
      type: scriptEl.getAttribute('type') || undefined,
      isAsync: scriptEl.hasAttribute('async'),
      isDefer: scriptEl.hasAttribute('defer'),
      isInline,
      size,
      line: locate(scriptEl.toString().substring(0, 60)),
    }
  })
}
273
+
274
/**
 * Collect the external stylesheets referenced from the head.
 *
 * Only `link[rel="stylesheet"]` elements with a non-empty `href` are included; an empty
 * `media` attribute is reported as `undefined`.
 */
function extractStylesheets(head: ParsedHTMLElement, html: string, lineIndex: number[]): StylesheetData[] {
  const locate = createLineFinder(html, lineIndex)
  const sheets: StylesheetData[] = []

  head.querySelectorAll('link[rel="stylesheet"]').forEach((linkEl) => {
    const href = linkEl.getAttribute('href')
    if (!href) return

    const media = linkEl.getAttribute('media') || undefined
    const line = locate(linkEl.toString().substring(0, 60))
    sheets.push({ href, media, line })
  })

  return sheets
}
288
+
289
/**
 * Collect every `<form>` of the body together with the controls that need a label.
 *
 * For each `input`, `select` and `textarea` inside a form the result records its type,
 * name, id, source line, and whether it is labelled. A control counts as labelled when:
 *  - some `<label for="...">` in the document references its id,
 *  - it is nested inside a `<label>`, or
 *  - it has an `aria-label` or `aria-labelledby` attribute.
 *
 * Controls of type `hidden`, `submit`, `button` and `reset` are skipped because they are
 * not labelled through `<label>`. Types are compared case-insensitively and reported in
 * lower case.
 */
function extractForms(body: ParsedHTMLElement, html: string, lineIndex: number[]): ExtractedFormData[] {
  const forms: ExtractedFormData[] = []
  const findFormLine = createLineFinder(html, lineIndex)
  // One finder shared by all forms: controls are visited in document order, so identical
  // markup in a later form resolves to its own line instead of the first form's.
  const findInputLine = createLineFinder(html, lineIndex)

  // Ids referenced by any <label for>. Collected once for the whole body because `for`
  // association is not limited to labels inside the same form, and a set lookup avoids
  // building a selector from an arbitrary id (quotes or brackets would break it).
  const labelledIds = new Set<string>()
  for (const label of body.querySelectorAll('label')) {
    const target = label.getAttribute('for')
    if (target) labelledIds.add(target)
  }

  const unlabelledTypes = new Set(['hidden', 'submit', 'button', 'reset'])

  for (const form of body.querySelectorAll('form')) {
    const inputs: ExtractedFormData['inputs'] = []
    for (const input of form.querySelectorAll('input, select, textarea')) {
      const id = input.getAttribute('id')
      const name = input.getAttribute('name')
      const type = (input.getAttribute('type') || input.tagName?.toLowerCase() || 'text').toLowerCase()

      // Hidden fields and buttons do not take a <label>.
      if (unlabelledTypes.has(type)) continue

      const hasLabel = !!(
        (id && labelledIds.has(id))
        || input.closest('label')
        || input.getAttribute('aria-label')
        || input.getAttribute('aria-labelledby')
      )

      inputs.push({
        type,
        name: name || undefined,
        id: id || undefined,
        hasLabel,
        line: findInputLine(input.toString().substring(0, 60)),
      })
    }
    forms.push({
      inputs,
      line: findFormLine(form.toString().substring(0, 40)),
    })
  }
  return forms
}