@nuasite/cms-marker 0.0.75 → 0.0.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/types/build-processor.d.ts.map +1 -1
- package/dist/types/dev-middleware.d.ts.map +1 -1
- package/dist/types/html-processor.d.ts +5 -1
- package/dist/types/html-processor.d.ts.map +1 -1
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/manifest-writer.d.ts +3 -2
- package/dist/types/manifest-writer.d.ts.map +1 -1
- package/dist/types/seo-processor.d.ts +23 -0
- package/dist/types/seo-processor.d.ts.map +1 -0
- package/dist/types/source-finder/index.d.ts +3 -1
- package/dist/types/source-finder/index.d.ts.map +1 -1
- package/dist/types/source-finder/seo-finder.d.ts +32 -0
- package/dist/types/source-finder/seo-finder.d.ts.map +1 -0
- package/dist/types/source-finder/snippet-utils.d.ts +3 -3
- package/dist/types/source-finder/snippet-utils.d.ts.map +1 -1
- package/dist/types/source-finder/source-lookup.d.ts.map +1 -1
- package/dist/types/tsconfig.tsbuildinfo +1 -1
- package/dist/types/types.d.ts +89 -0
- package/dist/types/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/build-processor.ts +3 -1
- package/src/dev-middleware.ts +16 -13
- package/src/html-processor.ts +42 -2
- package/src/index.ts +2 -0
- package/src/manifest-writer.ts +12 -2
- package/src/seo-processor.ts +325 -0
- package/src/source-finder/index.ts +5 -1
- package/src/source-finder/seo-finder.ts +336 -0
- package/src/source-finder/snippet-utils.ts +7 -10
- package/src/source-finder/source-lookup.ts +3 -4
- package/src/types.ts +101 -0
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import { type HTMLElement as ParsedHTMLElement, parse } from 'node-html-parser'
|
|
2
|
+
import type { CanonicalUrl, JsonLdEntry, OpenGraphData, PageSeoData, SeoKeywords, SeoMetaTag, SeoTitle, TwitterCardData } from './types'
|
|
3
|
+
|
|
4
|
+
/**
 * Alias for node-html-parser's `HTMLElement` type (imported above as
 * `ParsedHTMLElement`); used for every parsed element handled in this module.
 */
type HTMLNode = ParsedHTMLElement
|
|
6
|
+
|
|
7
|
+
/**
 * Options accepted by `processSeoFromHtml`.
 */
export interface ProcessSeoOptions {
  /**
   * Whether to mark the page title with a CMS ID (default: true).
   * The title is only marked when an ID generator is also passed to `processSeoFromHtml`.
   */
  markTitle?: boolean
  /** Whether to parse JSON-LD structured data (default: true) */
  parseJsonLd?: boolean
  /** Path to source file for source tracking; recorded as `sourcePath` on extracted entries ('' when omitted) */
  sourcePath?: string
}
|
|
15
|
+
|
|
16
|
+
/**
 * Value returned by `processSeoFromHtml`.
 */
export interface ProcessSeoResult {
  /** Extracted SEO data; only the fields that were found are set */
  seo: PageSeoData
  /**
   * HTML serialized back from the parsed document. Carries the title's
   * `data-cms-id` attribute when the title was marked.
   */
  html: string
  /** The CMS ID assigned to the title element; undefined when the title was not marked */
  titleCmsId?: string
}
|
|
24
|
+
|
|
25
|
+
/**
 * Parse an HTML document and collect the SEO metadata it declares.
 *
 * The `<title>` is looked up anywhere in the parsed document. Meta tags, the
 * canonical link and JSON-LD are only collected when a `<head>` element exists.
 *
 * @param html - HTML source to analyse
 * @param options - Toggles for title marking and JSON-LD parsing, plus the source path to record
 * @param getNextId - Generator for CMS IDs; without it the title is never marked
 * @returns The extracted SEO data, the re-serialized HTML and the title's CMS ID (if assigned)
 */
export function processSeoFromHtml(
  html: string,
  options: ProcessSeoOptions = {},
  getNextId?: () => string,
): ProcessSeoResult {
  const {
    markTitle: shouldMarkTitle = true,
    parseJsonLd: shouldParseJsonLd = true,
    sourcePath,
  } = options

  const parsed = parse(html, {
    lowerCaseTagName: false,
    comment: true,
    blockTextElements: {
      script: true,
      noscript: true,
      style: true,
      pre: true,
    },
  })

  const seo: PageSeoData = {}

  // Title first: this is the only step that may mutate the parsed tree.
  const titleInfo = extractTitle(parsed, html, sourcePath, shouldMarkTitle, getNextId)
  if (titleInfo) {
    seo.title = titleInfo.title
  }

  const headElement = parsed.querySelector('head')
  if (headElement) {
    categorizeMetaTags(extractMetaTags(headElement, html, sourcePath), seo)

    const canonicalLink = extractCanonical(headElement, html, sourcePath)
    if (canonicalLink) {
      seo.canonical = canonicalLink
    }

    if (shouldParseJsonLd) {
      const structuredData = extractJsonLd(headElement, html, sourcePath)
      if (structuredData.length > 0) {
        seo.jsonLd = structuredData
      }
    }
  }

  return {
    seo,
    html: parsed.toString(),
    titleCmsId: titleInfo?.cmsId,
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Locate the document's `<title>` element and describe it.
 *
 * Returns `undefined` when there is no `<title>` or its trimmed text is empty.
 * When `markTitle` is truthy and `getNextId` is supplied, a fresh ID is written
 * to the element's `data-cms-id` attribute. The source snippet is captured
 * before that attribute is added.
 */
function extractTitle(
  root: HTMLNode,
  html: string,
  sourcePath?: string,
  markTitle?: boolean,
  getNextId?: () => string,
): { title: SeoTitle; cmsId?: string } | undefined {
  const element = root.querySelector('title')
  if (!element) {
    return undefined
  }

  const text = element.textContent?.trim() || ''
  if (text === '') {
    return undefined
  }

  // Resolve the location before mutating the element so the snippet reflects the input.
  const location = findElementSourceLocation(element, html, sourcePath)

  let cmsId: string | undefined
  if (markTitle && getNextId) {
    cmsId = getNextId()
    element.setAttribute('data-cms-id', cmsId)
  }

  const title: SeoTitle = { content: text, cmsId, ...location }
  return { title, cmsId }
}
|
|
119
|
+
|
|
120
|
+
/**
 * Collect every usable `<meta>` element under `head`.
 *
 * A tag is kept only when it has a non-empty `content` attribute and at least
 * one of `name` / `property`. Each kept tag is annotated with its source location.
 */
function extractMetaTags(
  head: HTMLNode,
  html: string,
  sourcePath?: string,
): SeoMetaTag[] {
  return head.querySelectorAll('meta').flatMap((element): SeoMetaTag[] => {
    const content = element.getAttribute('content')
    const name = element.getAttribute('name')
    const property = element.getAttribute('property')

    const isIdentified = Boolean(name) || Boolean(property)
    if (!content || !isIdentified) {
      return []
    }

    return [{
      name: name || undefined,
      property: property || undefined,
      content,
      ...findElementSourceLocation(element, html, sourcePath),
    }]
  })
}
|
|
151
|
+
|
|
152
|
+
/**
 * Sort meta tags into the description, keywords, Open Graph and Twitter Card
 * slots of `seo`.
 *
 * - `name="description"` and `name="keywords"` are stored directly; keywords are
 *   additionally split on commas, trimmed, and stripped of empty items.
 * - `property="og:*"` tags fill `seo.openGraph` (title, description, image, url,
 *   type, site_name → `siteName`); other `og:` keys are ignored.
 * - Tags whose `name` or `property` starts with `twitter:` fill `seo.twitterCard`
 *   (card, title, description, image, site). The key is taken from whichever
 *   attribute actually carries the prefix.
 *
 * When a key occurs more than once the last tag wins. `seo.openGraph` and
 * `seo.twitterCard` are only assigned when at least one field was found.
 *
 * @param metaTags - Tags to categorize, in document order
 * @param seo - Target object; mutated in place
 */
function categorizeMetaTags(metaTags: SeoMetaTag[], seo: PageSeoData): void {
  const OG_PREFIX = 'og:'
  const TWITTER_PREFIX = 'twitter:'

  const openGraph: OpenGraphData = {}
  const twitterCard: TwitterCardData = {}

  for (const meta of metaTags) {
    const { name, property, content } = meta

    // Description
    if (name === 'description') {
      seo.description = meta
      continue
    }

    // Keywords
    if (name === 'keywords') {
      const keywords = content.split(',').map(k => k.trim()).filter(Boolean)
      seo.keywords = {
        ...meta,
        keywords,
      } as SeoKeywords
      continue
    }

    // Open Graph tags
    if (property?.startsWith(OG_PREFIX)) {
      switch (property.slice(OG_PREFIX.length)) {
        case 'title':
          openGraph.title = meta
          break
        case 'description':
          openGraph.description = meta
          break
        case 'image':
          openGraph.image = meta
          break
        case 'url':
          openGraph.url = meta
          break
        case 'type':
          openGraph.type = meta
          break
        case 'site_name':
          openGraph.siteName = meta
          break
      }
      continue
    }

    // Twitter Card tags: use the attribute that really has the prefix, so a tag
    // with an unrelated `name` and a `twitter:*` `property` is still recognized.
    const twitterAttribute = [name, property].find(value => value?.startsWith(TWITTER_PREFIX))
    if (twitterAttribute) {
      switch (twitterAttribute.slice(TWITTER_PREFIX.length)) {
        case 'card':
          twitterCard.card = meta
          break
        case 'title':
          twitterCard.title = meta
          break
        case 'description':
          twitterCard.description = meta
          break
        case 'image':
          twitterCard.image = meta
          break
        case 'site':
          twitterCard.site = meta
          break
      }
    }
  }

  // Only add if we found any OG tags
  if (Object.keys(openGraph).length > 0) {
    seo.openGraph = openGraph
  }

  // Only add if we found any Twitter tags
  if (Object.keys(twitterCard).length > 0) {
    seo.twitterCard = twitterCard
  }
}
|
|
237
|
+
|
|
238
|
+
/**
 * Read the canonical URL declared by `<link rel="canonical">` inside `head`.
 *
 * Returns `undefined` when no such link exists or its `href` is missing/empty.
 */
function extractCanonical(
  head: HTMLNode,
  html: string,
  sourcePath?: string,
): CanonicalUrl | undefined {
  const link = head.querySelector('link[rel="canonical"]')
  const href = link?.getAttribute('href')

  if (!link || !href) {
    return undefined
  }

  return { href, ...findElementSourceLocation(link, html, sourcePath) }
}
|
|
259
|
+
|
|
260
|
+
/**
 * Collect JSON-LD entries from `<script type="application/ld+json">` elements.
 *
 * The lookup starts from the parent of `head`, so scripts placed outside the
 * head (e.g. in the body) are included as well. Scripts that are empty or that
 * fail to parse are skipped. An entry's `type` is the parsed value's `@type`,
 * or `'Unknown'` when that is missing/falsy.
 */
function extractJsonLd(
  head: HTMLNode,
  html: string,
  sourcePath?: string,
): JsonLdEntry[] {
  const searchRoot = head.parentNode as HTMLNode
  const scriptElements = searchRoot?.querySelectorAll('script[type="application/ld+json"]') || []

  const collected: JsonLdEntry[] = []

  for (const scriptElement of scriptElements) {
    const rawJson = scriptElement.textContent?.trim()
    if (rawJson) {
      try {
        const data = JSON.parse(rawJson)
        const type = data['@type'] || 'Unknown'
        const location = findElementSourceLocation(scriptElement, html, sourcePath)
        collected.push({ type, data, ...location })
      } catch {
        // Best effort: a script that cannot be processed is ignored.
      }
    }
  }

  return collected
}
|
|
295
|
+
|
|
296
|
+
/**
 * Describe where an element appears in the original HTML.
 *
 * The element is serialized with `toString()` and that string becomes
 * `sourceSnippet`. Its first line, truncated to 50 characters, is then searched
 * for line by line in `html`; the first line containing it gives `sourceLine`.
 *
 * @param element - Parsed element to locate
 * @param html - Original HTML text the element was parsed from
 * @param sourcePath - Path to record; `''` is used when omitted
 * @returns `sourcePath`, 1-indexed `sourceLine` (1 when no line matches) and `sourceSnippet`
 */
function findElementSourceLocation(
  element: HTMLNode,
  html: string,
  sourcePath?: string,
): { sourcePath: string; sourceLine: number; sourceSnippet: string } {
  // Longest prefix of the snippet's first line that is used for matching.
  const MAX_NEEDLE_LENGTH = 50

  // Get the element's outer HTML as the source snippet
  const sourceSnippet = element.toString()

  // Build the search needle once instead of on every loop iteration.
  const firstSnippetLine = sourceSnippet.split('\n')[0] || sourceSnippet
  const needle = firstSnippetLine.substring(0, MAX_NEEDLE_LENGTH)

  const matchIndex = html.split('\n').findIndex(line => line.includes(needle))

  return {
    sourcePath: sourcePath || '',
    // No match keeps the historical default of line 1.
    sourceLine: matchIndex === -1 ? 1 : matchIndex + 1,
    sourceSnippet,
  }
}
|
|
@@ -23,4 +23,8 @@ export { findImageSourceLocation } from './image-finder'
|
|
|
23
23
|
export { findCollectionSource, findMarkdownSourceLocation, parseMarkdownContent } from './collection-finder'
|
|
24
24
|
|
|
25
25
|
// Snippet utilities (used by html-processor)
|
|
26
|
-
export { enhanceManifestWithSourceSnippets, extractCompleteTagSnippet, extractInnerHtmlFromSnippet,
|
|
26
|
+
export { enhanceManifestWithSourceSnippets, extractCompleteTagSnippet, extractInnerHtmlFromSnippet, extractSourceSnippet } from './snippet-utils'
|
|
27
|
+
|
|
28
|
+
// SEO source finding
|
|
29
|
+
export { findSeoSource } from './seo-finder'
|
|
30
|
+
export type { SeoElementIdentifier, SeoSourceLocation } from './seo-finder'
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
import fs from 'node:fs/promises'
|
|
2
|
+
import path from 'node:path'
|
|
3
|
+
|
|
4
|
+
import { getProjectRoot } from '../config'
|
|
5
|
+
|
|
6
|
+
/**
 * SEO element identifier for source finding.
 *
 * Which fields are read depends on the element type passed to `findSeoSource`:
 * `content` for titles, `name`/`property`/`content` for meta tags, `href` for
 * canonical links and `jsonLdType` for JSON-LD scripts.
 */
export interface SeoElementIdentifier {
  /** Meta tag name attribute */
  name?: string
  /** Meta tag property attribute (for OG/Twitter) */
  property?: string
  /** Content value to match (only a leading portion of it is compared) */
  content?: string
  /** Canonical URL href (only a leading portion of it is compared) */
  href?: string
  /** JSON-LD @type value */
  jsonLdType?: string
}
|
|
21
|
+
|
|
22
|
+
/**
 * Result of SEO source finding: where a matching element was found in the
 * project's source files.
 */
export interface SeoSourceLocation {
  /** Path to source file relative to project root */
  sourcePath: string
  /** Line number in source file (1-indexed) */
  sourceLine: number
  /** Source code snippet of the element, starting at `sourceLine` */
  sourceSnippet: string
}
|
|
33
|
+
|
|
34
|
+
/**
 * Find the source location for an SEO element.
 *
 * Searches `.astro`/`.html` files under `src/pages`, `src/layouts` and
 * `src/components` (in that order, relative to the project root) and returns
 * the first match.
 *
 * @param type - Kind of SEO element to look for
 * @param identifier - Attribute values used to recognize the element
 * @returns The first matching location, or `undefined` when nothing matches
 */
export async function findSeoSource(
  type: 'title' | 'meta' | 'canonical' | 'jsonld',
  identifier: SeoElementIdentifier,
): Promise<SeoSourceLocation | undefined> {
  const sourceRoot = path.join(getProjectRoot(), 'src')

  for (const folder of ['pages', 'layouts', 'components']) {
    try {
      const found = await searchDirectoryForSeo(path.join(sourceRoot, folder), type, identifier)
      if (found) {
        return found
      }
    } catch {
      // Directory doesn't exist, continue
    }
  }

  return undefined
}
|
|
60
|
+
|
|
61
|
+
/**
 * Walk `dir` depth-first, in the order `readdir` returns entries, looking for
 * the requested SEO element.
 *
 * Sub-directories are searched recursively; only files ending in `.astro` or
 * `.html` are inspected. Any error raised while walking results in `undefined`.
 */
async function searchDirectoryForSeo(
  dir: string,
  type: 'title' | 'meta' | 'canonical' | 'jsonld',
  identifier: SeoElementIdentifier,
): Promise<SeoSourceLocation | undefined> {
  const searchableExtensions = ['.astro', '.html']

  try {
    for (const entry of await fs.readdir(dir, { withFileTypes: true })) {
      const entryPath = path.join(dir, entry.name)

      let found: SeoSourceLocation | undefined
      if (entry.isDirectory()) {
        found = await searchDirectoryForSeo(entryPath, type, identifier)
      } else if (entry.isFile() && searchableExtensions.some(extension => entry.name.endsWith(extension))) {
        found = await searchFileForSeo(entryPath, type, identifier)
      }

      if (found) {
        return found
      }
    }
  } catch {
    // Error reading directory
  }

  return undefined
}
|
|
89
|
+
|
|
90
|
+
/**
 * Read one file and look for the requested SEO element in it.
 *
 * The file is split on `\n` and handed to the finder that corresponds to
 * `type`. Read errors (and any other error) yield `undefined`.
 */
async function searchFileForSeo(
  filePath: string,
  type: 'title' | 'meta' | 'canonical' | 'jsonld',
  identifier: SeoElementIdentifier,
): Promise<SeoSourceLocation | undefined> {
  try {
    const text = await fs.readFile(filePath, 'utf-8')
    const lines = text.split('\n')

    if (type === 'title') {
      return findTitleInLines(lines, filePath, identifier.content)
    }
    if (type === 'meta') {
      return findMetaInLines(lines, filePath, identifier)
    }
    if (type === 'canonical') {
      return findCanonicalInLines(lines, filePath, identifier.href)
    }
    if (type === 'jsonld') {
      return findJsonLdInLines(lines, filePath, identifier.jsonLdType)
    }
    return undefined
  } catch {
    return undefined
  }
}
|
|
122
|
+
|
|
123
|
+
/**
 * Find the `<title>` element in source lines.
 *
 * Every line containing `<title` is a candidate. When `content` is given, a
 * single-line `<title>…</title>` whose text contains the first 20 characters of
 * `content` is preferred. If no candidate matches that way (multi-line or
 * dynamic titles), the first candidate line is used instead. Without `content`
 * the first candidate line is returned directly.
 *
 * @param lines - File content split into lines
 * @param filePath - Path of the file; reported relative to the project root
 * @param content - Rendered title text used to pick the best candidate
 * @returns Location of the chosen title line, or `undefined` when the file has none
 */
function findTitleInLines(
  lines: string[],
  filePath: string,
  content?: string,
): SeoSourceLocation | undefined {
  const needle = content ? content.substring(0, 20) : undefined

  let firstCandidate = -1
  let matchedCandidate = -1

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] || ''
    if (!line.includes('<title')) continue

    if (firstCandidate === -1) {
      firstCandidate = i
    }

    // Nothing to compare against: the first title line is the answer.
    if (needle === undefined) break

    // Handle single-line title
    const inlineTitle = line.match(/<title[^>]*>([^<]*)<\/title>/i)
    if (inlineTitle?.[1]?.includes(needle)) {
      matchedCandidate = i
      break
    }
  }

  // Multi-line or dynamic titles cannot be compared textually; fall back to the first one seen.
  const chosen = matchedCandidate !== -1 ? matchedCandidate : firstCandidate
  if (chosen === -1) {
    return undefined
  }

  return {
    sourcePath: path.relative(getProjectRoot(), filePath),
    sourceLine: chosen + 1,
    sourceSnippet: extractMultiLineElement(lines, chosen, 'title'),
  }
}
|
|
167
|
+
|
|
168
|
+
/**
 * Quote-aware patterns for the attributes inspected on a `<meta` line.
 * The closing quote must be the same character as the opening one, so a value
 * such as `content="It's here"` is captured in full.
 */
const META_ATTRIBUTE_PATTERNS = {
  name: /(?:^|\s)name\s*=\s*(["'])(.*?)\1/i,
  property: /(?:^|\s)property\s*=\s*(["'])(.*?)\1/i,
  content: /(?:^|\s)content\s*=\s*(["'])(.*?)\1/i,
} as const

/** Read a quoted attribute value from one source line; `undefined` when absent or unquoted. */
function readMetaAttribute(line: string, attribute: keyof typeof META_ATTRIBUTE_PATTERNS): string | undefined {
  return META_ATTRIBUTE_PATTERNS[attribute].exec(line)?.[2]
}

/**
 * Find a meta element in source lines.
 *
 * Only lines containing `<meta` are inspected, and the attributes must be on
 * that same line. A line matches when:
 * - `identifier.name` equals the line's `name` attribute and, if
 *   `identifier.content` is given, the line's `content` attribute contains the
 *   first 30 characters of it; or
 * - `identifier.property` equals the line's `property` attribute.
 *
 * @param lines - File content split into lines
 * @param filePath - Path of the file; reported relative to the project root
 * @param identifier - Attribute values to look for
 * @returns Location of the first matching line, or `undefined`
 */
function findMetaInLines(
  lines: string[],
  filePath: string,
  identifier: SeoElementIdentifier,
): SeoSourceLocation | undefined {
  const { name, property, content } = identifier

  const locationAt = (index: number): SeoSourceLocation => ({
    sourcePath: path.relative(getProjectRoot(), filePath),
    sourceLine: index + 1,
    sourceSnippet: extractMultiLineElement(lines, index, 'meta'),
  })

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] || ''

    if (!line.includes('<meta')) continue

    // Check for name attribute match
    if (name && readMetaAttribute(line, 'name') === name) {
      // Verify content if specified
      if (!content) {
        return locationAt(i)
      }
      if (readMetaAttribute(line, 'content')?.includes(content.substring(0, 30))) {
        return locationAt(i)
      }
    }

    // Check for property attribute match (OG/Twitter)
    if (property && readMetaAttribute(line, 'property') === property) {
      return locationAt(i)
    }
  }

  return undefined
}
|
|
222
|
+
|
|
223
|
+
/**
 * Find the canonical link in source lines.
 *
 * A line qualifies when it contains `rel="canonical"` (either quote style) and,
 * if `href` is given, also contains the first 30 characters of `href`.
 * Returns the first qualifying line, or `undefined` when there is none.
 */
function findCanonicalInLines(
  lines: string[],
  filePath: string,
  href?: string,
): SeoSourceLocation | undefined {
  const index = lines.findIndex((rawLine) => {
    const line = rawLine || ''
    const declaresCanonical = line.includes('rel="canonical"') || line.includes("rel='canonical'")
    if (!declaresCanonical) {
      return false
    }
    // Verify href if specified
    return href ? line.includes(href.substring(0, 30)) : true
  })

  if (index === -1) {
    return undefined
  }

  return {
    sourcePath: path.relative(getProjectRoot(), filePath),
    sourceLine: index + 1,
    sourceSnippet: extractMultiLineElement(lines, index, 'link'),
  }
}
|
|
256
|
+
|
|
257
|
+
/**
 * Find a JSON-LD script in source lines.
 *
 * Candidate lines are those containing `application/ld+json`. Without
 * `jsonLdType` the first candidate is returned. With it, the candidate's
 * extracted `<script>` snippet must contain both `"@type"` and the
 * `jsonLdType` text; otherwise the search continues.
 */
function findJsonLdInLines(
  lines: string[],
  filePath: string,
  jsonLdType?: string,
): SeoSourceLocation | undefined {
  for (let index = 0; index < lines.length; index++) {
    const line = lines[index] || ''
    if (!line.includes('application/ld+json')) continue

    const snippet = extractMultiLineElement(lines, index, 'script')

    // Check if @type matches (if specified)
    const typeMismatch = Boolean(jsonLdType)
      && !(snippet.includes('"@type"') && snippet.includes(jsonLdType as string))
    if (typeMismatch) continue

    return {
      sourcePath: path.relative(getProjectRoot(), filePath),
      sourceLine: index + 1,
      sourceSnippet: snippet,
    }
  }

  return undefined
}
|
|
292
|
+
|
|
293
|
+
/**
 * Extract a potentially multi-line element from source lines.
 *
 * Starting at `startLine` (0-indexed), at most 30 lines are scanned:
 * - For void-style tags (`meta`, `link`, `img`, `br`, `hr`, `input`) scanning
 *   stops at the first line containing `/>`, or containing `>` without a
 *   `</tag` closing sequence.
 * - For other tags, opening, self-closing and closing occurrences of the tag
 *   are counted per line and scanning stops once the nesting depth returns to
 *   zero on a line that closes the element.
 *
 * Blank lines inside the element are kept so the snippet mirrors the source.
 *
 * @param lines - File content split into lines
 * @param startLine - 0-indexed line on which the element starts
 * @param tag - Tag name of the element
 * @returns The scanned lines joined with `\n`. If no end was found and more
 *   than one line was scanned, only the first line is returned.
 */
function extractMultiLineElement(lines: string[], startLine: number, tag: string): string {
  const MAX_SCAN_LINES = 30

  const snippetLines: string[] = []
  let depth = 0
  let foundClosing = false

  // For self-closing tags like <meta /> and <link />
  const isSelfClosing = ['meta', 'link', 'img', 'br', 'hr', 'input'].includes(tag.toLowerCase())

  // Build the patterns once, with the tag name escaped so it is matched literally.
  const escapedTag = tag.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const openPattern = new RegExp(`<${escapedTag}(?:[\\s>]|$)`, 'gi')
  const selfClosePattern = new RegExp(`<${escapedTag}[^>]*/>`, 'gi')
  const closePattern = new RegExp(`</${escapedTag}>`, 'gi')
  const countMatches = (line: string, pattern: RegExp): number => (line.match(pattern) || []).length

  const lastLine = Math.min(startLine + MAX_SCAN_LINES, lines.length)
  for (let i = startLine; i < lastLine; i++) {
    const line = lines[i]
    // Skip holes only; empty strings are real (blank) source lines and are kept.
    if (line === undefined) continue

    snippetLines.push(line)

    if (isSelfClosing) {
      // For self-closing tags, check if line ends the tag
      if (line.includes('/>') || (line.includes('>') && !line.includes('</' + tag))) {
        foundClosing = true
        break
      }
      continue
    }

    // Count opening and closing tags
    const openTags = countMatches(line, openPattern)
    const selfClose = countMatches(line, selfClosePattern)
    const closeTags = countMatches(line, closePattern)

    depth += openTags - selfClose - closeTags

    if (depth <= 0 && (closeTags > 0 || selfClose > 0)) {
      foundClosing = true
      break
    }
  }

  // Without a detected end, a multi-line guess is unreliable: report just the first line.
  if (!foundClosing && snippetLines.length > 1) {
    return snippetLines[0] || ''
  }

  return snippetLines.join('\n')
}
|
|
@@ -173,14 +173,14 @@ export function extractImageSnippet(lines: string[], startLine: number): string
|
|
|
173
173
|
}
|
|
174
174
|
|
|
175
175
|
/**
|
|
176
|
-
* Read source file and extract the
|
|
176
|
+
* Read source file and extract the complete element at the specified line.
|
|
177
177
|
*
|
|
178
178
|
* @param sourceFile - Path to source file (relative to cwd)
|
|
179
179
|
* @param sourceLine - 1-indexed line number
|
|
180
180
|
* @param tag - The tag name
|
|
181
|
-
* @returns The
|
|
181
|
+
* @returns The complete element from source, or undefined if can't extract
|
|
182
182
|
*/
|
|
183
|
-
export async function
|
|
183
|
+
export async function extractSourceSnippet(
|
|
184
184
|
sourceFile: string,
|
|
185
185
|
sourceLine: number,
|
|
186
186
|
tag: string,
|
|
@@ -193,11 +193,8 @@ export async function extractSourceInnerHtml(
|
|
|
193
193
|
const content = await fs.readFile(filePath, 'utf-8')
|
|
194
194
|
const lines = content.split('\n')
|
|
195
195
|
|
|
196
|
-
// Extract the complete tag snippet
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
// Extract innerHTML from the snippet
|
|
200
|
-
return extractInnerHtmlFromSnippet(snippet, tag)
|
|
196
|
+
// Extract the complete tag snippet (including wrapper element)
|
|
197
|
+
return extractCompleteTagSnippet(lines, sourceLine - 1, tag)
|
|
201
198
|
} catch {
|
|
202
199
|
return undefined
|
|
203
200
|
}
|
|
@@ -243,8 +240,8 @@ export async function enhanceManifestWithSourceSnippets(
|
|
|
243
240
|
return [id, entry] as const
|
|
244
241
|
}
|
|
245
242
|
|
|
246
|
-
// Extract the
|
|
247
|
-
const sourceSnippet = await
|
|
243
|
+
// Extract the complete source element
|
|
244
|
+
const sourceSnippet = await extractSourceSnippet(
|
|
248
245
|
entry.sourcePath,
|
|
249
246
|
entry.sourceLine,
|
|
250
247
|
entry.tag,
|