npm - @nuasite/cms-marker - Versions diffs - 0.0.75 → 0.0.76 - Mend

@nuasite/cms-marker 0.0.75 → 0.0.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/dist/types/build-processor.d.ts.map +1 -1
package/dist/types/dev-middleware.d.ts.map +1 -1
package/dist/types/html-processor.d.ts +5 -1
package/dist/types/html-processor.d.ts.map +1 -1
package/dist/types/index.d.ts.map +1 -1
package/dist/types/manifest-writer.d.ts +3 -2
package/dist/types/manifest-writer.d.ts.map +1 -1
package/dist/types/seo-processor.d.ts +23 -0
package/dist/types/seo-processor.d.ts.map +1 -0
package/dist/types/source-finder/index.d.ts +3 -1
package/dist/types/source-finder/index.d.ts.map +1 -1
package/dist/types/source-finder/seo-finder.d.ts +32 -0
package/dist/types/source-finder/seo-finder.d.ts.map +1 -0
package/dist/types/source-finder/snippet-utils.d.ts +3 -3
package/dist/types/source-finder/snippet-utils.d.ts.map +1 -1
package/dist/types/source-finder/source-lookup.d.ts.map +1 -1
package/dist/types/tsconfig.tsbuildinfo +1 -1
package/dist/types/types.d.ts +89 -0
package/dist/types/types.d.ts.map +1 -1
package/package.json +1 -1
package/src/build-processor.ts +3 -1
package/src/dev-middleware.ts +16 -13
package/src/html-processor.ts +42 -2
package/src/index.ts +2 -0
package/src/manifest-writer.ts +12 -2
package/src/seo-processor.ts +325 -0
package/src/source-finder/index.ts +5 -1
package/src/source-finder/seo-finder.ts +336 -0
package/src/source-finder/snippet-utils.ts +7 -10
package/src/source-finder/source-lookup.ts +3 -4
package/src/types.ts +101 -0

package/src/seo-processor.ts ADDED Viewed

@@ -0,0 +1,325 @@
+import { type HTMLElement as ParsedHTMLElement, parse } from 'node-html-parser'
+import type { CanonicalUrl, JsonLdEntry, OpenGraphData, PageSeoData, SeoKeywords, SeoMetaTag, SeoTitle, TwitterCardData } from './types'
+/** Alias for the `HTMLElement` node type exported by node-html-parser (imported here as `ParsedHTMLElement`) */
+type HTMLNode = ParsedHTMLElement
+export interface ProcessSeoOptions {
+	/** Stamp the <title> element with a `data-cms-id` attribute (default: true); only applied when an ID generator is passed to `processSeoFromHtml` */
+	markTitle?: boolean
+	/** Collect the contents of `application/ld+json` scripts as structured data (default: true) */
+	parseJsonLd?: boolean
+	/** Source file path copied into each extracted entry's `sourcePath`; an empty string is recorded when omitted */
+	sourcePath?: string
+}
+export interface ProcessSeoResult {
+	/** SEO data extracted from the document; a field is only set when the matching element was found */
+	seo: PageSeoData
+	/** The document as re-serialized by the parser, including the title's `data-cms-id` when one was assigned */
+	html: string
+	/** The CMS ID assigned to the title element; undefined when no ID was assigned */
+	titleCmsId?: string
+}
+/**
+ * Parse an HTML document and collect the SEO metadata it declares.
+ *
+ * The title is looked up anywhere in the document; meta tags, the canonical
+ * link and JSON-LD scripts are only collected when a <head> element exists.
+ *
+ * @param html - HTML document to inspect
+ * @param options - Title marking / JSON-LD toggles and the source path to record
+ * @param getNextId - CMS ID generator; the title is only marked when this is provided
+ * @returns Extracted SEO data, the re-serialized HTML and the title's CMS ID (if any)
+ */
+export function processSeoFromHtml(
+	html: string,
+	options: ProcessSeoOptions = {},
+	getNextId?: () => string,
+): ProcessSeoResult {
+	const { sourcePath, markTitle = true, parseJsonLd = true } = options
+	const doc = parse(html, {
+		lowerCaseTagName: false,
+		comment: true,
+		blockTextElements: {
+			script: true,
+			noscript: true,
+			style: true,
+			pre: true,
+		},
+	})
+	const seo: PageSeoData = {}
+	// The title may receive a data-cms-id here, which is why the HTML is re-serialized below
+	const titleInfo = extractTitle(doc, html, sourcePath, markTitle, getNextId)
+	if (titleInfo) seo.title = titleInfo.title
+	const headElement = doc.querySelector('head')
+	if (headElement) {
+		categorizeMetaTags(extractMetaTags(headElement, html, sourcePath), seo)
+		const canonicalLink = extractCanonical(headElement, html, sourcePath)
+		if (canonicalLink) seo.canonical = canonicalLink
+		const structuredData = parseJsonLd ? extractJsonLd(headElement, html, sourcePath) : []
+		if (structuredData.length > 0) seo.jsonLd = structuredData
+	}
+	return { seo, html: doc.toString(), titleCmsId: titleInfo?.cmsId }
+}
+/**
+ * Read the <title> element and, when requested, tag it with a CMS ID.
+ * Returns undefined when there is no title element or its text is blank.
+ */
+function extractTitle(
+	root: HTMLNode,
+	html: string,
+	sourcePath?: string,
+	markTitle?: boolean,
+	getNextId?: () => string,
+): { title: SeoTitle; cmsId?: string } | undefined {
+	const titleNode = root.querySelector('title')
+	const text = titleNode?.textContent?.trim()
+	if (!titleNode || !text) return undefined
+	// Resolve the location before mutating the element so the snippet reflects the untouched markup
+	const location = findElementSourceLocation(titleNode, html, sourcePath)
+	let cmsId: string | undefined
+	if (getNextId && markTitle) {
+		cmsId = getNextId()
+		titleNode.setAttribute('data-cms-id', cmsId)
+	}
+	const title: SeoTitle = { content: text, cmsId, ...location }
+	return { title, cmsId }
+}
+/**
+ * Collect every <meta> tag under `head` that has a non-empty `content` and
+ * at least one of `name` / `property`, together with its source location.
+ */
+function extractMetaTags(
+	head: HTMLNode,
+	html: string,
+	sourcePath?: string,
+): SeoMetaTag[] {
+	const collected: SeoMetaTag[] = []
+	for (const element of head.querySelectorAll('meta')) {
+		const content = element.getAttribute('content')
+		const name = element.getAttribute('name') || undefined
+		const property = element.getAttribute('property') || undefined
+		// Tags without a value, or without any identifying attribute, carry no SEO information
+		const isIdentified = Boolean(name || property)
+		if (!content || !isIdentified) continue
+		const location = findElementSourceLocation(element, html, sourcePath)
+		collected.push({ name, property, content, ...location })
+	}
+	return collected
+}
+/**
+ * Sort meta tags into the description, keywords, Open Graph and Twitter Card
+ * slots of `seo`.
+ *
+ * A tag is classified once per attribute, so a combined tag such as
+ * `<meta name="description" property="og:description">` fills both slots.
+ * When several tags target the same slot, the last one wins. `openGraph` and
+ * `twitterCard` are only assigned when at least one of their fields was found.
+ *
+ * @param metaTags - Tags to classify
+ * @param seo - Result object, mutated in place
+ */
+function categorizeMetaTags(metaTags: SeoMetaTag[], seo: PageSeoData): void {
+	const ogPrefix = 'og:'
+	const twitterPrefix = 'twitter:'
+	const openGraph: OpenGraphData = {}
+	const twitterCard: TwitterCardData = {}
+	for (const meta of metaTags) {
+		const { name, property, content } = meta
+		if (name === 'description') {
+			seo.description = meta
+		} else if (name === 'keywords') {
+			// Split the comma-separated list and drop empty items left by stray commas
+			const keywords = content.split(',').map(k => k.trim()).filter(Boolean)
+			seo.keywords = {
+				...meta,
+				keywords,
+			} as SeoKeywords
+		}
+		// Open Graph tags
+		if (property?.startsWith(ogPrefix)) {
+			switch (property.slice(ogPrefix.length)) {
+				case 'title':
+					openGraph.title = meta
+					break
+				case 'description':
+					openGraph.description = meta
+					break
+				case 'image':
+					openGraph.image = meta
+					break
+				case 'url':
+					openGraph.url = meta
+					break
+				case 'type':
+					openGraph.type = meta
+					break
+				case 'site_name':
+					openGraph.siteName = meta
+					break
+			}
+		}
+		// Twitter Card tags: read the key from whichever attribute actually carries
+		// the prefix, since `name` may hold an unrelated value next to a twitter `property`
+		const twitterAttr = [name, property].find(value => value?.startsWith(twitterPrefix))
+		if (twitterAttr) {
+			switch (twitterAttr.slice(twitterPrefix.length)) {
+				case 'card':
+					twitterCard.card = meta
+					break
+				case 'title':
+					twitterCard.title = meta
+					break
+				case 'description':
+					twitterCard.description = meta
+					break
+				case 'image':
+					twitterCard.image = meta
+					break
+				case 'site':
+					twitterCard.site = meta
+					break
+			}
+		}
+	}
+	// Only add if we found any OG tags
+	if (Object.keys(openGraph).length > 0) {
+		seo.openGraph = openGraph
+	}
+	// Only add if we found any Twitter tags
+	if (Object.keys(twitterCard).length > 0) {
+		seo.twitterCard = twitterCard
+	}
+}
+/**
+ * Read the `<link rel="canonical">` element under `head`.
+ * Returns undefined when the link is missing or has no `href`.
+ */
+function extractCanonical(
+	head: HTMLNode,
+	html: string,
+	sourcePath?: string,
+): CanonicalUrl | undefined {
+	const link = head.querySelector('link[rel="canonical"]')
+	const href = link?.getAttribute('href')
+	if (!link || !href) return undefined
+	return { href, ...findElementSourceLocation(link, html, sourcePath) }
+}
+/**
+ * Extract JSON-LD structured data from `application/ld+json` script tags.
+ *
+ * Scripts are looked up from the parent of `head`, so scripts placed in the
+ * body are found as well. Scripts that are empty, contain invalid JSON or
+ * whose payload is not an object/array are skipped. The entry's `type` is the
+ * payload's `@type` when it is a string, the first string when it is a list,
+ * and 'Unknown' otherwise.
+ */
+function extractJsonLd(
+	head: HTMLNode,
+	html: string,
+	sourcePath?: string,
+): JsonLdEntry[] {
+	const entries: JsonLdEntry[] = []
+	// Also check body for JSON-LD scripts (some sites place them there)
+	const root = head.parentNode as HTMLNode
+	const scripts = root?.querySelectorAll('script[type="application/ld+json"]') || []
+	for (const script of scripts) {
+		const content = script.textContent?.trim()
+		if (!content) continue
+		let data
+		try {
+			data = JSON.parse(content)
+		} catch {
+			// Malformed JSON-LD is ignored on purpose: one broken script must not abort SEO extraction
+			continue
+		}
+		// JSON-LD payloads are objects (or arrays of objects); null and primitives carry no data
+		if (typeof data !== 'object' || data === null) continue
+		const rawType: unknown = data['@type']
+		const typeCandidates: unknown[] = Array.isArray(rawType) ? rawType : [rawType]
+		const type = typeCandidates.find((candidate): candidate is string => typeof candidate === 'string' && candidate !== '') ?? 'Unknown'
+		const sourceInfo = findElementSourceLocation(script, html, sourcePath)
+		entries.push({
+			type,
+			data,
+			...sourceInfo,
+		})
+	}
+	return entries
+}
+/**
+ * Build source tracking info for an element: its serialized markup plus the
+ * 1-indexed line of `html` on which that markup starts.
+ *
+ * The line is found by searching for the first (up to) 50 characters of the
+ * element's first serialized line; line 1 is reported when nothing matches.
+ */
+function findElementSourceLocation(
+	element: HTMLNode,
+	html: string,
+	sourcePath?: string,
+): { sourcePath: string; sourceLine: number; sourceSnippet: string } {
+	const sourceSnippet = element.toString()
+	// Only the first line of the snippet is used for matching
+	const firstSnippetLine = sourceSnippet.split('\n')[0] || sourceSnippet
+	const needle = firstSnippetLine.substring(0, Math.min(50, firstSnippetLine.length))
+	const matchIndex = html.split('\n').findIndex(line => line.includes(needle))
+	return {
+		sourcePath: sourcePath || '',
+		sourceLine: matchIndex === -1 ? 1 : matchIndex + 1,
+		sourceSnippet,
+	}
+}

package/src/source-finder/index.ts CHANGED Viewed

@@ -23,4 +23,8 @@ export { findImageSourceLocation } from './image-finder'
 export { findCollectionSource, findMarkdownSourceLocation, parseMarkdownContent } from './collection-finder'
 // Snippet utilities (used by html-processor)
-export { enhanceManifestWithSourceSnippets, extractCompleteTagSnippet, extractInnerHtmlFromSnippet, extractSourceInnerHtml } from './snippet-utils'
+export { enhanceManifestWithSourceSnippets, extractCompleteTagSnippet, extractInnerHtmlFromSnippet, extractSourceSnippet } from './snippet-utils'
+// SEO source finding
+export { findSeoSource } from './seo-finder'
+export type { SeoElementIdentifier, SeoSourceLocation } from './seo-finder'

package/src/source-finder/seo-finder.ts ADDED Viewed

@@ -0,0 +1,336 @@
+import fs from 'node:fs/promises'
+import path from 'node:path'
+import { getProjectRoot } from '../config'
+/**
+ * Describes which SEO element `findSeoSource` should look for.
+ * Each field is only consulted for the element type it applies to.
+ */
+export interface SeoElementIdentifier {
+	/** `name` attribute a meta tag must carry (compared exactly) */
+	name?: string
+	/** `property` attribute a meta tag must carry, e.g. an Open Graph key (compared exactly) */
+	property?: string
+	/** Expected title text or meta `content` value; only a leading portion of it is compared */
+	content?: string
+	/** Expected canonical `href`; only a leading portion of it is compared */
+	href?: string
+	/** Text that must appear in the JSON-LD script alongside `"@type"` */
+	jsonLdType?: string
+}
+/**
+ * Location of a matched SEO element inside the project sources
+ */
+export interface SeoSourceLocation {
+	/** Path of the matched file, relative to the project root */
+	sourcePath: string
+	/** 1-indexed line of the file on which the match starts */
+	sourceLine: number
+	/** Source text of the matched element as extracted from the file */
+	sourceSnippet: string
+}
+/**
+ * Locate the source of an SEO element within the project.
+ * Looks through `src/pages`, `src/layouts` and `src/components` (in that order)
+ * and resolves to the first match, or undefined when nothing matches.
+ */
+export async function findSeoSource(
+	type: 'title' | 'meta' | 'canonical' | 'jsonld',
+	identifier: SeoElementIdentifier,
+): Promise<SeoSourceLocation | undefined> {
+	const sourceRoot = path.join(getProjectRoot(), 'src')
+	for (const folder of ['pages', 'layouts', 'components']) {
+		try {
+			const match = await searchDirectoryForSeo(path.join(sourceRoot, folder), type, identifier)
+			if (match) return match
+		} catch {
+			// Directory doesn't exist, continue
+		}
+	}
+	return undefined
+}
+/**
+ * Walk `dir` depth-first, in directory-listing order, and return the first
+ * SEO match found in an `.astro` or `.html` file. Unreadable directories
+ * yield undefined.
+ */
+async function searchDirectoryForSeo(
+	dir: string,
+	type: 'title' | 'meta' | 'canonical' | 'jsonld',
+	identifier: SeoElementIdentifier,
+): Promise<SeoSourceLocation | undefined> {
+	try {
+		for (const entry of await fs.readdir(dir, { withFileTypes: true })) {
+			const entryPath = path.join(dir, entry.name)
+			let match: SeoSourceLocation | undefined
+			if (entry.isDirectory()) {
+				match = await searchDirectoryForSeo(entryPath, type, identifier)
+			} else if (entry.isFile() && ['.astro', '.html'].some(extension => entry.name.endsWith(extension))) {
+				match = await searchFileForSeo(entryPath, type, identifier)
+			}
+			if (match) return match
+		}
+	} catch {
+		// Error reading directory
+	}
+	return undefined
+}
+/**
+ * Read one file and hand its lines to the finder for the requested element
+ * type. Returns undefined when the file cannot be read or nothing matches.
+ */
+async function searchFileForSeo(
+	filePath: string,
+	type: 'title' | 'meta' | 'canonical' | 'jsonld',
+	identifier: SeoElementIdentifier,
+): Promise<SeoSourceLocation | undefined> {
+	try {
+		const fileText = await fs.readFile(filePath, 'utf-8')
+		const lines = fileText.split('\n')
+		if (type === 'title') return findTitleInLines(lines, filePath, identifier.content)
+		if (type === 'meta') return findMetaInLines(lines, filePath, identifier)
+		if (type === 'canonical') return findCanonicalInLines(lines, filePath, identifier.href)
+		if (type === 'jsonld') return findJsonLdInLines(lines, filePath, identifier.jsonLdType)
+		return undefined
+	} catch {
+		return undefined
+	}
+}
+/**
+ * Find the <title> element in source lines.
+ *
+ * When `content` is given, a single-line title whose text contains the first
+ * 20 characters of `content` is preferred, wherever it appears in the file.
+ * If no such literal match exists (multi-line or dynamic titles), the first
+ * line containing `<title` is used. Without `content`, the first line
+ * containing `<title` is returned.
+ *
+ * @param lines - File content split into lines
+ * @param filePath - Path of the file the lines came from
+ * @param content - Expected title text (optional)
+ * @returns The location of the title, or undefined when the file has none
+ */
+function findTitleInLines(
+	lines: string[],
+	filePath: string,
+	content?: string,
+): SeoSourceLocation | undefined {
+	const toLocation = (index: number): SeoSourceLocation => ({
+		sourcePath: path.relative(getProjectRoot(), filePath),
+		sourceLine: index + 1,
+		sourceSnippet: extractMultiLineElement(lines, index, 'title'),
+	})
+	const expectedText = content ? content.substring(0, 20) : undefined
+	// Remembered so an earlier non-matching title (e.g. one inside an inline SVG) does not shadow a literal match further down
+	let firstTitleIndex = -1
+	for (let i = 0; i < lines.length; i++) {
+		const line = lines[i] || ''
+		if (!line.includes('<title')) continue
+		if (!expectedText) return toLocation(i)
+		const inlineText = line.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]
+		if (inlineText?.includes(expectedText)) return toLocation(i)
+		if (firstTitleIndex === -1) firstTitleIndex = i
+	}
+	return firstTitleIndex === -1 ? undefined : toLocation(firstTitleIndex)
+}
+/**
+ * Find a <meta> element in source lines.
+ *
+ * Attributes are read from the complete tag, so tags whose attributes are
+ * spread over several lines are matched too. A tag matches when
+ * - its `name` equals `identifier.name` and, if `identifier.content` is set,
+ *   its `content` value contains the first 30 characters of it, or
+ * - its `property` equals `identifier.property` (content is not compared).
+ *
+ * @param lines - File content split into lines
+ * @param filePath - Path of the file the lines came from
+ * @param identifier - Attributes the tag has to carry
+ * @returns The location of the first matching tag, or undefined
+ */
+function findMetaInLines(
+	lines: string[],
+	filePath: string,
+	identifier: SeoElementIdentifier,
+): SeoSourceLocation | undefined {
+	const { name, property, content } = identifier
+	// Read a quoted attribute value; the closing quote must be the one that opened it,
+	// so values such as content="Bob's site" are not cut at the apostrophe
+	const readAttribute = (tagSource: string, attribute: string): string | undefined =>
+		tagSource.match(new RegExp(`(?:^|\\s)${attribute}\\s*=\\s*(["'])(.*?)\\1`, 'i'))?.[2]
+	for (let i = 0; i < lines.length; i++) {
+		const line = lines[i] || ''
+		if (!line.includes('<meta')) continue
+		const snippet = extractMultiLineElement(lines, i, 'meta')
+		let matched = false
+		// Check for name attribute match, verifying content if specified
+		if (name && readAttribute(snippet, 'name') === name) {
+			matched = !content || Boolean(readAttribute(snippet, 'content')?.includes(content.substring(0, 30)))
+		}
+		// Check for property attribute match (OG/Twitter)
+		if (!matched && property && readAttribute(snippet, 'property') === property) {
+			matched = true
+		}
+		if (matched) {
+			return {
+				sourcePath: path.relative(getProjectRoot(), filePath),
+				sourceLine: i + 1,
+				sourceSnippet: snippet,
+			}
+		}
+	}
+	return undefined
+}
+/**
+ * Find the canonical <link> in source lines.
+ * A line qualifies when it contains `rel="canonical"` (either quote style)
+ * and, if `href` is given, the first 30 characters of that href.
+ */
+function findCanonicalInLines(
+	lines: string[],
+	filePath: string,
+	href?: string,
+): SeoSourceLocation | undefined {
+	const matchIndex = lines.findIndex(rawLine => {
+		const line = rawLine || ''
+		const declaresCanonical = line.includes('rel="canonical"') || line.includes("rel='canonical'")
+		return declaresCanonical && (!href || line.includes(href.substring(0, 30)))
+	})
+	if (matchIndex === -1) return undefined
+	return {
+		sourcePath: path.relative(getProjectRoot(), filePath),
+		sourceLine: matchIndex + 1,
+		sourceSnippet: extractMultiLineElement(lines, matchIndex, 'link'),
+	}
+}
+/**
+ * Find a JSON-LD <script> in source lines.
+ * Candidates are lines mentioning `application/ld+json`; when `jsonLdType` is
+ * given, the extracted script must also contain `"@type"` and that type text.
+ */
+function findJsonLdInLines(
+	lines: string[],
+	filePath: string,
+	jsonLdType?: string,
+): SeoSourceLocation | undefined {
+	for (let index = 0; index < lines.length; index++) {
+		const line = lines[index] || ''
+		if (!line.includes('application/ld+json')) continue
+		const snippet = extractMultiLineElement(lines, index, 'script')
+		// Without a requested type, the first JSON-LD script wins
+		const typeMatches = !jsonLdType || (snippet.includes(`"@type"`) && snippet.includes(jsonLdType))
+		if (!typeMatches) continue
+		return {
+			sourcePath: path.relative(getProjectRoot(), filePath),
+			sourceLine: index + 1,
+			sourceSnippet: snippet,
+		}
+	}
+	return undefined
+}
+/**
+ * Extract a potentially multi-line element from source lines.
+ *
+ * Lines are collected from `startLine` until the element is closed: for void
+ * tags (meta, link, img, br, hr, input) the first line containing `>`; for
+ * other tags the line on which opening and closing tags balance out. Blank
+ * lines inside the element are kept so the snippet mirrors the source text.
+ *
+ * @param lines - File content split into lines
+ * @param startLine - 0-indexed line on which the element starts
+ * @param tag - Tag name of the element
+ * @param maxLines - Maximum number of lines to scan (default: 30)
+ * @returns The element's source lines joined with newlines; only the first
+ *          line when no closing was found within `maxLines`
+ */
+function extractMultiLineElement(lines: string[], startLine: number, tag: string, maxLines = 30): string {
+	const snippetLines: string[] = []
+	let depth = 0
+	let foundClosing = false
+	// For self-closing tags like <meta /> and <link />
+	const isSelfClosing = ['meta', 'link', 'img', 'br', 'hr', 'input'].includes(tag.toLowerCase())
+	// Built once instead of per line; String.prototype.match restarts global patterns on every call
+	const openTagPattern = new RegExp(`<${tag}(?:[\\s>]|$)`, 'gi')
+	const selfClosePattern = new RegExp(`<${tag}[^>]*/>`, 'gi')
+	const closeTagPattern = new RegExp(`</${tag}>`, 'gi')
+	const countMatches = (text: string, pattern: RegExp): number => (text.match(pattern) || []).length
+	const endLine = Math.min(startLine + maxLines, lines.length)
+	for (let i = startLine; i < endLine; i++) {
+		const line = lines[i]
+		// Skip missing entries only; empty strings are real (blank) source lines
+		if (line === undefined) continue
+		snippetLines.push(line)
+		// For self-closing tags, check if line ends the tag
+		if (isSelfClosing) {
+			if (line.includes('/>') || (line.includes('>') && !line.includes('</' + tag))) {
+				foundClosing = true
+				break
+			}
+			continue
+		}
+		// Count opening and closing tags
+		const openTags = countMatches(line, openTagPattern)
+		const selfClose = countMatches(line, selfClosePattern)
+		const closeTags = countMatches(line, closeTagPattern)
+		depth += openTags - selfClose - closeTags
+		if (depth <= 0 && (closeTags > 0 || selfClose > 0)) {
+			foundClosing = true
+			break
+		}
+	}
+	// An unterminated element would otherwise drag in unrelated lines, so fall back to its first line
+	if (!foundClosing && snippetLines.length > 1) {
+		return snippetLines[0] || ''
+	}
+	return snippetLines.join('\n')
+}

package/src/source-finder/snippet-utils.ts CHANGED Viewed

@@ -173,14 +173,14 @@ export function extractImageSnippet(lines: string[], startLine: number): string
 }
 /**
- * Read source file and extract the innerHTML at the specified line.
+ * Read source file and extract the complete element at the specified line.
  *
  * @param sourceFile - Path to source file (relative to cwd)
  * @param sourceLine - 1-indexed line number
  * @param tag - The tag name
- * @returns The innerHTML from source, or undefined if can't extract
+ * @returns The complete element from source, or undefined if can't extract
  */
-export async function extractSourceInnerHtml(
+export async function extractSourceSnippet(
 	sourceFile: string,
 	sourceLine: number,
 	tag: string,
@@ -193,11 +193,8 @@ export async function extractSourceInnerHtml(
 		const content = await fs.readFile(filePath, 'utf-8')
 		const lines = content.split('\n')
-		// Extract the complete tag snippet
-		const snippet = extractCompleteTagSnippet(lines, sourceLine - 1, tag)
-		// Extract innerHTML from the snippet
-		return extractInnerHtmlFromSnippet(snippet, tag)
+		// Extract the complete tag snippet (including wrapper element)
+		return extractCompleteTagSnippet(lines, sourceLine - 1, tag)
 	} catch {
 		return undefined
 	}
@@ -243,8 +240,8 @@ export async function enhanceManifestWithSourceSnippets(
 			return [id, entry] as const
 		}
-		// Extract the actual source innerHTML
-		const sourceSnippet = await extractSourceInnerHtml(
+		// Extract the complete source element
+		const sourceSnippet = await extractSourceSnippet(
 			entry.sourcePath,
 			entry.sourceLine,
 			entry.tag,