@ontosdk/next 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.mts ADDED
@@ -0,0 +1 @@
1
+ #!/usr/bin/env node
package/dist/cli.d.ts ADDED
@@ -0,0 +1 @@
1
+ #!/usr/bin/env node
package/dist/cli.js ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env node
2
+ "use strict";var j=Object.create;var $=Object.defineProperty;var v=Object.getOwnPropertyDescriptor;var z=Object.getOwnPropertyNames;var P=Object.getPrototypeOf,R=Object.prototype.hasOwnProperty;var N=(t,n,o,e)=>{if(n&&typeof n=="object"||typeof n=="function")for(let r of z(n))!R.call(t,r)&&r!==o&&$(t,r,{get:()=>n[r],enumerable:!(e=v(n,r))||e.enumerable});return t};var h=(t,n,o)=>(o=t!=null?j(P(t)):{},N(n||!t||!t.__esModule?$(o,"default",{value:t,enumerable:!0}):o,t));var b=require("glob"),c=h(require("fs")),p=h(require("path")),i=h(require("picocolors"));var k=h(require("cheerio")),O=h(require("turndown")),E=new O.default({headingStyle:"atx",codeBlockStyle:"fenced"});function F(t,n="Generated Output"){let o=t.length,e=k.load(t),r=e("title").text()||e("h1").first().text()||"Untitled Page",f=e('meta[name="description"]').attr("content")||"No description found.",l=[];e('script[type="application/ld+json"]').each((g,w)=>{try{let x=e(w).html()||"",m=JSON.parse(x);l.push(m)}catch{}}),e("script, style, noscript, iframe, svg, nav, footer, meta, link, header").remove();let s="";e("main").length>0?s=e("main").html()||"":e("article").length>0?s=e("article").html()||"":s=e("body").html()||"";let S=E.turndown(s),a=[`# ${r}`,`> ${f}`,"",`**Source:** ${n}`,`**Extracted:** ${new Date().toISOString()}`,"","---",""].join(`
3
+ `)+S;l.length>0&&(a+=`
4
+
5
+ ---
6
+ ## Structured Data (JSON-LD)
7
+ \`\`\`json
8
+ `,l.forEach(g=>{a+=JSON.stringify(g,null,2)+`
9
+ `}),a+="```\n");let u=a.length,d=o>0?(o-u)/o*100:0;return{markdown:a,metadata:{title:r,description:f,jsonLd:l},stats:{originalHtmlSize:o,markdownSize:u,tokenReductionRatio:d}}}async function L(){console.log(i.default.cyan(`
10
+ [Onto] Starting Semantic Output Generation...`));let t=process.cwd(),n=p.default.join(t,".next/server/app"),o=p.default.join(t,"public/.onto");if(!c.default.existsSync(n)){console.log(i.default.yellow(`[Onto] Could not find Next.js app output at ${n}`)),console.log(i.default.yellow('[Onto] Ensure this is run after "next build" and you are using the App Router.'));return}let e=await(0,b.glob)("**/*.html",{cwd:n});if(e.length===0){console.log(i.default.yellow("[Onto] No static HTML files found to process."));return}c.default.existsSync(o)||c.default.mkdirSync(o,{recursive:!0});let r=0,f=0,l=0;for(let s of e){let S=p.default.join(n,s),y=s.replace(/\.html$/,".md"),a=p.default.join(o,y);try{let u=c.default.readFileSync(S,"utf8"),d=F(u,`/${y.replace(/\.md$/,"")}`),g=p.default.dirname(a);c.default.existsSync(g)||c.default.mkdirSync(g,{recursive:!0}),c.default.writeFileSync(a,d.markdown,"utf8"),r+=d.stats.originalHtmlSize,f+=d.stats.markdownSize,l++;let w=(d.stats.originalHtmlSize/1024).toFixed(1),x=(d.stats.markdownSize/1024).toFixed(1),m=s.replace(/\.html$/,"");m==="index"?m="/":m=`/${m}`,console.log(i.default.green("\u2713 Optimized")+i.default.dim(` ${m} `)+i.default.blue(`[${w}KB -> ${x}KB]`))}catch(u){console.error(i.default.red(`\u2717 Failed to process ${s}: ${u.message}`))}}console.log(i.default.cyan(`
11
+ [Onto] Finished generation.`)),console.log(i.default.bold(i.default.magenta(`Processed ${l} pages. Total Size: ${(r/1024).toFixed(1)}KB -> ${(f/1024).toFixed(1)}KB`))),console.log(i.default.dim(`Edge payloads are ready at /public/.onto/*
12
+ `))}L().catch(t=>{console.error(i.default.red(`[Onto] Fatal Error: ${t.message}`)),process.exit(1)});
13
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/cli.ts","../src/extractor.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { glob } from 'glob';\r\nimport fs from 'fs';\r\nimport path from 'path';\r\nimport pc from 'picocolors';\r\nimport { extractContent } from './extractor';\r\n\r\nasync function main() {\r\n console.log(pc.cyan('\\n[Onto] Starting Semantic Output Generation...'));\r\n\r\n const cwd = process.cwd();\r\n const nextAppDirDir = path.join(cwd, '.next/server/app');\r\n const ontoPublicDir = path.join(cwd, 'public/.onto');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));\r\n console.log(pc.yellow(`[Onto] Ensure this is run after \"next build\" and you are using the App Router.`));\r\n return;\r\n }\r\n\r\n // Find all HTML files rendered by Next.js in the app directory\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n\r\n if (files.length === 0) {\r\n console.log(pc.yellow(`[Onto] No static HTML files found to process.`));\r\n return;\r\n }\r\n\r\n // Ensure output directory exists\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalOriginalSize = 0;\r\n let totalMarkdownSize = 0;\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n\r\n // We map file path e.g. 
\"pricing.html\" to \"pricing.md\", or \"blog/post.html\" to \"blog/post.md\"\r\n let outputPathRelative = file.replace(/\\.html$/, '.md');\r\n // If it's a dynamic route page, or purely root index.html\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n const result = extractContent(htmlContent, `/${outputPathRelative.replace(/\\.md$/, '')}`);\r\n\r\n // Ensure specific sub-directory exists (e.g., for blog/post.md)\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n\r\n totalOriginalSize += result.stats.originalHtmlSize;\r\n totalMarkdownSize += result.stats.markdownSize;\r\n totalFilesProcessed++;\r\n\r\n const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);\r\n const mdKb = (result.stats.markdownSize / 1024).toFixed(1);\r\n\r\n // /index.html -> /\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n console.log(\r\n pc.green(`✓ Optimized`) +\r\n pc.dim(` ${routeName} `) +\r\n pc.blue(`[${origKb}KB -> ${mdKb}KB]`)\r\n );\r\n } catch (e: any) {\r\n console.error(pc.red(`✗ Failed to process ${file}: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(pc.cyan(`\\n[Onto] Finished generation.`));\r\n console.log(\r\n pc.bold(\r\n pc.magenta(`Processed ${totalFilesProcessed} pages. 
Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)\r\n )\r\n );\r\n console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\\n`));\r\n}\r\n\r\nmain().catch(e => {\r\n console.error(pc.red(`[Onto] Fatal Error: ${e.message}`));\r\n process.exit(1);\r\n});\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. 
Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? 
((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown 
endpoints.`);\r\n}\r\n"],"mappings":";wdACA,IAAAA,EAAqB,gBACrBC,EAAe,iBACfC,EAAiB,mBACjBC,EAAe,yBCJf,IAAAC,EAAyB,sBACzBC,EAA4B,uBAEtBC,EAAkB,IAAI,EAAAC,QAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASC,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWf,EAAgB,SAASc,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ,CDtGA,eAAeC,GAAO,CAClB,QAAQ,IAAI,EAAAC,QAAG,KAAK;AAAA,8CAAiD,CAAC,EAEtE,IAAMC,EAAM,QAAQ,IAAI,EAClBC,EAAgB,EAAAC,QAAK,KAAKF,EAAK,kBAAkB,EACjDG,EAAgB,EAAAD,QAAK,KAAKF,EAAK,cAAc,EAEnD,GAAI,CAAC,EAAAI,QAAG,WAAWH,CAAa,EAAG,CAC/B,QAAQ,IAAI,EAAAF,QAAG,OAAO,+CAA+CE,CAAa,EAAE,CAAC,EACrF,QAAQ,IAAI,EAAAF,QAAG,OAAO,gFAAgF,CAAC,EACvG,MACJ,CAGA,IAAMM,EAAQ,QAAM,QAAK,YAAa,CAAE,IAAKJ,CAAc,CAAC,EAE5D,GAAII,EAAM,SAAW,EAAG,CACpB,QAAQ,IAAI,EAAAN,QAAG,OAAO,+CAA+C,CAAC,EACtE,MACJ,CAGK,EAAAK,QAAG,WAAWD,CAAa,GAC5B,EAAAC,QAAG,UAAUD,EAAe,CAAE,UAAW,EAAK,CAAC,EAGnD,IAAIG,EAAoB,EACpBC,EAAoB,EACpBC,EAAsB,EAE1B,QAAWC,KAAQJ,EAAO,CACtB,IAAMK,EAAY,EAAAR,QAAK,KAAKD,EAAeQ,CAAI,EAG3CE
,EAAqBF,EAAK,QAAQ,UAAW,KAAK,EAEhDG,EAAa,EAAAV,QAAK,KAAKC,EAAeQ,CAAkB,EAE9D,GAAI,CACA,IAAME,EAAc,EAAAT,QAAG,aAAaM,EAAW,MAAM,EAE/CI,EAASC,EAAeF,EAAa,IAAIF,EAAmB,QAAQ,QAAS,EAAE,CAAC,EAAE,EAGlFK,EAAY,EAAAd,QAAK,QAAQU,CAAU,EACpC,EAAAR,QAAG,WAAWY,CAAS,GACxB,EAAAZ,QAAG,UAAUY,EAAW,CAAE,UAAW,EAAK,CAAC,EAG/C,EAAAZ,QAAG,cAAcQ,EAAYE,EAAO,SAAU,MAAM,EAEpDR,GAAqBQ,EAAO,MAAM,iBAClCP,GAAqBO,EAAO,MAAM,aAClCN,IAEA,IAAMS,GAAUH,EAAO,MAAM,iBAAmB,MAAM,QAAQ,CAAC,EACzDI,GAAQJ,EAAO,MAAM,aAAe,MAAM,QAAQ,CAAC,EAGrDK,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCU,IAAc,QAASA,EAAY,IAClCA,EAAY,IAAIA,CAAS,GAE9B,QAAQ,IACJ,EAAApB,QAAG,MAAM,kBAAa,EACtB,EAAAA,QAAG,IAAI,IAAIoB,CAAS,GAAG,EACvB,EAAApB,QAAG,KAAK,IAAIkB,CAAM,SAASC,CAAI,KAAK,CACxC,CACJ,OAASE,EAAQ,CACb,QAAQ,MAAM,EAAArB,QAAG,IAAI,4BAAuBU,CAAI,KAAKW,EAAE,OAAO,EAAE,CAAC,CACrE,CACJ,CAEA,QAAQ,IAAI,EAAArB,QAAG,KAAK;AAAA,4BAA+B,CAAC,EACpD,QAAQ,IACJ,EAAAA,QAAG,KACC,EAAAA,QAAG,QAAQ,aAAaS,CAAmB,wBAAwBF,EAAoB,MAAM,QAAQ,CAAC,CAAC,UAAUC,EAAoB,MAAM,QAAQ,CAAC,CAAC,IAAI,CAC7J,CACJ,EACA,QAAQ,IAAI,EAAAR,QAAG,IAAI;AAAA,CAA8C,CAAC,CACtE,CAEAD,EAAK,EAAE,MAAMsB,GAAK,CACd,QAAQ,MAAM,EAAArB,QAAG,IAAI,uBAAuBqB,EAAE,OAAO,EAAE,CAAC,EACxD,QAAQ,KAAK,CAAC,CAClB,CAAC","names":["import_glob","import_fs","import_path","import_picocolors","cheerio","import_turndown","turndownService","TurndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio","main","pc","cwd","nextAppDirDir","path","ontoPublicDir","fs","files","totalOriginalSize","totalMarkdownSize","totalFilesProcessed","file","inputPath","outputPathRelative","outputPath","htmlContent","result","extractContent","outputDir","origKb","mdKb","routeName","e"]}
package/dist/cli.mjs ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env node
2
+ import{glob as F}from"glob";import l from"fs";import g from"path";import e from"picocolors";import*as x from"cheerio";import k from"turndown";var O=new k({headingStyle:"atx",codeBlockStyle:"fenced"});function $(r,d="Generated Output"){let o=r.length,t=x.load(r),u=t("title").text()||t("h1").first().text()||"Untitled Page",p=t('meta[name="description"]').attr("content")||"No description found.",s=[];t('script[type="application/ld+json"]').each((f,y)=>{try{let w=t(y).html()||"",c=JSON.parse(w);s.push(c)}catch{}}),t("script, style, noscript, iframe, svg, nav, footer, meta, link, header").remove();let n="";t("main").length>0?n=t("main").html()||"":t("article").length>0?n=t("article").html()||"":n=t("body").html()||"";let h=O.turndown(n),i=[`# ${u}`,`> ${p}`,"",`**Source:** ${d}`,`**Extracted:** ${new Date().toISOString()}`,"","---",""].join(`
3
+ `)+h;s.length>0&&(i+=`
4
+
5
+ ---
6
+ ## Structured Data (JSON-LD)
7
+ \`\`\`json
8
+ `,s.forEach(f=>{i+=JSON.stringify(f,null,2)+`
9
+ `}),i+="```\n");let m=i.length,a=o>0?(o-m)/o*100:0;return{markdown:i,metadata:{title:u,description:p,jsonLd:s},stats:{originalHtmlSize:o,markdownSize:m,tokenReductionRatio:a}}}async function b(){console.log(e.cyan(`
10
+ [Onto] Starting Semantic Output Generation...`));let r=process.cwd(),d=g.join(r,".next/server/app"),o=g.join(r,"public/.onto");if(!l.existsSync(d)){console.log(e.yellow(`[Onto] Could not find Next.js app output at ${d}`)),console.log(e.yellow('[Onto] Ensure this is run after "next build" and you are using the App Router.'));return}let t=await F("**/*.html",{cwd:d});if(t.length===0){console.log(e.yellow("[Onto] No static HTML files found to process."));return}l.existsSync(o)||l.mkdirSync(o,{recursive:!0});let u=0,p=0,s=0;for(let n of t){let h=g.join(d,n),S=n.replace(/\.html$/,".md"),i=g.join(o,S);try{let m=l.readFileSync(h,"utf8"),a=$(m,`/${S.replace(/\.md$/,"")}`),f=g.dirname(i);l.existsSync(f)||l.mkdirSync(f,{recursive:!0}),l.writeFileSync(i,a.markdown,"utf8"),u+=a.stats.originalHtmlSize,p+=a.stats.markdownSize,s++;let y=(a.stats.originalHtmlSize/1024).toFixed(1),w=(a.stats.markdownSize/1024).toFixed(1),c=n.replace(/\.html$/,"");c==="index"?c="/":c=`/${c}`,console.log(e.green("\u2713 Optimized")+e.dim(` ${c} `)+e.blue(`[${y}KB -> ${w}KB]`))}catch(m){console.error(e.red(`\u2717 Failed to process ${n}: ${m.message}`))}}console.log(e.cyan(`
11
+ [Onto] Finished generation.`)),console.log(e.bold(e.magenta(`Processed ${s} pages. Total Size: ${(u/1024).toFixed(1)}KB -> ${(p/1024).toFixed(1)}KB`))),console.log(e.dim(`Edge payloads are ready at /public/.onto/*
12
+ `))}b().catch(r=>{console.error(e.red(`[Onto] Fatal Error: ${r.message}`)),process.exit(1)});
13
+ //# sourceMappingURL=cli.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/cli.ts","../src/extractor.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { glob } from 'glob';\r\nimport fs from 'fs';\r\nimport path from 'path';\r\nimport pc from 'picocolors';\r\nimport { extractContent } from './extractor';\r\n\r\nasync function main() {\r\n console.log(pc.cyan('\\n[Onto] Starting Semantic Output Generation...'));\r\n\r\n const cwd = process.cwd();\r\n const nextAppDirDir = path.join(cwd, '.next/server/app');\r\n const ontoPublicDir = path.join(cwd, 'public/.onto');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));\r\n console.log(pc.yellow(`[Onto] Ensure this is run after \"next build\" and you are using the App Router.`));\r\n return;\r\n }\r\n\r\n // Find all HTML files rendered by Next.js in the app directory\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n\r\n if (files.length === 0) {\r\n console.log(pc.yellow(`[Onto] No static HTML files found to process.`));\r\n return;\r\n }\r\n\r\n // Ensure output directory exists\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalOriginalSize = 0;\r\n let totalMarkdownSize = 0;\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n\r\n // We map file path e.g. 
\"pricing.html\" to \"pricing.md\", or \"blog/post.html\" to \"blog/post.md\"\r\n let outputPathRelative = file.replace(/\\.html$/, '.md');\r\n // If it's a dynamic route page, or purely root index.html\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n const result = extractContent(htmlContent, `/${outputPathRelative.replace(/\\.md$/, '')}`);\r\n\r\n // Ensure specific sub-directory exists (e.g., for blog/post.md)\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n\r\n totalOriginalSize += result.stats.originalHtmlSize;\r\n totalMarkdownSize += result.stats.markdownSize;\r\n totalFilesProcessed++;\r\n\r\n const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);\r\n const mdKb = (result.stats.markdownSize / 1024).toFixed(1);\r\n\r\n // /index.html -> /\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n console.log(\r\n pc.green(`✓ Optimized`) +\r\n pc.dim(` ${routeName} `) +\r\n pc.blue(`[${origKb}KB -> ${mdKb}KB]`)\r\n );\r\n } catch (e: any) {\r\n console.error(pc.red(`✗ Failed to process ${file}: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(pc.cyan(`\\n[Onto] Finished generation.`));\r\n console.log(\r\n pc.bold(\r\n pc.magenta(`Processed ${totalFilesProcessed} pages. 
Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)\r\n )\r\n );\r\n console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\\n`));\r\n}\r\n\r\nmain().catch(e => {\r\n console.error(pc.red(`[Onto] Fatal Error: ${e.message}`));\r\n process.exit(1);\r\n});\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. 
Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? 
((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown 
endpoints.`);\r\n}\r\n"],"mappings":";AACA,OAAS,QAAAA,MAAY,OACrB,OAAOC,MAAQ,KACf,OAAOC,MAAU,OACjB,OAAOC,MAAQ,aCJf,UAAYC,MAAa,UACzB,OAAOC,MAAqB,WAE5B,IAAMC,EAAkB,IAAID,EAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASE,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWd,EAAgB,SAASa,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ,CDtGA,eAAeC,GAAO,CAClB,QAAQ,IAAIC,EAAG,KAAK;AAAA,8CAAiD,CAAC,EAEtE,IAAMC,EAAM,QAAQ,IAAI,EAClBC,EAAgBC,EAAK,KAAKF,EAAK,kBAAkB,EACjDG,EAAgBD,EAAK,KAAKF,EAAK,cAAc,EAEnD,GAAI,CAACI,EAAG,WAAWH,CAAa,EAAG,CAC/B,QAAQ,IAAIF,EAAG,OAAO,+CAA+CE,CAAa,EAAE,CAAC,EACrF,QAAQ,IAAIF,EAAG,OAAO,gFAAgF,CAAC,EACvG,MACJ,CAGA,IAAMM,EAAQ,MAAMC,EAAK,YAAa,CAAE,IAAKL,CAAc,CAAC,EAE5D,GAAII,EAAM,SAAW,EAAG,CACpB,QAAQ,IAAIN,EAAG,OAAO,+CAA+C,CAAC,EACtE,MACJ,CAGKK,EAAG,WAAWD,CAAa,GAC5BC,EAAG,UAAUD,EAAe,CAAE,UAAW,EAAK,CAAC,EAGnD,IAAII,EAAoB,EACpBC,EAAoB,EACpBC,EAAsB,EAE1B,QAAWC,KAAQL,EAAO,CACtB,IAAMM,EAAYT,EAAK,KAAKD,EAAeS,CAAI,EAG3CE,EAAqBF,EAAK,QAAQ,UAAW,KAAK,EAEhDG
,EAAaX,EAAK,KAAKC,EAAeS,CAAkB,EAE9D,GAAI,CACA,IAAME,EAAcV,EAAG,aAAaO,EAAW,MAAM,EAE/CI,EAASC,EAAeF,EAAa,IAAIF,EAAmB,QAAQ,QAAS,EAAE,CAAC,EAAE,EAGlFK,EAAYf,EAAK,QAAQW,CAAU,EACpCT,EAAG,WAAWa,CAAS,GACxBb,EAAG,UAAUa,EAAW,CAAE,UAAW,EAAK,CAAC,EAG/Cb,EAAG,cAAcS,EAAYE,EAAO,SAAU,MAAM,EAEpDR,GAAqBQ,EAAO,MAAM,iBAClCP,GAAqBO,EAAO,MAAM,aAClCN,IAEA,IAAMS,GAAUH,EAAO,MAAM,iBAAmB,MAAM,QAAQ,CAAC,EACzDI,GAAQJ,EAAO,MAAM,aAAe,MAAM,QAAQ,CAAC,EAGrDK,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCU,IAAc,QAASA,EAAY,IAClCA,EAAY,IAAIA,CAAS,GAE9B,QAAQ,IACJrB,EAAG,MAAM,kBAAa,EACtBA,EAAG,IAAI,IAAIqB,CAAS,GAAG,EACvBrB,EAAG,KAAK,IAAImB,CAAM,SAASC,CAAI,KAAK,CACxC,CACJ,OAASE,EAAQ,CACb,QAAQ,MAAMtB,EAAG,IAAI,4BAAuBW,CAAI,KAAKW,EAAE,OAAO,EAAE,CAAC,CACrE,CACJ,CAEA,QAAQ,IAAItB,EAAG,KAAK;AAAA,4BAA+B,CAAC,EACpD,QAAQ,IACJA,EAAG,KACCA,EAAG,QAAQ,aAAaU,CAAmB,wBAAwBF,EAAoB,MAAM,QAAQ,CAAC,CAAC,UAAUC,EAAoB,MAAM,QAAQ,CAAC,CAAC,IAAI,CAC7J,CACJ,EACA,QAAQ,IAAIT,EAAG,IAAI;AAAA,CAA8C,CAAC,CACtE,CAEAD,EAAK,EAAE,MAAMuB,GAAK,CACd,QAAQ,MAAMtB,EAAG,IAAI,uBAAuBsB,EAAE,OAAO,EAAE,CAAC,EACxD,QAAQ,KAAK,CAAC,CAClB,CAAC","names":["glob","fs","path","pc","cheerio","TurndownService","turndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio","main","pc","cwd","nextAppDirDir","path","ontoPublicDir","fs","files","glob","totalOriginalSize","totalMarkdownSize","totalFilesProcessed","file","inputPath","outputPathRelative","outputPath","htmlContent","result","extractContent","outputDir","origKb","mdKb","routeName","e"]}
@@ -0,0 +1,22 @@
1
+ interface ExtractionResult {
2
+ markdown: string;
3
+ metadata: {
4
+ title: string;
5
+ description: string;
6
+ jsonLd: any[];
7
+ };
8
+ stats: {
9
+ originalHtmlSize: number;
10
+ markdownSize: number;
11
+ tokenReductionRatio: number;
12
+ };
13
+ }
14
+ /**
15
+ * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.
16
+ * @param html The raw HTML string.
17
+ * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.
18
+ * @returns {ExtractionResult} The extracted payload.
19
+ */
20
+ declare function extractContent(html: string, sourceUrl?: string): ExtractionResult;
21
+
22
+ export { extractContent };
@@ -0,0 +1,22 @@
1
+ interface ExtractionResult {
2
+ markdown: string;
3
+ metadata: {
4
+ title: string;
5
+ description: string;
6
+ jsonLd: any[];
7
+ };
8
+ stats: {
9
+ originalHtmlSize: number;
10
+ markdownSize: number;
11
+ tokenReductionRatio: number;
12
+ };
13
+ }
14
+ /**
15
+ * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.
16
+ * @param html The raw HTML string.
17
+ * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.
18
+ * @returns {ExtractionResult} The extracted payload.
19
+ */
20
+ declare function extractContent(html: string, sourceUrl?: string): ExtractionResult;
21
+
22
+ export { extractContent };
package/dist/index.js ADDED
@@ -0,0 +1,9 @@
1
+ "use strict";var $=Object.create;var c=Object.defineProperty;var j=Object.getOwnPropertyDescriptor;var v=Object.getOwnPropertyNames;var O=Object.getPrototypeOf,R=Object.prototype.hasOwnProperty;var L=(t,n)=>{for(var o in n)c(t,o,{get:n[o],enumerable:!0})},u=(t,n,o,e)=>{if(n&&typeof n=="object"||typeof n=="function")for(let r of v(n))!R.call(t,r)&&r!==o&&c(t,r,{get:()=>n[r],enumerable:!(e=j(n,r))||e.enumerable});return t};var f=(t,n,o)=>(o=t!=null?$(O(t)):{},u(n||!t||!t.__esModule?c(o,"default",{value:t,enumerable:!0}):o,t)),z=t=>u(c({},"__esModule",{value:!0}),t);var E={};L(E,{extractContent:()=>g});module.exports=z(E);var h=f(require("cheerio")),p=f(require("turndown")),N=new p.default({headingStyle:"atx",codeBlockStyle:"fenced"});function g(t,n="Generated Output"){let o=t.length,e=h.load(t),r=e("title").text()||e("h1").first().text()||"Untitled Page",l=e('meta[name="description"]').attr("content")||"No description found.",a=[];e('script[type="application/ld+json"]').each((m,w)=>{try{let k=e(w).html()||"",x=JSON.parse(k);a.push(x)}catch{}}),e("script, style, noscript, iframe, svg, nav, footer, meta, link, header").remove();let s="";e("main").length>0?s=e("main").html()||"":e("article").length>0?s=e("article").html()||"":s=e("body").html()||"";let S=N.turndown(s),i=[`# ${r}`,`> ${l}`,"",`**Source:** ${n}`,`**Extracted:** ${new Date().toISOString()}`,"","---",""].join(`
2
+ `)+S;a.length>0&&(i+=`
3
+
4
+ ---
5
+ ## Structured Data (JSON-LD)
6
+ \`\`\`json
7
+ `,a.forEach(m=>{i+=JSON.stringify(m,null,2)+`
8
+ `}),i+="```\n");let d=i.length,y=o>0?(o-d)/o*100:0;return{markdown:i,metadata:{title:r,description:l,jsonLd:a},stats:{originalHtmlSize:o,markdownSize:d,tokenReductionRatio:y}}}0&&(module.exports={extractContent});
9
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts","../src/extractor.ts"],"sourcesContent":["// We cannot use Webpack plugins reliably in Next.js Turbopack due to WorkerError restrictions.\r\n// Users must instead run `npx onto-next` as a postbuild script.\r\nexport { extractContent } from './extractor';\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. 
Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? 
((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown 
endpoints.`);\r\n}\r\n"],"mappings":"0jBAAA,IAAAA,EAAA,GAAAC,EAAAD,EAAA,oBAAAE,IAAA,eAAAC,EAAAH,GCAA,IAAAI,EAAyB,sBACzBC,EAA4B,uBAEtBC,EAAkB,IAAI,EAAAC,QAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASC,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWf,EAAgB,SAASc,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ","names":["index_exports","__export","extractContent","__toCommonJS","cheerio","import_turndown","turndownService","TurndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio"]}
package/dist/index.mjs ADDED
@@ -0,0 +1,9 @@
1
+ import*as d from"cheerio";import S from"turndown";var y=new S({headingStyle:"atx",codeBlockStyle:"fenced"});function w(i,m="Generated Output"){let n=i.length,t=d.load(i),a=t("title").text()||t("h1").first().text()||"Untitled Page",s=t('meta[name="description"]').attr("content")||"No description found.",o=[];t('script[type="application/ld+json"]').each((l,h)=>{try{let p=t(h).html()||"",g=JSON.parse(p);o.push(g)}catch{}}),t("script, style, noscript, iframe, svg, nav, footer, meta, link, header").remove();let r="";t("main").length>0?r=t("main").html()||"":t("article").length>0?r=t("article").html()||"":r=t("body").html()||"";let u=y.turndown(r),e=[`# ${a}`,`> ${s}`,"",`**Source:** ${m}`,`**Extracted:** ${new Date().toISOString()}`,"","---",""].join(`
2
+ `)+u;o.length>0&&(e+=`
3
+
4
+ ---
5
+ ## Structured Data (JSON-LD)
6
+ \`\`\`json
7
+ `,o.forEach(l=>{e+=JSON.stringify(l,null,2)+`
8
+ `}),e+="```\n");let c=e.length,f=n>0?(n-c)/n*100:0;return{markdown:e,metadata:{title:a,description:s,jsonLd:o},stats:{originalHtmlSize:n,markdownSize:c,tokenReductionRatio:f}}}export{w as extractContent};
9
+ //# sourceMappingURL=index.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/extractor.ts"],"sourcesContent":["import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. 
Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? 
((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown 
endpoints.`);\r\n}\r\n"],"mappings":"AAAA,UAAYA,MAAa,UACzB,OAAOC,MAAqB,WAE5B,IAAMC,EAAkB,IAAID,EAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASE,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWd,EAAgB,SAASa,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ","names":["cheerio","TurndownService","turndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio"]}
@@ -0,0 +1,5 @@
1
+ import { NextRequest, NextResponse } from 'next/server';
2
+
3
+ declare function ontoMiddleware(request: NextRequest): NextResponse<unknown>;
4
+
5
+ export { ontoMiddleware };
@@ -0,0 +1,5 @@
1
+ import { NextRequest, NextResponse } from 'next/server';
2
+
3
+ declare function ontoMiddleware(request: NextRequest): NextResponse<unknown>;
4
+
5
+ export { ontoMiddleware };
@@ -0,0 +1,2 @@
1
+ "use strict";var c=Object.defineProperty;var d=Object.getOwnPropertyDescriptor;var l=Object.getOwnPropertyNames;var u=Object.prototype.hasOwnProperty;var h=(e,t)=>{for(var s in t)c(e,s,{get:t[s],enumerable:!0})},p=(e,t,s,r)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of l(t))!u.call(e,o)&&o!==s&&c(e,o,{get:()=>t[o],enumerable:!(r=d(t,o))||r.enumerable});return e};var x=e=>p(c({},"__esModule",{value:!0}),e);var B={};h(B,{ontoMiddleware:()=>f});module.exports=x(B);var i=require("next/server"),m=["GPTBot","ChatGPT-User","ClaudeBot","Claude-Web","anthropic-ai","PerplexityBot","OAI-SearchBot","GoogleExtended"];function f(e){let t=e.headers.get("user-agent")||"",s=e.headers.get("accept")||"",r=m.some(a=>t.includes(a)),o=s.includes("text/markdown");if(r||o){let a=e.nextUrl.clone();if(a.pathname.startsWith("/_next")||a.pathname.includes("."))return i.NextResponse.next();let n=a.pathname;return(n==="/"||n==="")&&(n="/index"),n.endsWith("/")&&n!=="/"&&(n=n.slice(0,-1)),a.pathname=`/.onto${n}.md`,i.NextResponse.rewrite(a)}return i.NextResponse.next()}0&&(module.exports={ontoMiddleware});
2
+ //# sourceMappingURL=middleware.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/middleware.ts"],"sourcesContent":["import { NextRequest, NextResponse } from 'next/server';\r\n\r\nconst AI_BOT_USER_AGENTS = [\r\n 'GPTBot',\r\n 'ChatGPT-User',\r\n 'ClaudeBot',\r\n 'Claude-Web',\r\n 'anthropic-ai',\r\n 'PerplexityBot',\r\n 'OAI-SearchBot',\r\n 'GoogleExtended',\r\n];\r\n\r\nexport function ontoMiddleware(request: NextRequest) {\r\n const userAgent = request.headers.get('user-agent') || '';\r\n const accept = request.headers.get('accept') || '';\r\n\r\n const isAiBot = AI_BOT_USER_AGENTS.some(bot => userAgent.includes(bot));\r\n const isMarkdownRequested = accept.includes('text/markdown');\r\n\r\n // If traffic is identified as an AI Bot, rewrite the URL\r\n if (isAiBot || isMarkdownRequested) {\r\n const url = request.nextUrl.clone();\r\n\r\n // Ignore internal next.js requests & static assets\r\n if (url.pathname.startsWith('/_next') || url.pathname.includes('.')) {\r\n return NextResponse.next();\r\n }\r\n\r\n // Determine the corresponding payload path\r\n let payloadPath = url.pathname;\r\n if (payloadPath === '/' || payloadPath === '') {\r\n payloadPath = '/index';\r\n }\r\n\r\n // Strip trailing slash if present\r\n if (payloadPath.endsWith('/') && payloadPath !== '/') {\r\n payloadPath = payloadPath.slice(0, -1);\r\n }\r\n\r\n url.pathname = `/.onto${payloadPath}.md`;\r\n\r\n // Rewrite implicitly serves the target URL transparently to the client.\r\n return NextResponse.rewrite(url);\r\n }\r\n\r\n return 
NextResponse.next();\r\n}\r\n"],"mappings":"yaAAA,IAAAA,EAAA,GAAAC,EAAAD,EAAA,oBAAAE,IAAA,eAAAC,EAAAH,GAAA,IAAAI,EAA0C,uBAEpCC,EAAqB,CACvB,SACA,eACA,YACA,aACA,eACA,gBACA,gBACA,gBACJ,EAEO,SAASH,EAAeI,EAAsB,CACjD,IAAMC,EAAYD,EAAQ,QAAQ,IAAI,YAAY,GAAK,GACjDE,EAASF,EAAQ,QAAQ,IAAI,QAAQ,GAAK,GAE1CG,EAAUJ,EAAmB,KAAKK,GAAOH,EAAU,SAASG,CAAG,CAAC,EAChEC,EAAsBH,EAAO,SAAS,eAAe,EAG3D,GAAIC,GAAWE,EAAqB,CAChC,IAAMC,EAAMN,EAAQ,QAAQ,MAAM,EAGlC,GAAIM,EAAI,SAAS,WAAW,QAAQ,GAAKA,EAAI,SAAS,SAAS,GAAG,EAC9D,OAAO,eAAa,KAAK,EAI7B,IAAIC,EAAcD,EAAI,SACtB,OAAIC,IAAgB,KAAOA,IAAgB,MACvCA,EAAc,UAIdA,EAAY,SAAS,GAAG,GAAKA,IAAgB,MAC7CA,EAAcA,EAAY,MAAM,EAAG,EAAE,GAGzCD,EAAI,SAAW,SAASC,CAAW,MAG5B,eAAa,QAAQD,CAAG,CACnC,CAEA,OAAO,eAAa,KAAK,CAC7B","names":["middleware_exports","__export","ontoMiddleware","__toCommonJS","import_server","AI_BOT_USER_AGENTS","request","userAgent","accept","isAiBot","bot","isMarkdownRequested","url","payloadPath"]}
@@ -0,0 +1,2 @@
1
+ import{NextResponse as o}from"next/server";var c=["GPTBot","ChatGPT-User","ClaudeBot","Claude-Web","anthropic-ai","PerplexityBot","OAI-SearchBot","GoogleExtended"];function u(n){let a=n.headers.get("user-agent")||"",s=n.headers.get("accept")||"",r=c.some(t=>a.includes(t)),i=s.includes("text/markdown");if(r||i){let t=n.nextUrl.clone();if(t.pathname.startsWith("/_next")||t.pathname.includes("."))return o.next();let e=t.pathname;return(e==="/"||e==="")&&(e="/index"),e.endsWith("/")&&e!=="/"&&(e=e.slice(0,-1)),t.pathname=`/.onto${e}.md`,o.rewrite(t)}return o.next()}export{u as ontoMiddleware};
2
+ //# sourceMappingURL=middleware.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/middleware.ts"],"sourcesContent":["import { NextRequest, NextResponse } from 'next/server';\r\n\r\nconst AI_BOT_USER_AGENTS = [\r\n 'GPTBot',\r\n 'ChatGPT-User',\r\n 'ClaudeBot',\r\n 'Claude-Web',\r\n 'anthropic-ai',\r\n 'PerplexityBot',\r\n 'OAI-SearchBot',\r\n 'GoogleExtended',\r\n];\r\n\r\nexport function ontoMiddleware(request: NextRequest) {\r\n const userAgent = request.headers.get('user-agent') || '';\r\n const accept = request.headers.get('accept') || '';\r\n\r\n const isAiBot = AI_BOT_USER_AGENTS.some(bot => userAgent.includes(bot));\r\n const isMarkdownRequested = accept.includes('text/markdown');\r\n\r\n // If traffic is identified as an AI Bot, rewrite the URL\r\n if (isAiBot || isMarkdownRequested) {\r\n const url = request.nextUrl.clone();\r\n\r\n // Ignore internal next.js requests & static assets\r\n if (url.pathname.startsWith('/_next') || url.pathname.includes('.')) {\r\n return NextResponse.next();\r\n }\r\n\r\n // Determine the corresponding payload path\r\n let payloadPath = url.pathname;\r\n if (payloadPath === '/' || payloadPath === '') {\r\n payloadPath = '/index';\r\n }\r\n\r\n // Strip trailing slash if present\r\n if (payloadPath.endsWith('/') && payloadPath !== '/') {\r\n payloadPath = payloadPath.slice(0, -1);\r\n }\r\n\r\n url.pathname = `/.onto${payloadPath}.md`;\r\n\r\n // Rewrite implicitly serves the target URL transparently to the client.\r\n return NextResponse.rewrite(url);\r\n }\r\n\r\n return 
NextResponse.next();\r\n}\r\n"],"mappings":"AAAA,OAAsB,gBAAAA,MAAoB,cAE1C,IAAMC,EAAqB,CACvB,SACA,eACA,YACA,aACA,eACA,gBACA,gBACA,gBACJ,EAEO,SAASC,EAAeC,EAAsB,CACjD,IAAMC,EAAYD,EAAQ,QAAQ,IAAI,YAAY,GAAK,GACjDE,EAASF,EAAQ,QAAQ,IAAI,QAAQ,GAAK,GAE1CG,EAAUL,EAAmB,KAAKM,GAAOH,EAAU,SAASG,CAAG,CAAC,EAChEC,EAAsBH,EAAO,SAAS,eAAe,EAG3D,GAAIC,GAAWE,EAAqB,CAChC,IAAMC,EAAMN,EAAQ,QAAQ,MAAM,EAGlC,GAAIM,EAAI,SAAS,WAAW,QAAQ,GAAKA,EAAI,SAAS,SAAS,GAAG,EAC9D,OAAOT,EAAa,KAAK,EAI7B,IAAIU,EAAcD,EAAI,SACtB,OAAIC,IAAgB,KAAOA,IAAgB,MACvCA,EAAc,UAIdA,EAAY,SAAS,GAAG,GAAKA,IAAgB,MAC7CA,EAAcA,EAAY,MAAM,EAAG,EAAE,GAGzCD,EAAI,SAAW,SAASC,CAAW,MAG5BV,EAAa,QAAQS,CAAG,CACnC,CAEA,OAAOT,EAAa,KAAK,CAC7B","names":["NextResponse","AI_BOT_USER_AGENTS","ontoMiddleware","request","userAgent","accept","isAiBot","bot","isMarkdownRequested","url","payloadPath"]}
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "@ontosdk/next",
3
+ "version": "1.0.0",
4
+ "description": "Extracts semantic Markdown from React/Next.js pages for AI Agents",
5
+ "main": "dist/index.js",
6
+ "module": "dist/index.mjs",
7
+ "types": "dist/index.d.ts",
8
+ "publishConfig": {
9
+ "access": "public"
10
+ },
11
+ "exports": {
12
+ ".": {
13
+ "require": "./dist/index.js",
14
+ "import": "./dist/index.mjs",
15
+ "types": "./dist/index.d.ts"
16
+ },
17
+ "./middleware": {
18
+ "require": "./dist/middleware.js",
19
+ "import": "./dist/middleware.mjs",
20
+ "types": "./dist/middleware.d.ts"
21
+ }
22
+ },
23
+ "bin": {
24
+ "onto-next": "dist/cli.js"
25
+ },
26
+ "scripts": {
27
+ "build": "tsup",
28
+ "dev": "tsup --watch",
29
+ "test": "echo \"Error: no test specified\" && exit 1"
30
+ },
31
+ "keywords": [
32
+ "nextjs",
33
+ "ai",
34
+ "markdown",
35
+ "extraction",
36
+ "seo"
37
+ ],
38
+ "author": "Onto",
39
+ "license": "MIT",
40
+ "dependencies": {
41
+ "cheerio": "^1.0.0-rc.12",
42
+ "glob": "^10.3.10",
43
+ "picocolors": "^1.0.0",
44
+ "turndown": "^7.1.3"
45
+ },
46
+ "devDependencies": {
47
+ "@types/node": "^20.11.24",
48
+ "@types/turndown": "^5.0.4",
49
+ "next": "^16.1.6",
50
+ "tsup": "^8.0.2",
51
+ "typescript": "^5.3.3"
52
+ }
53
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env node
2
+ import { glob } from 'glob';
3
+ import fs from 'fs';
4
+ import path from 'path';
5
+ import pc from 'picocolors';
6
+ import { extractContent } from './extractor';
7
+
8
/**
 * CLI entry point: converts every `*.html` file found under
 * `<cwd>/.next/server/app` into a Markdown payload under `<cwd>/public/.onto`.
 *
 * Prints one size line per page plus a final summary. A page that fails to
 * convert is logged and skipped so that it does not abort the remaining pages.
 */
async function main(): Promise<void> {
    console.log(pc.cyan('\n[Onto] Starting Semantic Output Generation...'));

    const cwd = process.cwd();
    const nextAppDirDir = path.join(cwd, '.next/server/app');
    const ontoPublicDir = path.join(cwd, 'public/.onto');

    if (!fs.existsSync(nextAppDirDir)) {
        console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));
        console.log(pc.yellow(`[Onto] Ensure this is run after "next build" and you are using the App Router.`));
        return;
    }

    // Find all HTML files rendered by Next.js in the app directory.
    const files = await glob('**/*.html', { cwd: nextAppDirDir });

    if (files.length === 0) {
        console.log(pc.yellow(`[Onto] No static HTML files found to process.`));
        return;
    }

    // A recursive mkdir is a no-op when the directory already exists, so no existence check is needed.
    fs.mkdirSync(ontoPublicDir, { recursive: true });

    let totalOriginalSize = 0;
    let totalMarkdownSize = 0;
    let totalFilesProcessed = 0;

    for (const file of files) {
        // Route paths are URL-like: always "/"-separated, even when the OS separator is "\".
        const routePath = file.split(path.sep).join('/').replace(/\.html$/, '');
        // "index" is the root page; everything else maps 1:1, e.g. "blog/post" -> "/blog/post".
        const routeName = routePath === 'index' ? '/' : `/${routePath}`;

        const inputPath = path.join(nextAppDirDir, file);
        // "pricing.html" -> "pricing.md", "blog/post.html" -> "blog/post.md".
        const outputPath = path.join(ontoPublicDir, `${routePath}.md`);

        try {
            const htmlContent = fs.readFileSync(inputPath, 'utf8');

            // Record the public route ("/" for the root page) as the payload's source,
            // the same value generateStaticPayloads() in ./extractor passes.
            const result = extractContent(htmlContent, routeName);

            // Ensure the sub-directory exists (e.g. for blog/post.md).
            fs.mkdirSync(path.dirname(outputPath), { recursive: true });
            fs.writeFileSync(outputPath, result.markdown, 'utf8');

            totalOriginalSize += result.stats.originalHtmlSize;
            totalMarkdownSize += result.stats.markdownSize;
            totalFilesProcessed++;

            const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);
            const mdKb = (result.stats.markdownSize / 1024).toFixed(1);

            console.log(
                pc.green(`✓ Optimized`) +
                pc.dim(` ${routeName} `) +
                pc.blue(`[${origKb}KB -> ${mdKb}KB]`)
            );
        } catch (e: unknown) {
            // Anything can be thrown; only Error instances are guaranteed to carry a message.
            const message = e instanceof Error ? e.message : String(e);
            console.error(pc.red(`✗ Failed to process ${file}: ${message}`));
        }
    }

    console.log(pc.cyan(`\n[Onto] Finished generation.`));
    console.log(
        pc.bold(
            pc.magenta(`Processed ${totalFilesProcessed} pages. Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)
        )
    );
    console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\n`));
}

// Any error that escapes main() is fatal: report it and exit with a non-zero status.
main().catch((e: unknown) => {
    const message = e instanceof Error ? e.message : String(e);
    console.error(pc.red(`[Onto] Fatal Error: ${message}`));
    process.exit(1);
});
@@ -0,0 +1,156 @@
1
+ import * as cheerio from 'cheerio';
2
+ import TurndownService from 'turndown';
3
+
4
/** Options for the shared converter: `#`-style (ATX) headings and fenced code blocks. */
const markdownOptions = { headingStyle: 'atx', codeBlockStyle: 'fenced' } as const;

/** Module-wide HTML-to-Markdown converter, created once and reused for every page. */
const turndownService = new TurndownService(markdownOptions);
8
+
9
+ // Configure turndown to keep some layout or handle semantic tags differently if needed
10
+
11
/** Page-level metadata returned alongside the generated Markdown. */
interface ExtractionMetadata {
    /** Page title. */
    title: string;
    /** Page description. */
    description: string;
    /** Structured-data (JSON-LD) documents found on the page. */
    jsonLd: any[];
}

/** Size figures comparing the input HTML with the generated Markdown. */
interface ExtractionStats {
    /** Size of the input HTML. */
    originalHtmlSize: number;
    /** Size of the generated Markdown. */
    markdownSize: number;
    /** How much smaller the Markdown is than the HTML. */
    tokenReductionRatio: number;
}

/** The complete payload produced for a single HTML document. */
export interface ExtractionResult {
    /** The generated Markdown document. */
    markdown: string;
    metadata: ExtractionMetadata;
    stats: ExtractionStats;
}
24
+
25
/**
 * Extracts semantic Markdown and metadata from a rendered HTML document.
 *
 * Steps:
 *  1. Read the title, meta description and JSON-LD blocks.
 *  2. Remove non-content elements (scripts, styles, nav, header, footer, ...).
 *  3. Convert `<main>`, else `<article>`, else `<body>` to Markdown.
 *  4. Prepend a metadata header and append the JSON-LD documents.
 *
 * Note that `<header>` elements are removed everywhere, including inside
 * `<main>` / `<article>`.
 *
 * @param html The raw HTML string.
 * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.
 * @returns The Markdown, the extracted metadata, and size statistics. Sizes are
 *   string lengths (UTF-16 code units), not byte counts; `tokenReductionRatio`
 *   is the percentage by which the Markdown is shorter than the HTML.
 */
export function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {
    const originalSize = html.length;

    const $ = cheerio.load(html);

    // Header entries are single Markdown lines, so embedded newlines or
    // indentation inside a title/description must be collapsed.
    const singleLine = (text: string | undefined): string => (text ?? '').replace(/\s+/g, ' ').trim();

    // 1. Extract metadata BEFORE removing structure.
    // Inline SVGs may carry their own <title>; skip those and take only the
    // first remaining match, otherwise `.text()` concatenates every title on the page.
    const title =
        singleLine($('title').not('svg title').first().text()) ||
        singleLine($('h1').first().text()) ||
        'Untitled Page';
    const description = singleLine($('meta[name="description"]').attr('content')) || 'No description found.';

    const jsonLdScripts: unknown[] = [];
    $('script[type="application/ld+json"]').each((_, el) => {
        try {
            jsonLdScripts.push(JSON.parse($(el).html() || ''));
        } catch {
            // Deliberate best-effort: a malformed JSON-LD block is skipped, not fatal.
        }
    });

    // 2. Strip noise (React boilerplate, styles, unnecessary tags).
    $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();

    // 3. Find the entry point for content: the first of <main>, <article>, <body> that exists.
    let contentHtml = '';
    for (const selector of ['main', 'article', 'body']) {
        const candidate = $(selector);
        if (candidate.length > 0) {
            contentHtml = candidate.html() || '';
            break;
        }
    }

    // 4. Convert to Markdown.
    const markdown = turndownService.turndown(contentHtml);

    // 5. Prepend the metadata header. The trailing empty entry puts a newline after the rule.
    const header = [
        `# ${title}`,
        `> ${description}`,
        ``,
        `**Source:** ${sourceUrl}`,
        `**Extracted:** ${new Date().toISOString()}`,
        ``,
        `---`,
        ``
    ].join('\n');

    let finalMarkdown = header + markdown;

    // 6. Append JSON-LD. Each document gets its own fence: several root values
    // inside one ```json block would not be valid JSON.
    if (jsonLdScripts.length > 0) {
        const fences = jsonLdScripts.map(doc => '```json\n' + JSON.stringify(doc, null, 2) + '\n```\n');
        finalMarkdown += '\n\n---\n## Structured Data (JSON-LD)\n' + fences.join('\n');
    }

    const markdownSize = finalMarkdown.length;
    const tokenReductionRatio = originalSize > 0 ? ((originalSize - markdownSize) / originalSize) * 100 : 0;

    return {
        markdown: finalMarkdown,
        metadata: {
            title,
            description,
            jsonLd: jsonLdScripts
        },
        stats: {
            originalHtmlSize: originalSize,
            markdownSize,
            tokenReductionRatio
        }
    };
}
111
+
112
/**
 * Converts every `*.html` file under `nextAppDirDir` into a Markdown file at
 * the same relative path (with an `.md` extension) under `ontoPublicDir`.
 *
 * Returns without doing anything when the input directory is missing or
 * contains no HTML files. A file that fails to convert is logged and skipped.
 *
 * @param nextAppDirDir Directory holding the rendered HTML files.
 * @param ontoPublicDir Directory the Markdown payloads are written to; created if missing.
 */
export async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string): Promise<void> {
    // Loaded lazily so that importing this module does not itself load fs/path/glob.
    const [fs, path, { glob }] = await Promise.all([import('fs'), import('path'), import('glob')]);

    if (!fs.existsSync(nextAppDirDir)) {
        return;
    }

    const files = await glob('**/*.html', { cwd: nextAppDirDir });
    if (files.length === 0) return;

    // A recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(ontoPublicDir, { recursive: true });

    let totalFilesProcessed = 0;

    for (const file of files) {
        // Route paths are URL-like: always "/"-separated, even when the OS separator is "\".
        const routePath = file.split(path.sep).join('/').replace(/\.html$/, '');
        // "index" is the root page; everything else maps 1:1, e.g. "blog/post" -> "/blog/post".
        const routeName = routePath === 'index' ? '/' : `/${routePath}`;

        const inputPath = path.join(nextAppDirDir, file);
        const outputPath = path.join(ontoPublicDir, `${routePath}.md`);

        try {
            const htmlContent = fs.readFileSync(inputPath, 'utf8');
            const result = extractContent(htmlContent, routeName);

            // Ensure the sub-directory exists (e.g. for blog/post.md).
            fs.mkdirSync(path.dirname(outputPath), { recursive: true });
            fs.writeFileSync(outputPath, result.markdown, 'utf8');
            totalFilesProcessed++;
        } catch (e: unknown) {
            // Anything can be thrown; only Error instances are guaranteed to carry a message.
            const message = e instanceof Error ? e.message : String(e);
            console.error(`[Onto] Failed to process ${file}: ${message}`);
        }
    }
    console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown endpoints.`);
}
package/src/index.ts ADDED
@@ -0,0 +1,3 @@
1
+ // We cannot use Webpack plugins reliably in Next.js Turbopack due to WorkerError restrictions.
2
+ // Users must instead run `npx onto-next` as a postbuild script.
3
+ export { extractContent } from './extractor';
@@ -0,0 +1,48 @@
1
+ import { NextRequest, NextResponse } from 'next/server';
2
+
3
/**
 * Substrings that mark a request's User-Agent header as coming from an AI crawler.
 *
 * NOTE(review): Google publishes its token as 'Google-Extended' and describes it
 * as a robots.txt control token — confirm the 'GoogleExtended' entry can ever
 * match a real User-Agent header.
 */
const AI_BOT_USER_AGENTS = [
  'GPTBot',
  'ChatGPT-User',
  'ClaudeBot',
  'Claude-Web',
  'anthropic-ai',
  'PerplexityBot',
  'OAI-SearchBot',
  'GoogleExtended',
];

// Lowercased once at module load so per-request matching can be case-insensitive.
const AI_BOT_USER_AGENT_TOKENS = AI_BOT_USER_AGENTS.map((bot) => bot.toLowerCase());

/**
 * Rewrites page requests from AI crawlers, or from clients whose Accept header
 * mentions `text/markdown`, to the matching markdown payload under `/.onto`.
 *
 * Requests for `/_next` paths, and any path containing a dot, are passed
 * through untouched, as are requests that match neither condition.
 *
 * @param request - The incoming middleware request.
 * @returns A rewrite to `/.onto<path>.md`, or a pass-through response.
 */
export function ontoMiddleware(request: NextRequest): NextResponse {
  const { pathname } = request.nextUrl;

  // Ignore internal next.js requests & static assets before doing any header work.
  if (pathname.startsWith('/_next') || pathname.includes('.')) {
    return NextResponse.next();
  }

  // Header values are compared in lowercase: media types are case-insensitive
  // and crawler product tokens are not guaranteed to keep a fixed casing.
  const userAgent = (request.headers.get('user-agent') ?? '').toLowerCase();
  const accept = (request.headers.get('accept') ?? '').toLowerCase();

  const isAiBot = AI_BOT_USER_AGENT_TOKENS.some((token) => userAgent.includes(token));
  const isMarkdownRequested = accept.includes('text/markdown');

  if (!isAiBot && !isMarkdownRequested) {
    return NextResponse.next();
  }

  // The site root maps to the `index` payload.
  let payloadPath = pathname === '/' || pathname === '' ? '/index' : pathname;

  // Strip trailing slash if present
  if (payloadPath.endsWith('/') && payloadPath !== '/') {
    payloadPath = payloadPath.slice(0, -1);
  }

  // Clone so the query string and other URL parts carry over to the rewrite target.
  const url = request.nextUrl.clone();
  url.pathname = `/.onto${payloadPath}.md`;

  // Rewrite implicitly serves the target URL transparently to the client.
  return NextResponse.rewrite(url);
}
package/tsconfig.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "es2022",
4
+ "module": "esnext",
5
+ "moduleResolution": "node",
6
+ "esModuleInterop": true,
7
+ "strict": true,
8
+ "skipLibCheck": true,
9
+ "forceConsistentCasingInFileNames": true,
10
+ "outDir": "dist"
11
+ },
12
+ "include": [
13
+ "src/**/*"
14
+ ]
15
+ }
package/tsup.config.ts ADDED
@@ -0,0 +1,14 @@
1
+ import { defineConfig } from 'tsup';
2
+
3
// Build configuration: the library entry, the CLI and the middleware are each
// bundled into `dist` as both CommonJS and ES modules.
export default defineConfig({
  // Inputs and output location
  entry: ['src/index.ts', 'src/cli.ts', 'src/middleware.ts'],
  outDir: 'dist',
  clean: true,

  // Emitted artefacts
  format: ['cjs', 'esm'],
  dts: true,
  sourcemap: true,

  // Bundling behaviour
  bundle: true,
  splitting: false,
  minify: true,
  // `next` is left as an external import rather than bundled.
  external: ['next'],
});