@ontosdk/next 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +6 -7
- package/dist/cli.js.map +1 -1
- package/dist/cli.mjs +6 -7
- package/dist/cli.mjs.map +1 -1
- package/dist/middleware.d.mts +22 -2
- package/dist/middleware.d.ts +22 -2
- package/dist/middleware.js +5 -1
- package/dist/middleware.js.map +1 -1
- package/dist/middleware.mjs +5 -1
- package/dist/middleware.mjs.map +1 -1
- package/package.json +1 -1
- package/src/bots.ts +65 -0
- package/src/cli.ts +56 -1
- package/src/middleware.ts +80 -14
package/dist/cli.js
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
"use strict";var
|
|
3
|
-
`)+
|
|
2
|
+
"use strict";var F=Object.create;var O=Object.defineProperty;var z=Object.getOwnPropertyDescriptor;var N=Object.getOwnPropertyNames;var R=Object.getPrototypeOf,C=Object.prototype.hasOwnProperty;var E=(e,i,n,t)=>{if(i&&typeof i=="object"||typeof i=="function")for(let s of N(i))!C.call(e,s)&&s!==n&&O(e,s,{get:()=>i[s],enumerable:!(t=z(i,s))||t.enumerable});return e};var S=(e,i,n)=>(n=e!=null?F(R(e)):{},E(i||!e||!e.__esModule?O(n,"default",{value:e,enumerable:!0}):n,e));var j=require("glob"),a=S(require("fs")),g=S(require("path")),o=S(require("picocolors"));var k=S(require("cheerio")),v=S(require("turndown")),L=new v.default({headingStyle:"atx",codeBlockStyle:"fenced"});function P(e,i="Generated Output"){let n=e.length,t=k.load(e),s=t("title").text()||t("h1").first().text()||"Untitled Page",m=t('meta[name="description"]').attr("content")||"No description found.",u=[];t('script[type="application/ld+json"]').each((f,d)=>{try{let $=t(d).html()||"",x=JSON.parse($);u.push(x)}catch{}}),t("script, style, noscript, iframe, svg, nav, footer, meta, link, header").remove();let h="";t("main").length>0?h=t("main").html()||"":t("article").length>0?h=t("article").html()||"":h=t("body").html()||"";let w=L.turndown(h),r=[`# ${s}`,`> ${m}`,"",`**Source:** ${i}`,`**Extracted:** ${new Date().toISOString()}`,"","---",""].join(`
|
|
3
|
+
`)+w;u.length>0&&(r+=`
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
## Structured Data (JSON-LD)
|
|
7
7
|
\`\`\`json
|
|
8
|
-
`,
|
|
9
|
-
`}),
|
|
10
|
-
[Onto] Starting Semantic Output Generation...`));let
|
|
11
|
-
|
|
12
|
-
`))}L().catch(t=>{console.error(i.default.red(`[Onto] Fatal Error: ${t.message}`)),process.exit(1)});
|
|
8
|
+
`,u.forEach(f=>{r+=JSON.stringify(f,null,2)+`
|
|
9
|
+
`}),r+="```\n");let c=r.length,p=n>0?(n-c)/n*100:0;return{markdown:r,metadata:{title:s,description:m,jsonLd:u},stats:{originalHtmlSize:n,markdownSize:c,tokenReductionRatio:p}}}function T(){let e=g.default.join(process.cwd(),".env.local");a.default.existsSync(e)&&a.default.readFileSync(e,"utf8").split(/\r?\n/).forEach(n=>{let t=n.trim();if(!t||t.startsWith("#"))return;let[s,...m]=t.split("=");s&&m.length>0&&(process.env[s.trim()]=m.join("=").trim().replace(/^["']|["']$/g,""))})}async function H(){T(),console.log(o.default.cyan(`
|
|
10
|
+
[Onto] Starting Semantic Output Generation...`));let e=process.cwd(),i=g.default.join(e,".next/server/app"),n=g.default.join(e,"public/.onto");if(!a.default.existsSync(i)){console.log(o.default.yellow(`[Onto] Could not find Next.js app output at ${i}`)),console.log(o.default.yellow('[Onto] Ensure this is run after "next build" and you are using the App Router.'));return}let t=await(0,j.glob)("**/*.html",{cwd:i});if(t.length===0){console.log(o.default.yellow("[Onto] No static HTML files found to process."));return}a.default.existsSync(n)||a.default.mkdirSync(n,{recursive:!0});let s=0,m=0,u=0;for(let l of t){let r=g.default.join(i,l),c=l.replace(/\.html$/,".md"),p=g.default.join(n,c);try{let f=a.default.readFileSync(r,"utf8"),d=P(f,`/${c.replace(/\.md$/,"")}`),$=g.default.dirname(p);a.default.existsSync($)||a.default.mkdirSync($,{recursive:!0}),a.default.writeFileSync(p,d.markdown,"utf8"),s+=d.stats.originalHtmlSize,m+=d.stats.markdownSize,u++;let x=(d.stats.originalHtmlSize/1024).toFixed(1),b=(d.stats.markdownSize/1024).toFixed(1),y=l.replace(/\.html$/,"");y==="index"?y="/":y=`/${y}`,console.log(o.default.green("\u2713 Optimized")+o.default.dim(` ${y} `)+o.default.blue(`[${x}KB -> ${b}KB]`))}catch(f){console.error(o.default.red(`\u2717 Failed to process ${l}: ${f.message}`))}}console.log(o.default.bold(o.default.magenta(`Processed ${u} pages. 
Total Size: ${(s/1024).toFixed(1)}KB -> ${(m/1024).toFixed(1)}KB`)));let h=process.env.ONTO_API_KEY,w=process.env.ONTO_DASHBOARD_URL||"https://app.buildonto.dev";if(h&&u>0){console.log(o.default.cyan(`[Onto] Syncing manifest with Control Plane [${w}]...`));try{let l=t.map(c=>{let p=c.replace(/\.html$/,""),f=p==="index"?"/":`/${p}`,d=g.default.join(n,c.replace(/\.html$/,".md"));return{route:f,filename:`${p}.md`,content:a.default.readFileSync(d,"utf8")}}),r=await fetch(`${w}/api/files`,{method:"POST",headers:{"x-onto-key":h,"Content-Type":"application/json"},body:JSON.stringify({files:l})});if(r.ok)console.log(o.default.green("\u2713 Control Plane sync successful"));else{let c=await r.json().catch(()=>({}));console.log(o.default.yellow(`\u26A0 Control Plane sync skipped: ${c.error||r.statusText}`))}}catch(l){console.log(o.default.yellow(`\u26A0 Control Plane sync failed: ${l.message}`))}}console.log(o.default.dim(`Edge payloads are ready at /public/.onto/*
|
|
11
|
+
`))}H().catch(e=>{console.error(o.default.red(`[Onto] Fatal Error: ${e.message}`)),process.exit(1)});
|
|
13
12
|
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/extractor.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { glob } from 'glob';\r\nimport fs from 'fs';\r\nimport path from 'path';\r\nimport pc from 'picocolors';\r\nimport { extractContent } from './extractor';\r\n\r\nasync function main() {\r\n console.log(pc.cyan('\\n[Onto] Starting Semantic Output Generation...'));\r\n\r\n const cwd = process.cwd();\r\n const nextAppDirDir = path.join(cwd, '.next/server/app');\r\n const ontoPublicDir = path.join(cwd, 'public/.onto');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));\r\n console.log(pc.yellow(`[Onto] Ensure this is run after \"next build\" and you are using the App Router.`));\r\n return;\r\n }\r\n\r\n // Find all HTML files rendered by Next.js in the app directory\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n\r\n if (files.length === 0) {\r\n console.log(pc.yellow(`[Onto] No static HTML files found to process.`));\r\n return;\r\n }\r\n\r\n // Ensure output directory exists\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalOriginalSize = 0;\r\n let totalMarkdownSize = 0;\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n\r\n // We map file path e.g. 
\"pricing.html\" to \"pricing.md\", or \"blog/post.html\" to \"blog/post.md\"\r\n let outputPathRelative = file.replace(/\\.html$/, '.md');\r\n // If it's a dynamic route page, or purely root index.html\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n const result = extractContent(htmlContent, `/${outputPathRelative.replace(/\\.md$/, '')}`);\r\n\r\n // Ensure specific sub-directory exists (e.g., for blog/post.md)\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n\r\n totalOriginalSize += result.stats.originalHtmlSize;\r\n totalMarkdownSize += result.stats.markdownSize;\r\n totalFilesProcessed++;\r\n\r\n const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);\r\n const mdKb = (result.stats.markdownSize / 1024).toFixed(1);\r\n\r\n // /index.html -> /\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n console.log(\r\n pc.green(`✓ Optimized`) +\r\n pc.dim(` ${routeName} `) +\r\n pc.blue(`[${origKb}KB -> ${mdKb}KB]`)\r\n );\r\n } catch (e: any) {\r\n console.error(pc.red(`✗ Failed to process ${file}: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(pc.cyan(`\\n[Onto] Finished generation.`));\r\n console.log(\r\n pc.bold(\r\n pc.magenta(`Processed ${totalFilesProcessed} pages. 
Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)\r\n )\r\n );\r\n console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\\n`));\r\n}\r\n\r\nmain().catch(e => {\r\n console.error(pc.red(`[Onto] Fatal Error: ${e.message}`));\r\n process.exit(1);\r\n});\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. 
Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? 
((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown 
endpoints.`);\r\n}\r\n"],"mappings":";wdACA,IAAAA,EAAqB,gBACrBC,EAAe,iBACfC,EAAiB,mBACjBC,EAAe,yBCJf,IAAAC,EAAyB,sBACzBC,EAA4B,uBAEtBC,EAAkB,IAAI,EAAAC,QAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASC,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWf,EAAgB,SAASc,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ,CDtGA,eAAeC,GAAO,CAClB,QAAQ,IAAI,EAAAC,QAAG,KAAK;AAAA,8CAAiD,CAAC,EAEtE,IAAMC,EAAM,QAAQ,IAAI,EAClBC,EAAgB,EAAAC,QAAK,KAAKF,EAAK,kBAAkB,EACjDG,EAAgB,EAAAD,QAAK,KAAKF,EAAK,cAAc,EAEnD,GAAI,CAAC,EAAAI,QAAG,WAAWH,CAAa,EAAG,CAC/B,QAAQ,IAAI,EAAAF,QAAG,OAAO,+CAA+CE,CAAa,EAAE,CAAC,EACrF,QAAQ,IAAI,EAAAF,QAAG,OAAO,gFAAgF,CAAC,EACvG,MACJ,CAGA,IAAMM,EAAQ,QAAM,QAAK,YAAa,CAAE,IAAKJ,CAAc,CAAC,EAE5D,GAAII,EAAM,SAAW,EAAG,CACpB,QAAQ,IAAI,EAAAN,QAAG,OAAO,+CAA+C,CAAC,EACtE,MACJ,CAGK,EAAAK,QAAG,WAAWD,CAAa,GAC5B,EAAAC,QAAG,UAAUD,EAAe,CAAE,UAAW,EAAK,CAAC,EAGnD,IAAIG,EAAoB,EACpBC,EAAoB,EACpBC,EAAsB,EAE1B,QAAWC,KAAQJ,EAAO,CACtB,IAAMK,EAAY,EAAAR,QAAK,KAAKD,EAAeQ,CAAI,EAG3CE
,EAAqBF,EAAK,QAAQ,UAAW,KAAK,EAEhDG,EAAa,EAAAV,QAAK,KAAKC,EAAeQ,CAAkB,EAE9D,GAAI,CACA,IAAME,EAAc,EAAAT,QAAG,aAAaM,EAAW,MAAM,EAE/CI,EAASC,EAAeF,EAAa,IAAIF,EAAmB,QAAQ,QAAS,EAAE,CAAC,EAAE,EAGlFK,EAAY,EAAAd,QAAK,QAAQU,CAAU,EACpC,EAAAR,QAAG,WAAWY,CAAS,GACxB,EAAAZ,QAAG,UAAUY,EAAW,CAAE,UAAW,EAAK,CAAC,EAG/C,EAAAZ,QAAG,cAAcQ,EAAYE,EAAO,SAAU,MAAM,EAEpDR,GAAqBQ,EAAO,MAAM,iBAClCP,GAAqBO,EAAO,MAAM,aAClCN,IAEA,IAAMS,GAAUH,EAAO,MAAM,iBAAmB,MAAM,QAAQ,CAAC,EACzDI,GAAQJ,EAAO,MAAM,aAAe,MAAM,QAAQ,CAAC,EAGrDK,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCU,IAAc,QAASA,EAAY,IAClCA,EAAY,IAAIA,CAAS,GAE9B,QAAQ,IACJ,EAAApB,QAAG,MAAM,kBAAa,EACtB,EAAAA,QAAG,IAAI,IAAIoB,CAAS,GAAG,EACvB,EAAApB,QAAG,KAAK,IAAIkB,CAAM,SAASC,CAAI,KAAK,CACxC,CACJ,OAASE,EAAQ,CACb,QAAQ,MAAM,EAAArB,QAAG,IAAI,4BAAuBU,CAAI,KAAKW,EAAE,OAAO,EAAE,CAAC,CACrE,CACJ,CAEA,QAAQ,IAAI,EAAArB,QAAG,KAAK;AAAA,4BAA+B,CAAC,EACpD,QAAQ,IACJ,EAAAA,QAAG,KACC,EAAAA,QAAG,QAAQ,aAAaS,CAAmB,wBAAwBF,EAAoB,MAAM,QAAQ,CAAC,CAAC,UAAUC,EAAoB,MAAM,QAAQ,CAAC,CAAC,IAAI,CAC7J,CACJ,EACA,QAAQ,IAAI,EAAAR,QAAG,IAAI;AAAA,CAA8C,CAAC,CACtE,CAEAD,EAAK,EAAE,MAAMsB,GAAK,CACd,QAAQ,MAAM,EAAArB,QAAG,IAAI,uBAAuBqB,EAAE,OAAO,EAAE,CAAC,EACxD,QAAQ,KAAK,CAAC,CAClB,CAAC","names":["import_glob","import_fs","import_path","import_picocolors","cheerio","import_turndown","turndownService","TurndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio","main","pc","cwd","nextAppDirDir","path","ontoPublicDir","fs","files","totalOriginalSize","totalMarkdownSize","totalFilesProcessed","file","inputPath","outputPathRelative","outputPath","htmlContent","result","extractContent","outputDir","origKb","mdKb","routeName","e"]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/extractor.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { glob } from 'glob';\r\nimport fs from 'fs';\r\nimport path from 'path';\r\nimport pc from 'picocolors';\r\nimport { extractContent } from './extractor';\r\n\r\n// Simple helper to load .env.local from the current working directory\r\nfunction loadEnv() {\r\n const envPath = path.join(process.cwd(), '.env.local');\r\n if (fs.existsSync(envPath)) {\r\n const envContent = fs.readFileSync(envPath, 'utf8');\r\n envContent.split(/\\r?\\n/).forEach(line => {\r\n const trimmedLine = line.trim();\r\n if (!trimmedLine || trimmedLine.startsWith('#')) return;\r\n const [key, ...valueParts] = trimmedLine.split('=');\r\n if (key && valueParts.length > 0) {\r\n process.env[key.trim()] = valueParts.join('=').trim().replace(/^[\"']|[\"']$/g, '');\r\n }\r\n });\r\n }\r\n}\r\n\r\nasync function main() {\r\n loadEnv();\r\n console.log(pc.cyan('\\n[Onto] Starting Semantic Output Generation...'));\r\n\r\n const cwd = process.cwd();\r\n const nextAppDirDir = path.join(cwd, '.next/server/app');\r\n const ontoPublicDir = path.join(cwd, 'public/.onto');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));\r\n console.log(pc.yellow(`[Onto] Ensure this is run after \"next build\" and you are using the App Router.`));\r\n return;\r\n }\r\n\r\n // Find all HTML files rendered by Next.js in the app directory\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n\r\n if (files.length === 0) {\r\n console.log(pc.yellow(`[Onto] No static HTML files found to process.`));\r\n return;\r\n }\r\n\r\n // Ensure output directory exists\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalOriginalSize = 0;\r\n let totalMarkdownSize = 0;\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = 
path.join(nextAppDirDir, file);\r\n\r\n // We map file path e.g. \"pricing.html\" to \"pricing.md\", or \"blog/post.html\" to \"blog/post.md\"\r\n let outputPathRelative = file.replace(/\\.html$/, '.md');\r\n // If it's a dynamic route page, or purely root index.html\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n const result = extractContent(htmlContent, `/${outputPathRelative.replace(/\\.md$/, '')}`);\r\n\r\n // Ensure specific sub-directory exists (e.g., for blog/post.md)\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n\r\n totalOriginalSize += result.stats.originalHtmlSize;\r\n totalMarkdownSize += result.stats.markdownSize;\r\n totalFilesProcessed++;\r\n\r\n const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);\r\n const mdKb = (result.stats.markdownSize / 1024).toFixed(1);\r\n\r\n // /index.html -> /\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n console.log(\r\n pc.green(`✓ Optimized`) +\r\n pc.dim(` ${routeName} `) +\r\n pc.blue(`[${origKb}KB -> ${mdKb}KB]`)\r\n );\r\n } catch (e: any) {\r\n console.error(pc.red(`✗ Failed to process ${file}: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(\r\n pc.bold(\r\n pc.magenta(`Processed ${totalFilesProcessed} pages. 
Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)\r\n )\r\n );\r\n\r\n // Sync with Onto Control Plane (Premium)\r\n const ONTO_API_KEY = process.env.ONTO_API_KEY;\r\n const DASHBOARD_URL = process.env.ONTO_DASHBOARD_URL || 'https://app.buildonto.dev';\r\n\r\n if (ONTO_API_KEY && totalFilesProcessed > 0) {\r\n console.log(pc.cyan(`[Onto] Syncing manifest with Control Plane [${DASHBOARD_URL}]...`));\r\n try {\r\n const manifest = files.map(file => {\r\n const routeName = file.replace(/\\.html$/, '');\r\n const route = routeName === 'index' ? '/' : `/${routeName}`;\r\n const mdPath = path.join(ontoPublicDir, file.replace(/\\.html$/, '.md'));\r\n return {\r\n route,\r\n filename: `${routeName}.md`,\r\n content: fs.readFileSync(mdPath, 'utf8')\r\n };\r\n });\r\n\r\n const res = await fetch(`${DASHBOARD_URL}/api/files`, {\r\n method: 'POST',\r\n headers: {\r\n 'x-onto-key': ONTO_API_KEY,\r\n 'Content-Type': 'application/json'\r\n },\r\n body: JSON.stringify({ files: manifest })\r\n });\r\n\r\n if (res.ok) {\r\n console.log(pc.green('✓ Control Plane sync successful'));\r\n } else {\r\n const errData = await res.json().catch(() => ({}));\r\n console.log(pc.yellow(`⚠ Control Plane sync skipped: ${errData.error || res.statusText}`));\r\n }\r\n } catch (e: any) {\r\n console.log(pc.yellow(`⚠ Control Plane sync failed: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\\n`));\r\n}\r\n\r\nmain().catch(e => {\r\n console.error(pc.red(`[Onto] Fatal Error: ${e.message}`));\r\n process.exit(1);\r\n});\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: 
string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. 
Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? ((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, 
routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown endpoints.`);\r\n}\r\n"],"mappings":";wdACA,IAAAA,EAAqB,gBACrBC,EAAe,iBACfC,EAAiB,mBACjBC,EAAe,yBCJf,IAAAC,EAAyB,sBACzBC,EAA4B,uBAEtBC,EAAkB,IAAI,EAAAC,QAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASC,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWf,EAAgB,SAASc,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ,CDrGA,SAASC,GAAU,CACf,IAAMC,EAAU,EAAAC,QAAK,KAAK,QAAQ,IAAI,EAAG,YAAY,EACjD,EAAAC,QAAG,WAAWF,CAAO,GACF,EAAAE,QAAG,aAAaF,EAAS,MAAM,EACvC,MAAM,OAAO,EAAE,QAAQG,GAAQ,CACtC,IAAMC,EAAcD,EAAK,KAAK,EAC9B,GAAI,CAACC,GAAeA,EAAY
,WAAW,GAAG,EAAG,OACjD,GAAM,CAACC,EAAK,GAAGC,CAAU,EAAIF,EAAY,MAAM,GAAG,EAC9CC,GAAOC,EAAW,OAAS,IAC3B,QAAQ,IAAID,EAAI,KAAK,CAAC,EAAIC,EAAW,KAAK,GAAG,EAAE,KAAK,EAAE,QAAQ,eAAgB,EAAE,EAExF,CAAC,CAET,CAEA,eAAeC,GAAO,CAClBR,EAAQ,EACR,QAAQ,IAAI,EAAAS,QAAG,KAAK;AAAA,8CAAiD,CAAC,EAEtE,IAAMC,EAAM,QAAQ,IAAI,EAClBC,EAAgB,EAAAT,QAAK,KAAKQ,EAAK,kBAAkB,EACjDE,EAAgB,EAAAV,QAAK,KAAKQ,EAAK,cAAc,EAEnD,GAAI,CAAC,EAAAP,QAAG,WAAWQ,CAAa,EAAG,CAC/B,QAAQ,IAAI,EAAAF,QAAG,OAAO,+CAA+CE,CAAa,EAAE,CAAC,EACrF,QAAQ,IAAI,EAAAF,QAAG,OAAO,gFAAgF,CAAC,EACvG,MACJ,CAGA,IAAMI,EAAQ,QAAM,QAAK,YAAa,CAAE,IAAKF,CAAc,CAAC,EAE5D,GAAIE,EAAM,SAAW,EAAG,CACpB,QAAQ,IAAI,EAAAJ,QAAG,OAAO,+CAA+C,CAAC,EACtE,MACJ,CAGK,EAAAN,QAAG,WAAWS,CAAa,GAC5B,EAAAT,QAAG,UAAUS,EAAe,CAAE,UAAW,EAAK,CAAC,EAGnD,IAAIE,EAAoB,EACpBC,EAAoB,EACpBC,EAAsB,EAE1B,QAAWC,KAAQJ,EAAO,CACtB,IAAMK,EAAY,EAAAhB,QAAK,KAAKS,EAAeM,CAAI,EAG3CE,EAAqBF,EAAK,QAAQ,UAAW,KAAK,EAEhDG,EAAa,EAAAlB,QAAK,KAAKU,EAAeO,CAAkB,EAE9D,GAAI,CACA,IAAME,EAAc,EAAAlB,QAAG,aAAae,EAAW,MAAM,EAE/CI,EAASC,EAAeF,EAAa,IAAIF,EAAmB,QAAQ,QAAS,EAAE,CAAC,EAAE,EAGlFK,EAAY,EAAAtB,QAAK,QAAQkB,CAAU,EACpC,EAAAjB,QAAG,WAAWqB,CAAS,GACxB,EAAArB,QAAG,UAAUqB,EAAW,CAAE,UAAW,EAAK,CAAC,EAG/C,EAAArB,QAAG,cAAciB,EAAYE,EAAO,SAAU,MAAM,EAEpDR,GAAqBQ,EAAO,MAAM,iBAClCP,GAAqBO,EAAO,MAAM,aAClCN,IAEA,IAAMS,GAAUH,EAAO,MAAM,iBAAmB,MAAM,QAAQ,CAAC,EACzDI,GAAQJ,EAAO,MAAM,aAAe,MAAM,QAAQ,CAAC,EAGrDK,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCU,IAAc,QAASA,EAAY,IAClCA,EAAY,IAAIA,CAAS,GAE9B,QAAQ,IACJ,EAAAlB,QAAG,MAAM,kBAAa,EACtB,EAAAA,QAAG,IAAI,IAAIkB,CAAS,GAAG,EACvB,EAAAlB,QAAG,KAAK,IAAIgB,CAAM,SAASC,CAAI,KAAK,CACxC,CACJ,OAASE,EAAQ,CACb,QAAQ,MAAM,EAAAnB,QAAG,IAAI,4BAAuBQ,CAAI,KAAKW,EAAE,OAAO,EAAE,CAAC,CACrE,CACJ,CAEA,QAAQ,IACJ,EAAAnB,QAAG,KACC,EAAAA,QAAG,QAAQ,aAAaO,CAAmB,wBAAwBF,EAAoB,MAAM,QAAQ,CAAC,CAAC,UAAUC,EAAoB,MAAM,QAAQ,CAAC,CAAC,IAAI,CAC7J,CACJ,EAGA,IAAMc,EAAe,QAAQ,IAAI,aAC3BC,EAAgB,QAAQ,IAAI,oBAAsB,4BAExD,GAAID,GAAgBb,EAAsB,EAAG,CACzC,QAAQ,IAAI,EAAAP,QAAG,KAAK,+CAA+CqB,CAAa,MAAM,CAAC,EACvF,GAAI,CACA,IAAMC,EAAWlB,EAAM,IAAII,GAAQ,CAC
/B,IAAMU,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCe,EAAQL,IAAc,QAAU,IAAM,IAAIA,CAAS,GACnDM,EAAS,EAAA/B,QAAK,KAAKU,EAAeK,EAAK,QAAQ,UAAW,KAAK,CAAC,EACtE,MAAO,CACH,MAAAe,EACA,SAAU,GAAGL,CAAS,MACtB,QAAS,EAAAxB,QAAG,aAAa8B,EAAQ,MAAM,CAC3C,CACJ,CAAC,EAEKC,EAAM,MAAM,MAAM,GAAGJ,CAAa,aAAc,CAClD,OAAQ,OACR,QAAS,CACL,aAAcD,EACd,eAAgB,kBACpB,EACA,KAAM,KAAK,UAAU,CAAE,MAAOE,CAAS,CAAC,CAC5C,CAAC,EAED,GAAIG,EAAI,GACJ,QAAQ,IAAI,EAAAzB,QAAG,MAAM,sCAAiC,CAAC,MACpD,CACH,IAAM0B,EAAU,MAAMD,EAAI,KAAK,EAAE,MAAM,KAAO,CAAC,EAAE,EACjD,QAAQ,IAAI,EAAAzB,QAAG,OAAO,sCAAiC0B,EAAQ,OAASD,EAAI,UAAU,EAAE,CAAC,CAC7F,CACJ,OAASN,EAAQ,CACb,QAAQ,IAAI,EAAAnB,QAAG,OAAO,qCAAgCmB,EAAE,OAAO,EAAE,CAAC,CACtE,CACJ,CAEA,QAAQ,IAAI,EAAAnB,QAAG,IAAI;AAAA,CAA8C,CAAC,CACtE,CAEAD,EAAK,EAAE,MAAM,GAAK,CACd,QAAQ,MAAM,EAAAC,QAAG,IAAI,uBAAuB,EAAE,OAAO,EAAE,CAAC,EACxD,QAAQ,KAAK,CAAC,CAClB,CAAC","names":["import_glob","import_fs","import_path","import_picocolors","cheerio","import_turndown","turndownService","TurndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio","loadEnv","envPath","path","fs","line","trimmedLine","key","valueParts","main","pc","cwd","nextAppDirDir","ontoPublicDir","files","totalOriginalSize","totalMarkdownSize","totalFilesProcessed","file","inputPath","outputPathRelative","outputPath","htmlContent","result","extractContent","outputDir","origKb","mdKb","routeName","e","ONTO_API_KEY","DASHBOARD_URL","manifest","route","mdPath","res","errData"]}
|
package/dist/cli.mjs
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import{glob as
|
|
3
|
-
`)+
|
|
2
|
+
import{glob as j}from"glob";import r from"fs";import h from"path";import e from"picocolors";import*as x from"cheerio";import v from"turndown";var P=new v({headingStyle:"atx",codeBlockStyle:"fenced"});function O(i,f="Generated Output"){let n=i.length,t=x.load(i),l=t("title").text()||t("h1").first().text()||"Untitled Page",d=t('meta[name="description"]').attr("content")||"No description found.",m=[];t('script[type="application/ld+json"]').each((p,c)=>{try{let w=t(c).html()||"",$=JSON.parse(w);m.push($)}catch{}}),t("script, style, noscript, iframe, svg, nav, footer, meta, link, header").remove();let g="";t("main").length>0?g=t("main").html()||"":t("article").length>0?g=t("article").html()||"":g=t("body").html()||"";let S=P.turndown(g),o=[`# ${l}`,`> ${d}`,"",`**Source:** ${f}`,`**Extracted:** ${new Date().toISOString()}`,"","---",""].join(`
|
|
3
|
+
`)+S;m.length>0&&(o+=`
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
## Structured Data (JSON-LD)
|
|
7
7
|
\`\`\`json
|
|
8
|
-
`,
|
|
9
|
-
`}),
|
|
10
|
-
[Onto] Starting Semantic Output Generation...`));let
|
|
11
|
-
|
|
12
|
-
`))}b().catch(r=>{console.error(e.red(`[Onto] Fatal Error: ${r.message}`)),process.exit(1)});
|
|
8
|
+
`,m.forEach(p=>{o+=JSON.stringify(p,null,2)+`
|
|
9
|
+
`}),o+="```\n");let s=o.length,u=n>0?(n-s)/n*100:0;return{markdown:o,metadata:{title:l,description:d,jsonLd:m},stats:{originalHtmlSize:n,markdownSize:s,tokenReductionRatio:u}}}function b(){let i=h.join(process.cwd(),".env.local");r.existsSync(i)&&r.readFileSync(i,"utf8").split(/\r?\n/).forEach(n=>{let t=n.trim();if(!t||t.startsWith("#"))return;let[l,...d]=t.split("=");l&&d.length>0&&(process.env[l.trim()]=d.join("=").trim().replace(/^["']|["']$/g,""))})}async function F(){b(),console.log(e.cyan(`
|
|
10
|
+
[Onto] Starting Semantic Output Generation...`));let i=process.cwd(),f=h.join(i,".next/server/app"),n=h.join(i,"public/.onto");if(!r.existsSync(f)){console.log(e.yellow(`[Onto] Could not find Next.js app output at ${f}`)),console.log(e.yellow('[Onto] Ensure this is run after "next build" and you are using the App Router.'));return}let t=await j("**/*.html",{cwd:f});if(t.length===0){console.log(e.yellow("[Onto] No static HTML files found to process."));return}r.existsSync(n)||r.mkdirSync(n,{recursive:!0});let l=0,d=0,m=0;for(let a of t){let o=h.join(f,a),s=a.replace(/\.html$/,".md"),u=h.join(n,s);try{let p=r.readFileSync(o,"utf8"),c=O(p,`/${s.replace(/\.md$/,"")}`),w=h.dirname(u);r.existsSync(w)||r.mkdirSync(w,{recursive:!0}),r.writeFileSync(u,c.markdown,"utf8"),l+=c.stats.originalHtmlSize,d+=c.stats.markdownSize,m++;let $=(c.stats.originalHtmlSize/1024).toFixed(1),k=(c.stats.markdownSize/1024).toFixed(1),y=a.replace(/\.html$/,"");y==="index"?y="/":y=`/${y}`,console.log(e.green("\u2713 Optimized")+e.dim(` ${y} `)+e.blue(`[${$}KB -> ${k}KB]`))}catch(p){console.error(e.red(`\u2717 Failed to process ${a}: ${p.message}`))}}console.log(e.bold(e.magenta(`Processed ${m} pages. 
Total Size: ${(l/1024).toFixed(1)}KB -> ${(d/1024).toFixed(1)}KB`)));let g=process.env.ONTO_API_KEY,S=process.env.ONTO_DASHBOARD_URL||"https://app.buildonto.dev";if(g&&m>0){console.log(e.cyan(`[Onto] Syncing manifest with Control Plane [${S}]...`));try{let a=t.map(s=>{let u=s.replace(/\.html$/,""),p=u==="index"?"/":`/${u}`,c=h.join(n,s.replace(/\.html$/,".md"));return{route:p,filename:`${u}.md`,content:r.readFileSync(c,"utf8")}}),o=await fetch(`${S}/api/files`,{method:"POST",headers:{"x-onto-key":g,"Content-Type":"application/json"},body:JSON.stringify({files:a})});if(o.ok)console.log(e.green("\u2713 Control Plane sync successful"));else{let s=await o.json().catch(()=>({}));console.log(e.yellow(`\u26A0 Control Plane sync skipped: ${s.error||o.statusText}`))}}catch(a){console.log(e.yellow(`\u26A0 Control Plane sync failed: ${a.message}`))}}console.log(e.dim(`Edge payloads are ready at /public/.onto/*
|
|
11
|
+
`))}F().catch(i=>{console.error(e.red(`[Onto] Fatal Error: ${i.message}`)),process.exit(1)});
|
|
13
12
|
//# sourceMappingURL=cli.mjs.map
|
package/dist/cli.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts","../src/extractor.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { glob } from 'glob';\r\nimport fs from 'fs';\r\nimport path from 'path';\r\nimport pc from 'picocolors';\r\nimport { extractContent } from './extractor';\r\n\r\nasync function main() {\r\n console.log(pc.cyan('\\n[Onto] Starting Semantic Output Generation...'));\r\n\r\n const cwd = process.cwd();\r\n const nextAppDirDir = path.join(cwd, '.next/server/app');\r\n const ontoPublicDir = path.join(cwd, 'public/.onto');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));\r\n console.log(pc.yellow(`[Onto] Ensure this is run after \"next build\" and you are using the App Router.`));\r\n return;\r\n }\r\n\r\n // Find all HTML files rendered by Next.js in the app directory\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n\r\n if (files.length === 0) {\r\n console.log(pc.yellow(`[Onto] No static HTML files found to process.`));\r\n return;\r\n }\r\n\r\n // Ensure output directory exists\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalOriginalSize = 0;\r\n let totalMarkdownSize = 0;\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n\r\n // We map file path e.g. 
\"pricing.html\" to \"pricing.md\", or \"blog/post.html\" to \"blog/post.md\"\r\n let outputPathRelative = file.replace(/\\.html$/, '.md');\r\n // If it's a dynamic route page, or purely root index.html\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n const result = extractContent(htmlContent, `/${outputPathRelative.replace(/\\.md$/, '')}`);\r\n\r\n // Ensure specific sub-directory exists (e.g., for blog/post.md)\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n\r\n totalOriginalSize += result.stats.originalHtmlSize;\r\n totalMarkdownSize += result.stats.markdownSize;\r\n totalFilesProcessed++;\r\n\r\n const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);\r\n const mdKb = (result.stats.markdownSize / 1024).toFixed(1);\r\n\r\n // /index.html -> /\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n console.log(\r\n pc.green(`✓ Optimized`) +\r\n pc.dim(` ${routeName} `) +\r\n pc.blue(`[${origKb}KB -> ${mdKb}KB]`)\r\n );\r\n } catch (e: any) {\r\n console.error(pc.red(`✗ Failed to process ${file}: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(pc.cyan(`\\n[Onto] Finished generation.`));\r\n console.log(\r\n pc.bold(\r\n pc.magenta(`Processed ${totalFilesProcessed} pages. 
Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)\r\n )\r\n );\r\n console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\\n`));\r\n}\r\n\r\nmain().catch(e => {\r\n console.error(pc.red(`[Onto] Fatal Error: ${e.message}`));\r\n process.exit(1);\r\n});\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. 
Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? 
((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown 
endpoints.`);\r\n}\r\n"],"mappings":";AACA,OAAS,QAAAA,MAAY,OACrB,OAAOC,MAAQ,KACf,OAAOC,MAAU,OACjB,OAAOC,MAAQ,aCJf,UAAYC,MAAa,UACzB,OAAOC,MAAqB,WAE5B,IAAMC,EAAkB,IAAID,EAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASE,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWd,EAAgB,SAASa,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ,CDtGA,eAAeC,GAAO,CAClB,QAAQ,IAAIC,EAAG,KAAK;AAAA,8CAAiD,CAAC,EAEtE,IAAMC,EAAM,QAAQ,IAAI,EAClBC,EAAgBC,EAAK,KAAKF,EAAK,kBAAkB,EACjDG,EAAgBD,EAAK,KAAKF,EAAK,cAAc,EAEnD,GAAI,CAACI,EAAG,WAAWH,CAAa,EAAG,CAC/B,QAAQ,IAAIF,EAAG,OAAO,+CAA+CE,CAAa,EAAE,CAAC,EACrF,QAAQ,IAAIF,EAAG,OAAO,gFAAgF,CAAC,EACvG,MACJ,CAGA,IAAMM,EAAQ,MAAMC,EAAK,YAAa,CAAE,IAAKL,CAAc,CAAC,EAE5D,GAAII,EAAM,SAAW,EAAG,CACpB,QAAQ,IAAIN,EAAG,OAAO,+CAA+C,CAAC,EACtE,MACJ,CAGKK,EAAG,WAAWD,CAAa,GAC5BC,EAAG,UAAUD,EAAe,CAAE,UAAW,EAAK,CAAC,EAGnD,IAAII,EAAoB,EACpBC,EAAoB,EACpBC,EAAsB,EAE1B,QAAWC,KAAQL,EAAO,CACtB,IAAMM,EAAYT,EAAK,KAAKD,EAAeS,CAAI,EAG3CE,EAAqBF,EAAK,QAAQ,UAAW,KAAK,EAEhDG
,EAAaX,EAAK,KAAKC,EAAeS,CAAkB,EAE9D,GAAI,CACA,IAAME,EAAcV,EAAG,aAAaO,EAAW,MAAM,EAE/CI,EAASC,EAAeF,EAAa,IAAIF,EAAmB,QAAQ,QAAS,EAAE,CAAC,EAAE,EAGlFK,EAAYf,EAAK,QAAQW,CAAU,EACpCT,EAAG,WAAWa,CAAS,GACxBb,EAAG,UAAUa,EAAW,CAAE,UAAW,EAAK,CAAC,EAG/Cb,EAAG,cAAcS,EAAYE,EAAO,SAAU,MAAM,EAEpDR,GAAqBQ,EAAO,MAAM,iBAClCP,GAAqBO,EAAO,MAAM,aAClCN,IAEA,IAAMS,GAAUH,EAAO,MAAM,iBAAmB,MAAM,QAAQ,CAAC,EACzDI,GAAQJ,EAAO,MAAM,aAAe,MAAM,QAAQ,CAAC,EAGrDK,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCU,IAAc,QAASA,EAAY,IAClCA,EAAY,IAAIA,CAAS,GAE9B,QAAQ,IACJrB,EAAG,MAAM,kBAAa,EACtBA,EAAG,IAAI,IAAIqB,CAAS,GAAG,EACvBrB,EAAG,KAAK,IAAImB,CAAM,SAASC,CAAI,KAAK,CACxC,CACJ,OAASE,EAAQ,CACb,QAAQ,MAAMtB,EAAG,IAAI,4BAAuBW,CAAI,KAAKW,EAAE,OAAO,EAAE,CAAC,CACrE,CACJ,CAEA,QAAQ,IAAItB,EAAG,KAAK;AAAA,4BAA+B,CAAC,EACpD,QAAQ,IACJA,EAAG,KACCA,EAAG,QAAQ,aAAaU,CAAmB,wBAAwBF,EAAoB,MAAM,QAAQ,CAAC,CAAC,UAAUC,EAAoB,MAAM,QAAQ,CAAC,CAAC,IAAI,CAC7J,CACJ,EACA,QAAQ,IAAIT,EAAG,IAAI;AAAA,CAA8C,CAAC,CACtE,CAEAD,EAAK,EAAE,MAAMuB,GAAK,CACd,QAAQ,MAAMtB,EAAG,IAAI,uBAAuBsB,EAAE,OAAO,EAAE,CAAC,EACxD,QAAQ,KAAK,CAAC,CAClB,CAAC","names":["glob","fs","path","pc","cheerio","TurndownService","turndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio","main","pc","cwd","nextAppDirDir","path","ontoPublicDir","fs","files","glob","totalOriginalSize","totalMarkdownSize","totalFilesProcessed","file","inputPath","outputPathRelative","outputPath","htmlContent","result","extractContent","outputDir","origKb","mdKb","routeName","e"]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts","../src/extractor.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { glob } from 'glob';\r\nimport fs from 'fs';\r\nimport path from 'path';\r\nimport pc from 'picocolors';\r\nimport { extractContent } from './extractor';\r\n\r\n// Simple helper to load .env.local from the current working directory\r\nfunction loadEnv() {\r\n const envPath = path.join(process.cwd(), '.env.local');\r\n if (fs.existsSync(envPath)) {\r\n const envContent = fs.readFileSync(envPath, 'utf8');\r\n envContent.split(/\\r?\\n/).forEach(line => {\r\n const trimmedLine = line.trim();\r\n if (!trimmedLine || trimmedLine.startsWith('#')) return;\r\n const [key, ...valueParts] = trimmedLine.split('=');\r\n if (key && valueParts.length > 0) {\r\n process.env[key.trim()] = valueParts.join('=').trim().replace(/^[\"']|[\"']$/g, '');\r\n }\r\n });\r\n }\r\n}\r\n\r\nasync function main() {\r\n loadEnv();\r\n console.log(pc.cyan('\\n[Onto] Starting Semantic Output Generation...'));\r\n\r\n const cwd = process.cwd();\r\n const nextAppDirDir = path.join(cwd, '.next/server/app');\r\n const ontoPublicDir = path.join(cwd, 'public/.onto');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n console.log(pc.yellow(`[Onto] Could not find Next.js app output at ${nextAppDirDir}`));\r\n console.log(pc.yellow(`[Onto] Ensure this is run after \"next build\" and you are using the App Router.`));\r\n return;\r\n }\r\n\r\n // Find all HTML files rendered by Next.js in the app directory\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n\r\n if (files.length === 0) {\r\n console.log(pc.yellow(`[Onto] No static HTML files found to process.`));\r\n return;\r\n }\r\n\r\n // Ensure output directory exists\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalOriginalSize = 0;\r\n let totalMarkdownSize = 0;\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = 
path.join(nextAppDirDir, file);\r\n\r\n // We map file path e.g. \"pricing.html\" to \"pricing.md\", or \"blog/post.html\" to \"blog/post.md\"\r\n let outputPathRelative = file.replace(/\\.html$/, '.md');\r\n // If it's a dynamic route page, or purely root index.html\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n const result = extractContent(htmlContent, `/${outputPathRelative.replace(/\\.md$/, '')}`);\r\n\r\n // Ensure specific sub-directory exists (e.g., for blog/post.md)\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n\r\n totalOriginalSize += result.stats.originalHtmlSize;\r\n totalMarkdownSize += result.stats.markdownSize;\r\n totalFilesProcessed++;\r\n\r\n const origKb = (result.stats.originalHtmlSize / 1024).toFixed(1);\r\n const mdKb = (result.stats.markdownSize / 1024).toFixed(1);\r\n\r\n // /index.html -> /\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n console.log(\r\n pc.green(`✓ Optimized`) +\r\n pc.dim(` ${routeName} `) +\r\n pc.blue(`[${origKb}KB -> ${mdKb}KB]`)\r\n );\r\n } catch (e: any) {\r\n console.error(pc.red(`✗ Failed to process ${file}: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(\r\n pc.bold(\r\n pc.magenta(`Processed ${totalFilesProcessed} pages. 
Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)\r\n )\r\n );\r\n\r\n // Sync with Onto Control Plane (Premium)\r\n const ONTO_API_KEY = process.env.ONTO_API_KEY;\r\n const DASHBOARD_URL = process.env.ONTO_DASHBOARD_URL || 'https://app.buildonto.dev';\r\n\r\n if (ONTO_API_KEY && totalFilesProcessed > 0) {\r\n console.log(pc.cyan(`[Onto] Syncing manifest with Control Plane [${DASHBOARD_URL}]...`));\r\n try {\r\n const manifest = files.map(file => {\r\n const routeName = file.replace(/\\.html$/, '');\r\n const route = routeName === 'index' ? '/' : `/${routeName}`;\r\n const mdPath = path.join(ontoPublicDir, file.replace(/\\.html$/, '.md'));\r\n return {\r\n route,\r\n filename: `${routeName}.md`,\r\n content: fs.readFileSync(mdPath, 'utf8')\r\n };\r\n });\r\n\r\n const res = await fetch(`${DASHBOARD_URL}/api/files`, {\r\n method: 'POST',\r\n headers: {\r\n 'x-onto-key': ONTO_API_KEY,\r\n 'Content-Type': 'application/json'\r\n },\r\n body: JSON.stringify({ files: manifest })\r\n });\r\n\r\n if (res.ok) {\r\n console.log(pc.green('✓ Control Plane sync successful'));\r\n } else {\r\n const errData = await res.json().catch(() => ({}));\r\n console.log(pc.yellow(`⚠ Control Plane sync skipped: ${errData.error || res.statusText}`));\r\n }\r\n } catch (e: any) {\r\n console.log(pc.yellow(`⚠ Control Plane sync failed: ${e.message}`));\r\n }\r\n }\r\n\r\n console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\\n`));\r\n}\r\n\r\nmain().catch(e => {\r\n console.error(pc.red(`[Onto] Fatal Error: ${e.message}`));\r\n process.exit(1);\r\n});\r\n","import * as cheerio from 'cheerio';\r\nimport TurndownService from 'turndown';\r\n\r\nconst turndownService = new TurndownService({\r\n headingStyle: 'atx',\r\n codeBlockStyle: 'fenced',\r\n});\r\n\r\n// Configure turndown to keep some layout or handle semantic tags differently if needed\r\n\r\nexport interface ExtractionResult {\r\n markdown: string;\r\n metadata: {\r\n title: 
string;\r\n description: string;\r\n jsonLd: any[];\r\n };\r\n stats: {\r\n originalHtmlSize: number;\r\n markdownSize: number;\r\n tokenReductionRatio: number;\r\n };\r\n}\r\n\r\n/**\r\n * Extracts pure semantic markdown and metadata from rendered Next.js HTML strings.\r\n * @param html The raw HTML string.\r\n * @param sourceUrl (Optional) the URL this was generated from, to attach as metadata.\r\n * @returns {ExtractionResult} The extracted payload.\r\n */\r\nexport function extractContent(html: string, sourceUrl: string = 'Generated Output'): ExtractionResult {\r\n const originalSize = html.length;\r\n\r\n const $ = cheerio.load(html);\r\n\r\n // 1. Extract Metadata BEFORE removing structure\r\n const title = $('title').text() || $('h1').first().text() || 'Untitled Page';\r\n const description = $('meta[name=\"description\"]').attr('content') || 'No description found.';\r\n\r\n const jsonLdScripts: any[] = [];\r\n $('script[type=\"application/ld+json\"]').each((_, el) => {\r\n try {\r\n const raw = $(el).html() || '';\r\n const parsed = JSON.parse(raw);\r\n jsonLdScripts.push(parsed);\r\n } catch {\r\n // ignore bad json\r\n }\r\n });\r\n\r\n // 2. Strip noise (React boilerplate, styles, unnecessary tags)\r\n $('script, style, noscript, iframe, svg, nav, footer, meta, link, header').remove();\r\n\r\n // Optionally remove typical Next.js hidden wrappers if they don't contain real content.\r\n // Next.js uses <div id=\"__next\"> but we mostly just want semantic content.\r\n\r\n // 3. Find the entry point for content\r\n // Prefer <main> or <article> over <body>\r\n let contentHtml = '';\r\n if ($('main').length > 0) {\r\n contentHtml = $('main').html() || '';\r\n } else if ($('article').length > 0) {\r\n contentHtml = $('article').html() || '';\r\n } else {\r\n contentHtml = $('body').html() || '';\r\n }\r\n\r\n // 4. Convert to Markdown\r\n let markdown = turndownService.turndown(contentHtml);\r\n\r\n // 5. 
Optionally inject Metadata header\r\n const headerLines = [\r\n `# ${title}`,\r\n `> ${description}`,\r\n ``,\r\n `**Source:** ${sourceUrl}`,\r\n `**Extracted:** ${new Date().toISOString()}`,\r\n ``,\r\n `---`,\r\n ``\r\n ];\r\n\r\n let finalMarkdown = headerLines.join('\\n') + markdown;\r\n\r\n // Add JSON-LD section if exists\r\n if (jsonLdScripts.length > 0) {\r\n finalMarkdown += '\\n\\n---\\n## Structured Data (JSON-LD)\\n```json\\n';\r\n jsonLdScripts.forEach(j => {\r\n finalMarkdown += JSON.stringify(j, null, 2) + '\\n';\r\n });\r\n finalMarkdown += '```\\n';\r\n }\r\n\r\n const markdownSize = finalMarkdown.length;\r\n const tokenReductionRatio = originalSize > 0 ? ((originalSize - markdownSize) / originalSize) * 100 : 0;\r\n\r\n return {\r\n markdown: finalMarkdown,\r\n metadata: {\r\n title,\r\n description,\r\n jsonLd: jsonLdScripts\r\n },\r\n stats: {\r\n originalHtmlSize: originalSize,\r\n markdownSize,\r\n tokenReductionRatio\r\n }\r\n };\r\n}\r\n\r\nexport async function generateStaticPayloads(nextAppDirDir: string, ontoPublicDir: string) {\r\n const fs = await import('fs');\r\n const path = await import('path');\r\n const { glob } = await import('glob');\r\n\r\n if (!fs.existsSync(nextAppDirDir)) {\r\n return;\r\n }\r\n\r\n const files = await glob('**/*.html', { cwd: nextAppDirDir });\r\n if (files.length === 0) return;\r\n\r\n if (!fs.existsSync(ontoPublicDir)) {\r\n fs.mkdirSync(ontoPublicDir, { recursive: true });\r\n }\r\n\r\n let totalFilesProcessed = 0;\r\n\r\n for (const file of files) {\r\n const inputPath = path.join(nextAppDirDir, file);\r\n const outputPathRelative = file.replace(/\\.html$/, '.md');\r\n const outputPath = path.join(ontoPublicDir, outputPathRelative);\r\n\r\n try {\r\n const htmlContent = fs.readFileSync(inputPath, 'utf8');\r\n\r\n let routeName = file.replace(/\\.html$/, '');\r\n if (routeName === 'index') routeName = '/';\r\n else routeName = `/${routeName}`;\r\n\r\n const result = extractContent(htmlContent, 
routeName);\r\n\r\n const outputDir = path.dirname(outputPath);\r\n if (!fs.existsSync(outputDir)) {\r\n fs.mkdirSync(outputDir, { recursive: true });\r\n }\r\n\r\n fs.writeFileSync(outputPath, result.markdown, 'utf8');\r\n totalFilesProcessed++;\r\n } catch (e: any) {\r\n console.error(`[Onto] Failed to process ${file}: ${e.message}`);\r\n }\r\n }\r\n console.log(`[Onto] Successfully generated ${totalFilesProcessed} semantic markdown endpoints.`);\r\n}\r\n"],"mappings":";AACA,OAAS,QAAAA,MAAY,OACrB,OAAOC,MAAQ,KACf,OAAOC,MAAU,OACjB,OAAOC,MAAQ,aCJf,UAAYC,MAAa,UACzB,OAAOC,MAAqB,WAE5B,IAAMC,EAAkB,IAAID,EAAgB,CACxC,aAAc,MACd,eAAgB,QACpB,CAAC,EAwBM,SAASE,EAAeC,EAAcC,EAAoB,mBAAsC,CACnG,IAAMC,EAAeF,EAAK,OAEpBG,EAAY,OAAKH,CAAI,EAGrBI,EAAQD,EAAE,OAAO,EAAE,KAAK,GAAKA,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,GAAK,gBACvDE,EAAcF,EAAE,0BAA0B,EAAE,KAAK,SAAS,GAAK,wBAE/DG,EAAuB,CAAC,EAC9BH,EAAE,oCAAoC,EAAE,KAAK,CAACI,EAAGC,IAAO,CACpD,GAAI,CACA,IAAMC,EAAMN,EAAEK,CAAE,EAAE,KAAK,GAAK,GACtBE,EAAS,KAAK,MAAMD,CAAG,EAC7BH,EAAc,KAAKI,CAAM,CAC7B,MAAQ,CAER,CACJ,CAAC,EAGDP,EAAE,uEAAuE,EAAE,OAAO,EAOlF,IAAIQ,EAAc,GACdR,EAAE,MAAM,EAAE,OAAS,EACnBQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAC3BA,EAAE,SAAS,EAAE,OAAS,EAC7BQ,EAAcR,EAAE,SAAS,EAAE,KAAK,GAAK,GAErCQ,EAAcR,EAAE,MAAM,EAAE,KAAK,GAAK,GAItC,IAAIS,EAAWd,EAAgB,SAASa,CAAW,EAc/CE,EAXgB,CAChB,KAAKT,CAAK,GACV,KAAKC,CAAW,GAChB,GACA,eAAeJ,CAAS,GACxB,kBAAkB,IAAI,KAAK,EAAE,YAAY,CAAC,GAC1C,GACA,MACA,EACJ,EAEgC,KAAK;AAAA,CAAI,EAAIW,EAGzCN,EAAc,OAAS,IACvBO,GAAiB;AAAA;AAAA;AAAA;AAAA;AAAA,EACjBP,EAAc,QAAQQ,GAAK,CACvBD,GAAiB,KAAK,UAAUC,EAAG,KAAM,CAAC,EAAI;AAAA,CAClD,CAAC,EACDD,GAAiB,SAGrB,IAAME,EAAeF,EAAc,OAC7BG,EAAsBd,EAAe,GAAMA,EAAea,GAAgBb,EAAgB,IAAM,EAEtG,MAAO,CACH,SAAUW,EACV,SAAU,CACN,MAAAT,EACA,YAAAC,EACA,OAAQC,CACZ,EACA,MAAO,CACH,iBAAkBJ,EAClB,aAAAa,EACA,oBAAAC,CACJ,CACJ,CACJ,CDrGA,SAASC,GAAU,CACf,IAAMC,EAAUC,EAAK,KAAK,QAAQ,IAAI,EAAG,YAAY,EACjDC,EAAG,WAAWF,CAAO,GACFE,EAAG,aAAaF,EAAS,MAAM,EACvC,MAAM,OAAO,EAAE,QAAQG,GAAQ,CACtC,IAAMC,EAAcD,EAAK,KAAK,EAC9B,GAAI,CAACC,GAAeA,EAAY
,WAAW,GAAG,EAAG,OACjD,GAAM,CAACC,EAAK,GAAGC,CAAU,EAAIF,EAAY,MAAM,GAAG,EAC9CC,GAAOC,EAAW,OAAS,IAC3B,QAAQ,IAAID,EAAI,KAAK,CAAC,EAAIC,EAAW,KAAK,GAAG,EAAE,KAAK,EAAE,QAAQ,eAAgB,EAAE,EAExF,CAAC,CAET,CAEA,eAAeC,GAAO,CAClBR,EAAQ,EACR,QAAQ,IAAIS,EAAG,KAAK;AAAA,8CAAiD,CAAC,EAEtE,IAAMC,EAAM,QAAQ,IAAI,EAClBC,EAAgBT,EAAK,KAAKQ,EAAK,kBAAkB,EACjDE,EAAgBV,EAAK,KAAKQ,EAAK,cAAc,EAEnD,GAAI,CAACP,EAAG,WAAWQ,CAAa,EAAG,CAC/B,QAAQ,IAAIF,EAAG,OAAO,+CAA+CE,CAAa,EAAE,CAAC,EACrF,QAAQ,IAAIF,EAAG,OAAO,gFAAgF,CAAC,EACvG,MACJ,CAGA,IAAMI,EAAQ,MAAMC,EAAK,YAAa,CAAE,IAAKH,CAAc,CAAC,EAE5D,GAAIE,EAAM,SAAW,EAAG,CACpB,QAAQ,IAAIJ,EAAG,OAAO,+CAA+C,CAAC,EACtE,MACJ,CAGKN,EAAG,WAAWS,CAAa,GAC5BT,EAAG,UAAUS,EAAe,CAAE,UAAW,EAAK,CAAC,EAGnD,IAAIG,EAAoB,EACpBC,EAAoB,EACpBC,EAAsB,EAE1B,QAAWC,KAAQL,EAAO,CACtB,IAAMM,EAAYjB,EAAK,KAAKS,EAAeO,CAAI,EAG3CE,EAAqBF,EAAK,QAAQ,UAAW,KAAK,EAEhDG,EAAanB,EAAK,KAAKU,EAAeQ,CAAkB,EAE9D,GAAI,CACA,IAAME,EAAcnB,EAAG,aAAagB,EAAW,MAAM,EAE/CI,EAASC,EAAeF,EAAa,IAAIF,EAAmB,QAAQ,QAAS,EAAE,CAAC,EAAE,EAGlFK,EAAYvB,EAAK,QAAQmB,CAAU,EACpClB,EAAG,WAAWsB,CAAS,GACxBtB,EAAG,UAAUsB,EAAW,CAAE,UAAW,EAAK,CAAC,EAG/CtB,EAAG,cAAckB,EAAYE,EAAO,SAAU,MAAM,EAEpDR,GAAqBQ,EAAO,MAAM,iBAClCP,GAAqBO,EAAO,MAAM,aAClCN,IAEA,IAAMS,GAAUH,EAAO,MAAM,iBAAmB,MAAM,QAAQ,CAAC,EACzDI,GAAQJ,EAAO,MAAM,aAAe,MAAM,QAAQ,CAAC,EAGrDK,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCU,IAAc,QAASA,EAAY,IAClCA,EAAY,IAAIA,CAAS,GAE9B,QAAQ,IACJnB,EAAG,MAAM,kBAAa,EACtBA,EAAG,IAAI,IAAImB,CAAS,GAAG,EACvBnB,EAAG,KAAK,IAAIiB,CAAM,SAASC,CAAI,KAAK,CACxC,CACJ,OAASE,EAAQ,CACb,QAAQ,MAAMpB,EAAG,IAAI,4BAAuBS,CAAI,KAAKW,EAAE,OAAO,EAAE,CAAC,CACrE,CACJ,CAEA,QAAQ,IACJpB,EAAG,KACCA,EAAG,QAAQ,aAAaQ,CAAmB,wBAAwBF,EAAoB,MAAM,QAAQ,CAAC,CAAC,UAAUC,EAAoB,MAAM,QAAQ,CAAC,CAAC,IAAI,CAC7J,CACJ,EAGA,IAAMc,EAAe,QAAQ,IAAI,aAC3BC,EAAgB,QAAQ,IAAI,oBAAsB,4BAExD,GAAID,GAAgBb,EAAsB,EAAG,CACzC,QAAQ,IAAIR,EAAG,KAAK,+CAA+CsB,CAAa,MAAM,CAAC,EACvF,GAAI,CACA,IAAMC,EAAWnB,EAAM,IAAIK,GAAQ,CAC/B,IAAMU,EAAYV,EAAK,QAAQ,UAAW,EAAE,EACtCe,EAAQL,IAAc,QAAU,IAAM,IAAIA,CAAS,GACnDM,EAAShC,EAAK,KAAKU,EAAeM,EAAK,QAA
Q,UAAW,KAAK,CAAC,EACtE,MAAO,CACH,MAAAe,EACA,SAAU,GAAGL,CAAS,MACtB,QAASzB,EAAG,aAAa+B,EAAQ,MAAM,CAC3C,CACJ,CAAC,EAEKC,EAAM,MAAM,MAAM,GAAGJ,CAAa,aAAc,CAClD,OAAQ,OACR,QAAS,CACL,aAAcD,EACd,eAAgB,kBACpB,EACA,KAAM,KAAK,UAAU,CAAE,MAAOE,CAAS,CAAC,CAC5C,CAAC,EAED,GAAIG,EAAI,GACJ,QAAQ,IAAI1B,EAAG,MAAM,sCAAiC,CAAC,MACpD,CACH,IAAM2B,EAAU,MAAMD,EAAI,KAAK,EAAE,MAAM,KAAO,CAAC,EAAE,EACjD,QAAQ,IAAI1B,EAAG,OAAO,sCAAiC2B,EAAQ,OAASD,EAAI,UAAU,EAAE,CAAC,CAC7F,CACJ,OAASN,EAAQ,CACb,QAAQ,IAAIpB,EAAG,OAAO,qCAAgCoB,EAAE,OAAO,EAAE,CAAC,CACtE,CACJ,CAEA,QAAQ,IAAIpB,EAAG,IAAI;AAAA,CAA8C,CAAC,CACtE,CAEAD,EAAK,EAAE,MAAMqB,GAAK,CACd,QAAQ,MAAMpB,EAAG,IAAI,uBAAuBoB,EAAE,OAAO,EAAE,CAAC,EACxD,QAAQ,KAAK,CAAC,CAClB,CAAC","names":["glob","fs","path","pc","cheerio","TurndownService","turndownService","extractContent","html","sourceUrl","originalSize","$","title","description","jsonLdScripts","_","el","raw","parsed","contentHtml","markdown","finalMarkdown","j","markdownSize","tokenReductionRatio","loadEnv","envPath","path","fs","line","trimmedLine","key","valueParts","main","pc","cwd","nextAppDirDir","ontoPublicDir","files","glob","totalOriginalSize","totalMarkdownSize","totalFilesProcessed","file","inputPath","outputPathRelative","outputPath","htmlContent","result","extractContent","outputDir","origKb","mdKb","routeName","e","ONTO_API_KEY","DASHBOARD_URL","manifest","route","mdPath","res","errData"]}
|
package/dist/middleware.d.mts
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
import { NextRequest, NextResponse } from 'next/server';
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Comprehensive registry of AI bot user-agent strings.
|
|
5
|
+
* The middleware uses this list to detect AI crawlers and serve optimized markdown.
|
|
6
|
+
*/
|
|
7
|
+
interface AiBot {
|
|
8
|
+
/** The user-agent substring to match against */
|
|
9
|
+
name: string;
|
|
10
|
+
/** The company operating this bot */
|
|
11
|
+
company: string;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Flat list of user-agent substrings for fast matching in the middleware.
|
|
15
|
+
*/
|
|
16
|
+
declare const AI_BOT_USER_AGENTS: string[];
|
|
17
|
+
/**
|
|
18
|
+
* Given a raw user-agent string, returns the matched AiBot entry or undefined.
|
|
19
|
+
* Comparison is case-insensitive to handle inconsistent agent casing.
|
|
20
|
+
*/
|
|
21
|
+
declare function matchBot(userAgent: string): AiBot | undefined;
|
|
4
22
|
|
|
5
|
-
|
|
23
|
+
declare function ontoMiddleware(request: NextRequest): Promise<NextResponse<unknown>>;
|
|
24
|
+
|
|
25
|
+
export { AI_BOT_USER_AGENTS, type AiBot, matchBot, ontoMiddleware };
|
package/dist/middleware.d.ts
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
import { NextRequest, NextResponse } from 'next/server';
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Comprehensive registry of AI bot user-agent strings.
|
|
5
|
+
* The middleware uses this list to detect AI crawlers and serve optimized markdown.
|
|
6
|
+
*/
|
|
7
|
+
interface AiBot {
|
|
8
|
+
/** The user-agent substring to match against */
|
|
9
|
+
name: string;
|
|
10
|
+
/** The company operating this bot */
|
|
11
|
+
company: string;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Flat list of user-agent substrings for fast matching in the middleware.
|
|
15
|
+
*/
|
|
16
|
+
declare const AI_BOT_USER_AGENTS: string[];
|
|
17
|
+
/**
|
|
18
|
+
* Given a raw user-agent string, returns the matched AiBot entry or undefined.
|
|
19
|
+
* Comparison is case-insensitive to handle inconsistent agent casing.
|
|
20
|
+
*/
|
|
21
|
+
declare function matchBot(userAgent: string): AiBot | undefined;
|
|
4
22
|
|
|
5
|
-
|
|
23
|
+
declare function ontoMiddleware(request: NextRequest): Promise<NextResponse<unknown>>;
|
|
24
|
+
|
|
25
|
+
export { AI_BOT_USER_AGENTS, type AiBot, matchBot, ontoMiddleware };
|
package/dist/middleware.js
CHANGED
|
@@ -1,2 +1,6 @@
|
|
|
1
|
-
"use strict";var
|
|
1
|
+
"use strict";var l=Object.defineProperty;var _=Object.getOwnPropertyDescriptor;var w=Object.getOwnPropertyNames;var C=Object.prototype.hasOwnProperty;var T=(e,t)=>{for(var a in t)l(e,a,{get:t[a],enumerable:!0})},R=(e,t,a,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let c of w(t))!C.call(e,c)&&c!==a&&l(e,c,{get:()=>t[c],enumerable:!(n=_(t,c))||n.enumerable});return e};var k=e=>R(l({},"__esModule",{value:!0}),e);var S={};T(S,{AI_BOT_USER_AGENTS:()=>f,matchBot:()=>s,ontoMiddleware:()=>G});module.exports=k(S);var m=require("next/server");var A=[{name:"GPTBot",company:"OpenAI"},{name:"ChatGPT-User",company:"OpenAI"},{name:"OAI-SearchBot",company:"OpenAI"},{name:"Googlebot",company:"Google"},{name:"Google-CloudVertexBot",company:"Google"},{name:"Google-Extended",company:"Google"},{name:"GoogleOther",company:"Google"},{name:"ClaudeBot",company:"Anthropic"},{name:"Claude-User",company:"Anthropic"},{name:"anthropic-ai",company:"Anthropic"},{name:"PerplexityBot",company:"Perplexity"},{name:"Perplexity-User",company:"Perplexity"},{name:"Meta-ExternalAgent",company:"Meta"},{name:"Meta-ExternalFetcher",company:"Meta"},{name:"FacebookBot",company:"Meta"},{name:"CCBot",company:"Common Crawl"},{name:"Bytespider",company:"ByteDance"},{name:"Applebot-Extended",company:"Apple"},{name:"cohere-ai",company:"Cohere"},{name:"YouBot",company:"You.com"}],f=A.map(e=>e.name);function s(e){let t=e.toLowerCase();return A.find(a=>t.includes(a.name.toLowerCase()))}async function G(e){let t=e.headers.get("user-agent")||"",a=e.headers.get("accept")||"",n=s(t),c=!!n,g=a.includes("text/markdown");if(c||g){let r=e.nextUrl.clone();if(r.pathname.startsWith("/_next")||r.pathname.includes("."))return m.NextResponse.next();let o=r.pathname;(o==="/"||o==="")&&(o="/index"),o.endsWith("/")&&o!=="/"&&(o=o.slice(0,-1));let d={"Content-Type":"text/markdown; charset=utf-8","Cache-Control":"public, max-age=3600, s-maxage=3600, stale-while-revalidate=86400"};n&&(d["X-Onto-Bot"]=`${n.name} 
(${n.company})`);let p=process.env.ONTO_API_KEY,y=process.env.ONTO_DASHBOARD_URL||"https://app.buildonto.dev";if(p){fetch(`${y}/api/track`,{method:"POST",headers:{"x-onto-key":p,"Content-Type":"application/json"},body:JSON.stringify({route:r.pathname,userAgent:t,bot:n?n.name:null,company:n?n.company:null})}).catch(()=>{});try{let i=await fetch(`${y}/api/sdk/inject?route=${r.pathname}`,{headers:{"x-onto-key":p},signal:AbortSignal.timeout(1500)});if(i.ok){let{injection:u}=await i.json();if(u){let B=`${r.origin}/.onto${o}.md`,x=await fetch(B);if(x.ok){let O=`${await x.text()}
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
${u}`;return new m.NextResponse(O,{headers:{...d,"X-Onto-Injected":"true"}})}}}}catch(i){console.error("[Onto] Injection failed",i)}}r.pathname=`/.onto${o}.md`;let h=m.NextResponse.rewrite(r);return n&&h.headers.set("X-Onto-Bot",`${n.name} (${n.company})`),h}return m.NextResponse.next()}0&&(module.exports={AI_BOT_USER_AGENTS,matchBot,ontoMiddleware});
|
|
2
6
|
//# sourceMappingURL=middleware.js.map
|
package/dist/middleware.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/middleware.ts"],"sourcesContent":["import { NextRequest, NextResponse } from 'next/server';\r\n\r\nconst AI_BOT_USER_AGENTS = [\r\n 'GPTBot',\r\n 'ChatGPT-User',\r\n 'ClaudeBot',\r\n 'Claude-Web',\r\n 'anthropic-ai',\r\n 'PerplexityBot',\r\n 'OAI-SearchBot',\r\n 'GoogleExtended',\r\n];\r\n\r\nexport function ontoMiddleware(request: NextRequest) {\r\n const userAgent = request.headers.get('user-agent') || '';\r\n const accept = request.headers.get('accept') || '';\r\n\r\n const isAiBot = AI_BOT_USER_AGENTS.some(bot => userAgent.includes(bot));\r\n const isMarkdownRequested = accept.includes('text/markdown');\r\n\r\n // If traffic is identified as an AI Bot, rewrite the URL\r\n if (isAiBot || isMarkdownRequested) {\r\n const url = request.nextUrl.clone();\r\n\r\n // Ignore internal next.js requests & static assets\r\n if (url.pathname.startsWith('/_next') || url.pathname.includes('.')) {\r\n return NextResponse.next();\r\n }\r\n\r\n // Determine the corresponding payload path\r\n let payloadPath = url.pathname;\r\n if (payloadPath === '/' || payloadPath === '') {\r\n payloadPath = '/index';\r\n }\r\n\r\n // Strip trailing slash if present\r\n if (payloadPath.endsWith('/') && payloadPath !== '/') {\r\n payloadPath = payloadPath.slice(0, -1);\r\n }\r\n\r\n url.pathname = `/.onto${payloadPath}.md`;\r\n\r\n // Rewrite implicitly serves the target URL transparently to the client.\r\n return NextResponse.rewrite(url);\r\n }\r\n\r\n return 
NextResponse.next();\r\n}\r\n"],"mappings":"yaAAA,IAAAA,EAAA,GAAAC,EAAAD,EAAA,oBAAAE,IAAA,eAAAC,EAAAH,GAAA,IAAAI,EAA0C,uBAEpCC,EAAqB,CACvB,SACA,eACA,YACA,aACA,eACA,gBACA,gBACA,gBACJ,EAEO,SAASH,EAAeI,EAAsB,CACjD,IAAMC,EAAYD,EAAQ,QAAQ,IAAI,YAAY,GAAK,GACjDE,EAASF,EAAQ,QAAQ,IAAI,QAAQ,GAAK,GAE1CG,EAAUJ,EAAmB,KAAKK,GAAOH,EAAU,SAASG,CAAG,CAAC,EAChEC,EAAsBH,EAAO,SAAS,eAAe,EAG3D,GAAIC,GAAWE,EAAqB,CAChC,IAAMC,EAAMN,EAAQ,QAAQ,MAAM,EAGlC,GAAIM,EAAI,SAAS,WAAW,QAAQ,GAAKA,EAAI,SAAS,SAAS,GAAG,EAC9D,OAAO,eAAa,KAAK,EAI7B,IAAIC,EAAcD,EAAI,SACtB,OAAIC,IAAgB,KAAOA,IAAgB,MACvCA,EAAc,UAIdA,EAAY,SAAS,GAAG,GAAKA,IAAgB,MAC7CA,EAAcA,EAAY,MAAM,EAAG,EAAE,GAGzCD,EAAI,SAAW,SAASC,CAAW,MAG5B,eAAa,QAAQD,CAAG,CACnC,CAEA,OAAO,eAAa,KAAK,CAC7B","names":["middleware_exports","__export","ontoMiddleware","__toCommonJS","import_server","AI_BOT_USER_AGENTS","request","userAgent","accept","isAiBot","bot","isMarkdownRequested","url","payloadPath"]}
|
|
1
|
+
{"version":3,"sources":["../src/middleware.ts","../src/bots.ts"],"sourcesContent":["import { NextRequest, NextResponse } from 'next/server';\r\nimport { AI_BOT_USER_AGENTS, matchBot } from './bots';\r\n\r\nexport async function ontoMiddleware(request: NextRequest) {\r\n const userAgent = request.headers.get('user-agent') || '';\r\n const accept = request.headers.get('accept') || '';\r\n\r\n const matched = matchBot(userAgent);\r\n const isAiBot = !!matched;\r\n const isMarkdownRequested = accept.includes('text/markdown');\r\n\r\n // If traffic is identified as an AI Bot, rewrite the URL\r\n if (isAiBot || isMarkdownRequested) {\r\n const url = request.nextUrl.clone();\r\n\r\n // Ignore internal next.js requests & static assets\r\n if (url.pathname.startsWith('/_next') || url.pathname.includes('.')) {\r\n return NextResponse.next();\r\n }\r\n\r\n // Determine the corresponding payload path\r\n let payloadPath = url.pathname;\r\n if (payloadPath === '/' || payloadPath === '') {\r\n payloadPath = '/index';\r\n }\r\n\r\n // Strip trailing slash if present\r\n if (payloadPath.endsWith('/') && payloadPath !== '/') {\r\n payloadPath = payloadPath.slice(0, -1);\r\n }\r\n\r\n // Common response headers for all bot responses\r\n const botHeaders: Record<string, string> = {\r\n 'Content-Type': 'text/markdown; charset=utf-8',\r\n 'Cache-Control': 'public, max-age=3600, s-maxage=3600, stale-while-revalidate=86400',\r\n };\r\n if (matched) {\r\n botHeaders['X-Onto-Bot'] = `${matched.name} (${matched.company})`;\r\n }\r\n\r\n // --- Onto Control Plane Integration (Premium) ---\r\n const ONTO_API_KEY = process.env.ONTO_API_KEY;\r\n const DASHBOARD_URL = process.env.ONTO_DASHBOARD_URL || 'https://app.buildonto.dev';\r\n\r\n if (ONTO_API_KEY) {\r\n // 1. 
Fire-and-forget tracking — includes structured bot info\r\n fetch(`${DASHBOARD_URL}/api/track`, {\r\n method: 'POST',\r\n headers: {\r\n 'x-onto-key': ONTO_API_KEY,\r\n 'Content-Type': 'application/json'\r\n },\r\n body: JSON.stringify({\r\n route: url.pathname,\r\n userAgent: userAgent,\r\n bot: matched ? matched.name : null,\r\n company: matched ? matched.company : null,\r\n })\r\n }).catch(() => {});\r\n\r\n // 2. Dynamic Context Injection\r\n try {\r\n // Fetch the injection from the Control Plane\r\n const injectRes = await fetch(`${DASHBOARD_URL}/api/sdk/inject?route=${url.pathname}`, {\r\n headers: { 'x-onto-key': ONTO_API_KEY },\r\n // Set a strict timeout to keep edge fast\r\n signal: AbortSignal.timeout(1500)\r\n });\r\n\r\n if (injectRes.ok) {\r\n const { injection } = await injectRes.json();\r\n \r\n if (injection) {\r\n // To inject, we must fetch the local markdown and append\r\n const localMdUrl = `${url.origin}/.onto${payloadPath}.md`;\r\n const mdRes = await fetch(localMdUrl);\r\n \r\n if (mdRes.ok) {\r\n const baseMarkdown = await mdRes.text();\r\n const finalMarkdown = `${baseMarkdown}\\n\\n---\\n\\n${injection}`;\r\n \r\n return new NextResponse(finalMarkdown, {\r\n headers: {\r\n ...botHeaders,\r\n 'X-Onto-Injected': 'true'\r\n }\r\n });\r\n }\r\n }\r\n }\r\n } catch (err) {\r\n console.error('[Onto] Injection failed', err);\r\n }\r\n }\r\n // ------------------------------------------------\r\n\r\n url.pathname = `/.onto${payloadPath}.md`;\r\n\r\n // Rewrite implicitly serves the target URL transparently to the client.\r\n const response = NextResponse.rewrite(url);\r\n // Attach bot identification headers to the rewrite response\r\n if (matched) {\r\n response.headers.set('X-Onto-Bot', `${matched.name} (${matched.company})`);\r\n }\r\n return response;\r\n }\r\n\r\n return NextResponse.next();\r\n}\r\n\r\n// Re-export the bot registry for consumers who want to extend or inspect it\r\nexport { AI_BOT_USER_AGENTS, matchBot } from 
'./bots';\r\nexport type { AiBot } from './bots';\r\n\r\n","/**\n * Comprehensive registry of AI bot user-agent strings.\n * The middleware uses this list to detect AI crawlers and serve optimized markdown.\n */\n\nexport interface AiBot {\n /** The user-agent substring to match against */\n name: string;\n /** The company operating this bot */\n company: string;\n}\n\n/**\n * Structured registry of all known AI bots, grouped by company.\n * Useful for analytics and the Control Plane dashboard.\n */\nexport const AI_BOTS: AiBot[] = [\n // OpenAI\n { name: 'GPTBot', company: 'OpenAI' },\n { name: 'ChatGPT-User', company: 'OpenAI' },\n { name: 'OAI-SearchBot', company: 'OpenAI' },\n\n // Google\n { name: 'Googlebot', company: 'Google' },\n { name: 'Google-CloudVertexBot', company: 'Google' },\n { name: 'Google-Extended', company: 'Google' },\n { name: 'GoogleOther', company: 'Google' },\n\n // Anthropic\n { name: 'ClaudeBot', company: 'Anthropic' },\n { name: 'Claude-User', company: 'Anthropic' },\n { name: 'anthropic-ai', company: 'Anthropic' },\n\n // Perplexity\n { name: 'PerplexityBot', company: 'Perplexity' },\n { name: 'Perplexity-User', company: 'Perplexity' },\n\n // Meta\n { name: 'Meta-ExternalAgent', company: 'Meta' },\n { name: 'Meta-ExternalFetcher', company: 'Meta' },\n { name: 'FacebookBot', company: 'Meta' },\n\n // Common Crawl (used by most smaller AI companies)\n { name: 'CCBot', company: 'Common Crawl' },\n\n // Other notable AI crawlers\n { name: 'Bytespider', company: 'ByteDance' },\n { name: 'Applebot-Extended', company: 'Apple' },\n { name: 'cohere-ai', company: 'Cohere' },\n { name: 'YouBot', company: 'You.com' },\n];\n\n/**\n * Flat list of user-agent substrings for fast matching in the middleware.\n */\nexport const AI_BOT_USER_AGENTS: string[] = AI_BOTS.map(bot => bot.name);\n\n/**\n * Given a raw user-agent string, returns the matched AiBot entry or undefined.\n * Comparison is case-insensitive to handle inconsistent agent casing.\n 
*/\nexport function matchBot(userAgent: string): AiBot | undefined {\n const ua = userAgent.toLowerCase();\n return AI_BOTS.find(bot => ua.includes(bot.name.toLowerCase()));\n}\n"],"mappings":"yaAAA,IAAAA,EAAA,GAAAC,EAAAD,EAAA,wBAAAE,EAAA,aAAAC,EAAA,mBAAAC,IAAA,eAAAC,EAAAL,GAAA,IAAAM,EAA0C,uBCgBnC,IAAMC,EAAmB,CAE5B,CAAE,KAAM,SAAqB,QAAS,QAAS,EAC/C,CAAE,KAAM,eAAoB,QAAS,QAAS,EAC9C,CAAE,KAAM,gBAAoB,QAAS,QAAS,EAG9C,CAAE,KAAM,YAA0B,QAAS,QAAS,EACpD,CAAE,KAAM,wBAA2B,QAAS,QAAS,EACrD,CAAE,KAAM,kBAA2B,QAAS,QAAS,EACrD,CAAE,KAAM,cAA2B,QAAS,QAAS,EAGrD,CAAE,KAAM,YAAmB,QAAS,WAAY,EAChD,CAAE,KAAM,cAAkB,QAAS,WAAY,EAC/C,CAAE,KAAM,eAAkB,QAAS,WAAY,EAG/C,CAAE,KAAM,gBAAmB,QAAS,YAAa,EACjD,CAAE,KAAM,kBAAmB,QAAS,YAAa,EAGjD,CAAE,KAAM,qBAAwB,QAAS,MAAO,EAChD,CAAE,KAAM,uBAAwB,QAAS,MAAO,EAChD,CAAE,KAAM,cAAuB,QAAS,MAAO,EAG/C,CAAE,KAAM,QAAS,QAAS,cAAe,EAGzC,CAAE,KAAM,aAAqB,QAAS,WAAY,EAClD,CAAE,KAAM,oBAAqB,QAAS,OAAQ,EAC9C,CAAE,KAAM,YAAoB,QAAS,QAAS,EAC9C,CAAE,KAAM,SAAoB,QAAS,SAAU,CACnD,EAKaC,EAA+BD,EAAQ,IAAIE,GAAOA,EAAI,IAAI,EAMhE,SAASC,EAASC,EAAsC,CAC3D,IAAMC,EAAKD,EAAU,YAAY,EACjC,OAAOJ,EAAQ,KAAKE,GAAOG,EAAG,SAASH,EAAI,KAAK,YAAY,CAAC,CAAC,CAClE,CD7DA,eAAsBI,EAAeC,EAAsB,CACvD,IAAMC,EAAYD,EAAQ,QAAQ,IAAI,YAAY,GAAK,GACjDE,EAASF,EAAQ,QAAQ,IAAI,QAAQ,GAAK,GAE1CG,EAAUC,EAASH,CAAS,EAC5BI,EAAU,CAAC,CAACF,EACZG,EAAsBJ,EAAO,SAAS,eAAe,EAG3D,GAAIG,GAAWC,EAAqB,CAChC,IAAMC,EAAMP,EAAQ,QAAQ,MAAM,EAGlC,GAAIO,EAAI,SAAS,WAAW,QAAQ,GAAKA,EAAI,SAAS,SAAS,GAAG,EAC9D,OAAO,eAAa,KAAK,EAI7B,IAAIC,EAAcD,EAAI,UAClBC,IAAgB,KAAOA,IAAgB,MACvCA,EAAc,UAIdA,EAAY,SAAS,GAAG,GAAKA,IAAgB,MAC7CA,EAAcA,EAAY,MAAM,EAAG,EAAE,GAIzC,IAAMC,EAAqC,CACvC,eAAgB,+BAChB,gBAAiB,mEACrB,EACIN,IACAM,EAAW,YAAY,EAAI,GAAGN,EAAQ,IAAI,KAAKA,EAAQ,OAAO,KAIlE,IAAMO,EAAe,QAAQ,IAAI,aAC3BC,EAAgB,QAAQ,IAAI,oBAAsB,4BAExD,GAAID,EAAc,CAEd,MAAM,GAAGC,CAAa,aAAc,CAChC,OAAQ,OACR,QAAS,CACL,aAAcD,EACd,eAAgB,kBACpB,EACA,KAAM,KAAK,UAAU,CACjB,MAAOH,EAAI,SACX,UAAWN,EACX,IAAKE,EAAUA,EAAQ,KAAO,KAC9B,QAASA,EAAUA,EAAQ,QAAU,IACzC,CAAC,CACL,CAAC,EAAE,MAAM,IAAM,CAAC,CAAC,EAGjB,GAAI,CAEA,IAAM
S,EAAY,MAAM,MAAM,GAAGD,CAAa,yBAAyBJ,EAAI,QAAQ,GAAI,CACnF,QAAS,CAAE,aAAcG,CAAa,EAEtC,OAAQ,YAAY,QAAQ,IAAI,CACpC,CAAC,EAED,GAAIE,EAAU,GAAI,CACd,GAAM,CAAE,UAAAC,CAAU,EAAI,MAAMD,EAAU,KAAK,EAE3C,GAAIC,EAAW,CAEX,IAAMC,EAAa,GAAGP,EAAI,MAAM,SAASC,CAAW,MAC9CO,EAAQ,MAAM,MAAMD,CAAU,EAEpC,GAAIC,EAAM,GAAI,CAEV,IAAMC,EAAgB,GADD,MAAMD,EAAM,KAAK,CACD;AAAA;AAAA;AAAA;AAAA,EAAcF,CAAS,GAE5D,OAAO,IAAI,eAAaG,EAAe,CACnC,QAAS,CACL,GAAGP,EACH,kBAAmB,MACvB,CACJ,CAAC,CACL,CACJ,CACJ,CACJ,OAASQ,EAAK,CACV,QAAQ,MAAM,0BAA2BA,CAAG,CAChD,CACJ,CAGAV,EAAI,SAAW,SAASC,CAAW,MAGnC,IAAMU,EAAW,eAAa,QAAQX,CAAG,EAEzC,OAAIJ,GACAe,EAAS,QAAQ,IAAI,aAAc,GAAGf,EAAQ,IAAI,KAAKA,EAAQ,OAAO,GAAG,EAEtEe,CACX,CAEA,OAAO,eAAa,KAAK,CAC7B","names":["middleware_exports","__export","AI_BOT_USER_AGENTS","matchBot","ontoMiddleware","__toCommonJS","import_server","AI_BOTS","AI_BOT_USER_AGENTS","bot","matchBot","userAgent","ua","ontoMiddleware","request","userAgent","accept","matched","matchBot","isAiBot","isMarkdownRequested","url","payloadPath","botHeaders","ONTO_API_KEY","DASHBOARD_URL","injectRes","injection","localMdUrl","mdRes","finalMarkdown","err","response"]}
|
package/dist/middleware.mjs
CHANGED
|
@@ -1,2 +1,6 @@
|
|
|
1
|
-
import{NextResponse as
|
|
1
|
+
import{NextResponse as c}from"next/server";var u=[{name:"GPTBot",company:"OpenAI"},{name:"ChatGPT-User",company:"OpenAI"},{name:"OAI-SearchBot",company:"OpenAI"},{name:"Googlebot",company:"Google"},{name:"Google-CloudVertexBot",company:"Google"},{name:"Google-Extended",company:"Google"},{name:"GoogleOther",company:"Google"},{name:"ClaudeBot",company:"Anthropic"},{name:"Claude-User",company:"Anthropic"},{name:"anthropic-ai",company:"Anthropic"},{name:"PerplexityBot",company:"Perplexity"},{name:"Perplexity-User",company:"Perplexity"},{name:"Meta-ExternalAgent",company:"Meta"},{name:"Meta-ExternalFetcher",company:"Meta"},{name:"FacebookBot",company:"Meta"},{name:"CCBot",company:"Common Crawl"},{name:"Bytespider",company:"ByteDance"},{name:"Applebot-Extended",company:"Apple"},{name:"cohere-ai",company:"Cohere"},{name:"YouBot",company:"You.com"}],B=u.map(o=>o.name);function s(o){let a=o.toLowerCase();return u.find(m=>a.includes(m.name.toLowerCase()))}async function k(o){let a=o.headers.get("user-agent")||"",m=o.headers.get("accept")||"",e=s(a),x=!!e,A=m.includes("text/markdown");if(x||A){let n=o.nextUrl.clone();if(n.pathname.startsWith("/_next")||n.pathname.includes("."))return c.next();let t=n.pathname;(t==="/"||t==="")&&(t="/index"),t.endsWith("/")&&t!=="/"&&(t=t.slice(0,-1));let p={"Content-Type":"text/markdown; charset=utf-8","Cache-Control":"public, max-age=3600, s-maxage=3600, stale-while-revalidate=86400"};e&&(p["X-Onto-Bot"]=`${e.name} (${e.company})`);let i=process.env.ONTO_API_KEY,l=process.env.ONTO_DASHBOARD_URL||"https://app.buildonto.dev";if(i){fetch(`${l}/api/track`,{method:"POST",headers:{"x-onto-key":i,"Content-Type":"application/json"},body:JSON.stringify({route:n.pathname,userAgent:a,bot:e?e.name:null,company:e?e.company:null})}).catch(()=>{});try{let r=await fetch(`${l}/api/sdk/inject?route=${n.pathname}`,{headers:{"x-onto-key":i},signal:AbortSignal.timeout(1500)});if(r.ok){let{injection:y}=await r.json();if(y){let f=`${n.origin}/.onto${t}.md`,h=await 
fetch(f);if(h.ok){let g=`${await h.text()}
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
${y}`;return new c(g,{headers:{...p,"X-Onto-Injected":"true"}})}}}}catch(r){console.error("[Onto] Injection failed",r)}}n.pathname=`/.onto${t}.md`;let d=c.rewrite(n);return e&&d.headers.set("X-Onto-Bot",`${e.name} (${e.company})`),d}return c.next()}export{B as AI_BOT_USER_AGENTS,s as matchBot,k as ontoMiddleware};
|
|
2
6
|
//# sourceMappingURL=middleware.mjs.map
|
package/dist/middleware.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/middleware.ts"],"sourcesContent":["import { NextRequest, NextResponse } from 'next/server';\r\n\r\nconst AI_BOT_USER_AGENTS = [\r\n 'GPTBot',\r\n 'ChatGPT-User',\r\n 'ClaudeBot',\r\n 'Claude-Web',\r\n 'anthropic-ai',\r\n 'PerplexityBot',\r\n 'OAI-SearchBot',\r\n 'GoogleExtended',\r\n];\r\n\r\nexport function ontoMiddleware(request: NextRequest) {\r\n const userAgent = request.headers.get('user-agent') || '';\r\n const accept = request.headers.get('accept') || '';\r\n\r\n const isAiBot = AI_BOT_USER_AGENTS.some(bot => userAgent.includes(bot));\r\n const isMarkdownRequested = accept.includes('text/markdown');\r\n\r\n // If traffic is identified as an AI Bot, rewrite the URL\r\n if (isAiBot || isMarkdownRequested) {\r\n const url = request.nextUrl.clone();\r\n\r\n // Ignore internal next.js requests & static assets\r\n if (url.pathname.startsWith('/_next') || url.pathname.includes('.')) {\r\n return NextResponse.next();\r\n }\r\n\r\n // Determine the corresponding payload path\r\n let payloadPath = url.pathname;\r\n if (payloadPath === '/' || payloadPath === '') {\r\n payloadPath = '/index';\r\n }\r\n\r\n // Strip trailing slash if present\r\n if (payloadPath.endsWith('/') && payloadPath !== '/') {\r\n payloadPath = payloadPath.slice(0, -1);\r\n }\r\n\r\n url.pathname = `/.onto${payloadPath}.md`;\r\n\r\n // Rewrite implicitly serves the target URL transparently to the client.\r\n return NextResponse.rewrite(url);\r\n }\r\n\r\n return 
NextResponse.next();\r\n}\r\n"],"mappings":"AAAA,OAAsB,gBAAAA,MAAoB,cAE1C,IAAMC,EAAqB,CACvB,SACA,eACA,YACA,aACA,eACA,gBACA,gBACA,gBACJ,EAEO,SAASC,EAAeC,EAAsB,CACjD,IAAMC,EAAYD,EAAQ,QAAQ,IAAI,YAAY,GAAK,GACjDE,EAASF,EAAQ,QAAQ,IAAI,QAAQ,GAAK,GAE1CG,EAAUL,EAAmB,KAAKM,GAAOH,EAAU,SAASG,CAAG,CAAC,EAChEC,EAAsBH,EAAO,SAAS,eAAe,EAG3D,GAAIC,GAAWE,EAAqB,CAChC,IAAMC,EAAMN,EAAQ,QAAQ,MAAM,EAGlC,GAAIM,EAAI,SAAS,WAAW,QAAQ,GAAKA,EAAI,SAAS,SAAS,GAAG,EAC9D,OAAOT,EAAa,KAAK,EAI7B,IAAIU,EAAcD,EAAI,SACtB,OAAIC,IAAgB,KAAOA,IAAgB,MACvCA,EAAc,UAIdA,EAAY,SAAS,GAAG,GAAKA,IAAgB,MAC7CA,EAAcA,EAAY,MAAM,EAAG,EAAE,GAGzCD,EAAI,SAAW,SAASC,CAAW,MAG5BV,EAAa,QAAQS,CAAG,CACnC,CAEA,OAAOT,EAAa,KAAK,CAC7B","names":["NextResponse","AI_BOT_USER_AGENTS","ontoMiddleware","request","userAgent","accept","isAiBot","bot","isMarkdownRequested","url","payloadPath"]}
|
|
1
|
+
{"version":3,"sources":["../src/middleware.ts","../src/bots.ts"],"sourcesContent":["import { NextRequest, NextResponse } from 'next/server';\r\nimport { AI_BOT_USER_AGENTS, matchBot } from './bots';\r\n\r\nexport async function ontoMiddleware(request: NextRequest) {\r\n const userAgent = request.headers.get('user-agent') || '';\r\n const accept = request.headers.get('accept') || '';\r\n\r\n const matched = matchBot(userAgent);\r\n const isAiBot = !!matched;\r\n const isMarkdownRequested = accept.includes('text/markdown');\r\n\r\n // If traffic is identified as an AI Bot, rewrite the URL\r\n if (isAiBot || isMarkdownRequested) {\r\n const url = request.nextUrl.clone();\r\n\r\n // Ignore internal next.js requests & static assets\r\n if (url.pathname.startsWith('/_next') || url.pathname.includes('.')) {\r\n return NextResponse.next();\r\n }\r\n\r\n // Determine the corresponding payload path\r\n let payloadPath = url.pathname;\r\n if (payloadPath === '/' || payloadPath === '') {\r\n payloadPath = '/index';\r\n }\r\n\r\n // Strip trailing slash if present\r\n if (payloadPath.endsWith('/') && payloadPath !== '/') {\r\n payloadPath = payloadPath.slice(0, -1);\r\n }\r\n\r\n // Common response headers for all bot responses\r\n const botHeaders: Record<string, string> = {\r\n 'Content-Type': 'text/markdown; charset=utf-8',\r\n 'Cache-Control': 'public, max-age=3600, s-maxage=3600, stale-while-revalidate=86400',\r\n };\r\n if (matched) {\r\n botHeaders['X-Onto-Bot'] = `${matched.name} (${matched.company})`;\r\n }\r\n\r\n // --- Onto Control Plane Integration (Premium) ---\r\n const ONTO_API_KEY = process.env.ONTO_API_KEY;\r\n const DASHBOARD_URL = process.env.ONTO_DASHBOARD_URL || 'https://app.buildonto.dev';\r\n\r\n if (ONTO_API_KEY) {\r\n // 1. 
Fire-and-forget tracking — includes structured bot info\r\n fetch(`${DASHBOARD_URL}/api/track`, {\r\n method: 'POST',\r\n headers: {\r\n 'x-onto-key': ONTO_API_KEY,\r\n 'Content-Type': 'application/json'\r\n },\r\n body: JSON.stringify({\r\n route: url.pathname,\r\n userAgent: userAgent,\r\n bot: matched ? matched.name : null,\r\n company: matched ? matched.company : null,\r\n })\r\n }).catch(() => {});\r\n\r\n // 2. Dynamic Context Injection\r\n try {\r\n // Fetch the injection from the Control Plane\r\n const injectRes = await fetch(`${DASHBOARD_URL}/api/sdk/inject?route=${url.pathname}`, {\r\n headers: { 'x-onto-key': ONTO_API_KEY },\r\n // Set a strict timeout to keep edge fast\r\n signal: AbortSignal.timeout(1500)\r\n });\r\n\r\n if (injectRes.ok) {\r\n const { injection } = await injectRes.json();\r\n \r\n if (injection) {\r\n // To inject, we must fetch the local markdown and append\r\n const localMdUrl = `${url.origin}/.onto${payloadPath}.md`;\r\n const mdRes = await fetch(localMdUrl);\r\n \r\n if (mdRes.ok) {\r\n const baseMarkdown = await mdRes.text();\r\n const finalMarkdown = `${baseMarkdown}\\n\\n---\\n\\n${injection}`;\r\n \r\n return new NextResponse(finalMarkdown, {\r\n headers: {\r\n ...botHeaders,\r\n 'X-Onto-Injected': 'true'\r\n }\r\n });\r\n }\r\n }\r\n }\r\n } catch (err) {\r\n console.error('[Onto] Injection failed', err);\r\n }\r\n }\r\n // ------------------------------------------------\r\n\r\n url.pathname = `/.onto${payloadPath}.md`;\r\n\r\n // Rewrite implicitly serves the target URL transparently to the client.\r\n const response = NextResponse.rewrite(url);\r\n // Attach bot identification headers to the rewrite response\r\n if (matched) {\r\n response.headers.set('X-Onto-Bot', `${matched.name} (${matched.company})`);\r\n }\r\n return response;\r\n }\r\n\r\n return NextResponse.next();\r\n}\r\n\r\n// Re-export the bot registry for consumers who want to extend or inspect it\r\nexport { AI_BOT_USER_AGENTS, matchBot } from 
'./bots';\r\nexport type { AiBot } from './bots';\r\n\r\n","/**\n * Comprehensive registry of AI bot user-agent strings.\n * The middleware uses this list to detect AI crawlers and serve optimized markdown.\n */\n\nexport interface AiBot {\n /** The user-agent substring to match against */\n name: string;\n /** The company operating this bot */\n company: string;\n}\n\n/**\n * Structured registry of all known AI bots, grouped by company.\n * Useful for analytics and the Control Plane dashboard.\n */\nexport const AI_BOTS: AiBot[] = [\n // OpenAI\n { name: 'GPTBot', company: 'OpenAI' },\n { name: 'ChatGPT-User', company: 'OpenAI' },\n { name: 'OAI-SearchBot', company: 'OpenAI' },\n\n // Google\n { name: 'Googlebot', company: 'Google' },\n { name: 'Google-CloudVertexBot', company: 'Google' },\n { name: 'Google-Extended', company: 'Google' },\n { name: 'GoogleOther', company: 'Google' },\n\n // Anthropic\n { name: 'ClaudeBot', company: 'Anthropic' },\n { name: 'Claude-User', company: 'Anthropic' },\n { name: 'anthropic-ai', company: 'Anthropic' },\n\n // Perplexity\n { name: 'PerplexityBot', company: 'Perplexity' },\n { name: 'Perplexity-User', company: 'Perplexity' },\n\n // Meta\n { name: 'Meta-ExternalAgent', company: 'Meta' },\n { name: 'Meta-ExternalFetcher', company: 'Meta' },\n { name: 'FacebookBot', company: 'Meta' },\n\n // Common Crawl (used by most smaller AI companies)\n { name: 'CCBot', company: 'Common Crawl' },\n\n // Other notable AI crawlers\n { name: 'Bytespider', company: 'ByteDance' },\n { name: 'Applebot-Extended', company: 'Apple' },\n { name: 'cohere-ai', company: 'Cohere' },\n { name: 'YouBot', company: 'You.com' },\n];\n\n/**\n * Flat list of user-agent substrings for fast matching in the middleware.\n */\nexport const AI_BOT_USER_AGENTS: string[] = AI_BOTS.map(bot => bot.name);\n\n/**\n * Given a raw user-agent string, returns the matched AiBot entry or undefined.\n * Comparison is case-insensitive to handle inconsistent agent casing.\n 
*/\nexport function matchBot(userAgent: string): AiBot | undefined {\n const ua = userAgent.toLowerCase();\n return AI_BOTS.find(bot => ua.includes(bot.name.toLowerCase()));\n}\n"],"mappings":"AAAA,OAAsB,gBAAAA,MAAoB,cCgBnC,IAAMC,EAAmB,CAE5B,CAAE,KAAM,SAAqB,QAAS,QAAS,EAC/C,CAAE,KAAM,eAAoB,QAAS,QAAS,EAC9C,CAAE,KAAM,gBAAoB,QAAS,QAAS,EAG9C,CAAE,KAAM,YAA0B,QAAS,QAAS,EACpD,CAAE,KAAM,wBAA2B,QAAS,QAAS,EACrD,CAAE,KAAM,kBAA2B,QAAS,QAAS,EACrD,CAAE,KAAM,cAA2B,QAAS,QAAS,EAGrD,CAAE,KAAM,YAAmB,QAAS,WAAY,EAChD,CAAE,KAAM,cAAkB,QAAS,WAAY,EAC/C,CAAE,KAAM,eAAkB,QAAS,WAAY,EAG/C,CAAE,KAAM,gBAAmB,QAAS,YAAa,EACjD,CAAE,KAAM,kBAAmB,QAAS,YAAa,EAGjD,CAAE,KAAM,qBAAwB,QAAS,MAAO,EAChD,CAAE,KAAM,uBAAwB,QAAS,MAAO,EAChD,CAAE,KAAM,cAAuB,QAAS,MAAO,EAG/C,CAAE,KAAM,QAAS,QAAS,cAAe,EAGzC,CAAE,KAAM,aAAqB,QAAS,WAAY,EAClD,CAAE,KAAM,oBAAqB,QAAS,OAAQ,EAC9C,CAAE,KAAM,YAAoB,QAAS,QAAS,EAC9C,CAAE,KAAM,SAAoB,QAAS,SAAU,CACnD,EAKaC,EAA+BD,EAAQ,IAAIE,GAAOA,EAAI,IAAI,EAMhE,SAASC,EAASC,EAAsC,CAC3D,IAAMC,EAAKD,EAAU,YAAY,EACjC,OAAOJ,EAAQ,KAAKE,GAAOG,EAAG,SAASH,EAAI,KAAK,YAAY,CAAC,CAAC,CAClE,CD7DA,eAAsBI,EAAeC,EAAsB,CACvD,IAAMC,EAAYD,EAAQ,QAAQ,IAAI,YAAY,GAAK,GACjDE,EAASF,EAAQ,QAAQ,IAAI,QAAQ,GAAK,GAE1CG,EAAUC,EAASH,CAAS,EAC5BI,EAAU,CAAC,CAACF,EACZG,EAAsBJ,EAAO,SAAS,eAAe,EAG3D,GAAIG,GAAWC,EAAqB,CAChC,IAAMC,EAAMP,EAAQ,QAAQ,MAAM,EAGlC,GAAIO,EAAI,SAAS,WAAW,QAAQ,GAAKA,EAAI,SAAS,SAAS,GAAG,EAC9D,OAAOC,EAAa,KAAK,EAI7B,IAAIC,EAAcF,EAAI,UAClBE,IAAgB,KAAOA,IAAgB,MACvCA,EAAc,UAIdA,EAAY,SAAS,GAAG,GAAKA,IAAgB,MAC7CA,EAAcA,EAAY,MAAM,EAAG,EAAE,GAIzC,IAAMC,EAAqC,CACvC,eAAgB,+BAChB,gBAAiB,mEACrB,EACIP,IACAO,EAAW,YAAY,EAAI,GAAGP,EAAQ,IAAI,KAAKA,EAAQ,OAAO,KAIlE,IAAMQ,EAAe,QAAQ,IAAI,aAC3BC,EAAgB,QAAQ,IAAI,oBAAsB,4BAExD,GAAID,EAAc,CAEd,MAAM,GAAGC,CAAa,aAAc,CAChC,OAAQ,OACR,QAAS,CACL,aAAcD,EACd,eAAgB,kBACpB,EACA,KAAM,KAAK,UAAU,CACjB,MAAOJ,EAAI,SACX,UAAWN,EACX,IAAKE,EAAUA,EAAQ,KAAO,KAC9B,QAASA,EAAUA,EAAQ,QAAU,IACzC,CAAC,CACL,CAAC,EAAE,MAAM,IAAM,CAAC,CAAC,EAGjB,GAAI,CAEA,IAAMU,EAAY,MAAM,MAAM,GAAGD,CAAa,yBAAyBL,EAAI,QAAQ,GAAI,CACnF,QAAS,CAAE,aAAcI,C
AAa,EAEtC,OAAQ,YAAY,QAAQ,IAAI,CACpC,CAAC,EAED,GAAIE,EAAU,GAAI,CACd,GAAM,CAAE,UAAAC,CAAU,EAAI,MAAMD,EAAU,KAAK,EAE3C,GAAIC,EAAW,CAEX,IAAMC,EAAa,GAAGR,EAAI,MAAM,SAASE,CAAW,MAC9CO,EAAQ,MAAM,MAAMD,CAAU,EAEpC,GAAIC,EAAM,GAAI,CAEV,IAAMC,EAAgB,GADD,MAAMD,EAAM,KAAK,CACD;AAAA;AAAA;AAAA;AAAA,EAAcF,CAAS,GAE5D,OAAO,IAAIN,EAAaS,EAAe,CACnC,QAAS,CACL,GAAGP,EACH,kBAAmB,MACvB,CACJ,CAAC,CACL,CACJ,CACJ,CACJ,OAASQ,EAAK,CACV,QAAQ,MAAM,0BAA2BA,CAAG,CAChD,CACJ,CAGAX,EAAI,SAAW,SAASE,CAAW,MAGnC,IAAMU,EAAWX,EAAa,QAAQD,CAAG,EAEzC,OAAIJ,GACAgB,EAAS,QAAQ,IAAI,aAAc,GAAGhB,EAAQ,IAAI,KAAKA,EAAQ,OAAO,GAAG,EAEtEgB,CACX,CAEA,OAAOX,EAAa,KAAK,CAC7B","names":["NextResponse","AI_BOTS","AI_BOT_USER_AGENTS","bot","matchBot","userAgent","ua","ontoMiddleware","request","userAgent","accept","matched","matchBot","isAiBot","isMarkdownRequested","url","NextResponse","payloadPath","botHeaders","ONTO_API_KEY","DASHBOARD_URL","injectRes","injection","localMdUrl","mdRes","finalMarkdown","err","response"]}
|
package/package.json
CHANGED
package/src/bots.ts
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Comprehensive registry of AI bot user-agent strings.
|
|
3
|
+
* The middleware uses this list to detect AI crawlers and serve optimized markdown.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export interface AiBot {
    /**
     * Token searched for inside a request's User-Agent header.
     * `matchBot` compares it as a case-insensitive substring, not as a full match.
     */
    name: string;

    /** Name of the company that operates the bot, e.g. `'OpenAI'`. */
    company: string;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Registry of every AI bot the SDK recognises: one `{ name, company }` entry per
 * user-agent token, listed company by company.
 *
 * `matchBot` walks this array front to back and returns the first entry whose
 * token occurs in the User-Agent, so the order below is significant whenever
 * one token could be contained in another.
 *
 * NOTE(review): `Googlebot` looks like Google's general search crawler rather
 * than an AI-only agent, and `Google-Extended` / `Applebot-Extended` appear to
 * be robots.txt product tokens that are never sent as a User-Agent — confirm
 * against the vendors' crawler documentation that these entries are intended.
 */
export const AI_BOTS: AiBot[] = (
    [
        ['OpenAI', ['GPTBot', 'ChatGPT-User', 'OAI-SearchBot']],
        ['Google', ['Googlebot', 'Google-CloudVertexBot', 'Google-Extended', 'GoogleOther']],
        ['Anthropic', ['ClaudeBot', 'Claude-User', 'anthropic-ai']],
        ['Perplexity', ['PerplexityBot', 'Perplexity-User']],
        ['Meta', ['Meta-ExternalAgent', 'Meta-ExternalFetcher', 'FacebookBot']],

        // Common Crawl (used by most smaller AI companies)
        ['Common Crawl', ['CCBot']],

        // Other notable AI crawlers
        ['ByteDance', ['Bytespider']],
        ['Apple', ['Applebot-Extended']],
        ['Cohere', ['cohere-ai']],
        ['You.com', ['YouBot']],
    ] as Array<[string, string[]]>
).flatMap(([company, names]) => names.map(name => ({ name, company })));
|
|
52
|
+
|
|
53
|
+
/**
 * The `name` token of every `AI_BOTS` entry, in registry order.
 *
 * Convenience export for callers that only need the raw tokens; bot detection
 * itself goes through `matchBot`. The list is computed once when the module
 * loads, so entries pushed onto `AI_BOTS` afterwards do not show up here.
 */
export const AI_BOT_USER_AGENTS: string[] = AI_BOTS.map(({ name }) => name);
|
|
57
|
+
|
|
58
|
+
/**
 * Looks up which registered AI bot, if any, sent the given User-Agent header.
 *
 * The header and every registry token are lower-cased before comparison, which
 * makes this a case-insensitive substring test. Entries are tried in `AI_BOTS`
 * order and the first hit is returned.
 *
 * @param userAgent - Raw User-Agent header value; an empty string matches nothing.
 * @returns The first `AI_BOTS` entry whose token occurs in `userAgent`, or
 *          `undefined` when none does.
 */
export function matchBot(userAgent: string): AiBot | undefined {
    const haystack = userAgent.toLowerCase();

    for (const candidate of AI_BOTS) {
        if (haystack.includes(candidate.name.toLowerCase())) {
            return candidate;
        }
    }

    return undefined;
}
|
package/src/cli.ts
CHANGED
|
@@ -5,7 +5,24 @@ import path from 'path';
|
|
|
5
5
|
import pc from 'picocolors';
|
|
6
6
|
import { extractContent } from './extractor';
|
|
7
7
|
|
|
8
|
+
// Minimal loader for `.env.local` in the current working directory.
//
// Each `KEY=value` line is copied into process.env. Blank lines and lines
// starting with `#` are skipped, as are lines with no `=` or with an empty key.
// Only the first `=` separates key from value, so values may contain `=`.
//
// Precedence: a variable that is already present in process.env is never
// overwritten, so values supplied by the shell or CI win over the file.
//
// Quotes: a value is unquoted only when it is wrapped in one matching pair of
// single or double quotes; a lone leading or trailing quote is kept verbatim.
//
// Does nothing when the file does not exist. Read errors on an existing file
// propagate to the caller.
function loadEnv() {
  const envPath = path.join(process.cwd(), '.env.local');
  if (!fs.existsSync(envPath)) return;

  const envContent = fs.readFileSync(envPath, 'utf8');

  for (const rawLine of envContent.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line || line.startsWith('#')) continue;

    // Index 0 means an empty key, -1 means there is no assignment at all.
    const separatorIndex = line.indexOf('=');
    if (separatorIndex <= 0) continue;

    const key = line.slice(0, separatorIndex).trim();

    // The real environment takes precedence over the file.
    if (process.env[key] !== undefined) continue;

    let value = line.slice(separatorIndex + 1).trim();
    const opener = value.charAt(0);
    const isWrapped =
      value.length >= 2 && (opener === '"' || opener === "'") && value.endsWith(opener);
    if (isWrapped) {
      value = value.slice(1, -1);
    }

    process.env[key] = value;
  }
}
|
|
23
|
+
|
|
8
24
|
async function main() {
|
|
25
|
+
loadEnv();
|
|
9
26
|
console.log(pc.cyan('\n[Onto] Starting Semantic Output Generation...'));
|
|
10
27
|
|
|
11
28
|
const cwd = process.cwd();
|
|
@@ -78,12 +95,50 @@ async function main() {
|
|
|
78
95
|
}
|
|
79
96
|
}
|
|
80
97
|
|
|
81
|
-
console.log(pc.cyan(`\n[Onto] Finished generation.`));
|
|
82
98
|
console.log(
|
|
83
99
|
pc.bold(
|
|
84
100
|
pc.magenta(`Processed ${totalFilesProcessed} pages. Total Size: ${(totalOriginalSize / 1024).toFixed(1)}KB -> ${(totalMarkdownSize / 1024).toFixed(1)}KB`)
|
|
85
101
|
)
|
|
86
102
|
);
|
|
103
|
+
|
|
104
|
+
// Sync with Onto Control Plane (Premium)
|
|
105
|
+
const ONTO_API_KEY = process.env.ONTO_API_KEY;
|
|
106
|
+
const DASHBOARD_URL = process.env.ONTO_DASHBOARD_URL || 'https://app.buildonto.dev';
|
|
107
|
+
|
|
108
|
+
if (ONTO_API_KEY && totalFilesProcessed > 0) {
|
|
109
|
+
console.log(pc.cyan(`[Onto] Syncing manifest with Control Plane [${DASHBOARD_URL}]...`));
|
|
110
|
+
try {
|
|
111
|
+
const manifest = files.map(file => {
|
|
112
|
+
const routeName = file.replace(/\.html$/, '');
|
|
113
|
+
const route = routeName === 'index' ? '/' : `/${routeName}`;
|
|
114
|
+
const mdPath = path.join(ontoPublicDir, file.replace(/\.html$/, '.md'));
|
|
115
|
+
return {
|
|
116
|
+
route,
|
|
117
|
+
filename: `${routeName}.md`,
|
|
118
|
+
content: fs.readFileSync(mdPath, 'utf8')
|
|
119
|
+
};
|
|
120
|
+
});
|
|
121
|
+
|
|
122
|
+
const res = await fetch(`${DASHBOARD_URL}/api/files`, {
|
|
123
|
+
method: 'POST',
|
|
124
|
+
headers: {
|
|
125
|
+
'x-onto-key': ONTO_API_KEY,
|
|
126
|
+
'Content-Type': 'application/json'
|
|
127
|
+
},
|
|
128
|
+
body: JSON.stringify({ files: manifest })
|
|
129
|
+
});
|
|
130
|
+
|
|
131
|
+
if (res.ok) {
|
|
132
|
+
console.log(pc.green('✓ Control Plane sync successful'));
|
|
133
|
+
} else {
|
|
134
|
+
const errData = await res.json().catch(() => ({}));
|
|
135
|
+
console.log(pc.yellow(`⚠ Control Plane sync skipped: ${errData.error || res.statusText}`));
|
|
136
|
+
}
|
|
137
|
+
} catch (e: any) {
|
|
138
|
+
console.log(pc.yellow(`⚠ Control Plane sync failed: ${e.message}`));
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
87
142
|
console.log(pc.dim(`Edge payloads are ready at /public/.onto/*\n`));
|
|
88
143
|
}
|
|
89
144
|
|
package/src/middleware.ts
CHANGED
|
@@ -1,21 +1,12 @@
|
|
|
1
1
|
import { NextRequest, NextResponse } from 'next/server';
|
|
2
|
+
import { AI_BOT_USER_AGENTS, matchBot } from './bots';
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
'GPTBot',
|
|
5
|
-
'ChatGPT-User',
|
|
6
|
-
'ClaudeBot',
|
|
7
|
-
'Claude-Web',
|
|
8
|
-
'anthropic-ai',
|
|
9
|
-
'PerplexityBot',
|
|
10
|
-
'OAI-SearchBot',
|
|
11
|
-
'GoogleExtended',
|
|
12
|
-
];
|
|
13
|
-
|
|
14
|
-
export function ontoMiddleware(request: NextRequest) {
|
|
4
|
+
export async function ontoMiddleware(request: NextRequest) {
|
|
15
5
|
const userAgent = request.headers.get('user-agent') || '';
|
|
16
6
|
const accept = request.headers.get('accept') || '';
|
|
17
7
|
|
|
18
|
-
const
|
|
8
|
+
const matched = matchBot(userAgent);
|
|
9
|
+
const isAiBot = !!matched;
|
|
19
10
|
const isMarkdownRequested = accept.includes('text/markdown');
|
|
20
11
|
|
|
21
12
|
// If traffic is identified as an AI Bot, rewrite the URL
|
|
@@ -38,11 +29,86 @@ export function ontoMiddleware(request: NextRequest) {
|
|
|
38
29
|
payloadPath = payloadPath.slice(0, -1);
|
|
39
30
|
}
|
|
40
31
|
|
|
32
|
+
// Common response headers for all bot responses
|
|
33
|
+
const botHeaders: Record<string, string> = {
|
|
34
|
+
'Content-Type': 'text/markdown; charset=utf-8',
|
|
35
|
+
'Cache-Control': 'public, max-age=3600, s-maxage=3600, stale-while-revalidate=86400',
|
|
36
|
+
};
|
|
37
|
+
if (matched) {
|
|
38
|
+
botHeaders['X-Onto-Bot'] = `${matched.name} (${matched.company})`;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// --- Onto Control Plane Integration (Premium) ---
|
|
42
|
+
const ONTO_API_KEY = process.env.ONTO_API_KEY;
|
|
43
|
+
const DASHBOARD_URL = process.env.ONTO_DASHBOARD_URL || 'https://app.buildonto.dev';
|
|
44
|
+
|
|
45
|
+
if (ONTO_API_KEY) {
|
|
46
|
+
// 1. Fire-and-forget tracking — includes structured bot info
|
|
47
|
+
fetch(`${DASHBOARD_URL}/api/track`, {
|
|
48
|
+
method: 'POST',
|
|
49
|
+
headers: {
|
|
50
|
+
'x-onto-key': ONTO_API_KEY,
|
|
51
|
+
'Content-Type': 'application/json'
|
|
52
|
+
},
|
|
53
|
+
body: JSON.stringify({
|
|
54
|
+
route: url.pathname,
|
|
55
|
+
userAgent: userAgent,
|
|
56
|
+
bot: matched ? matched.name : null,
|
|
57
|
+
company: matched ? matched.company : null,
|
|
58
|
+
})
|
|
59
|
+
}).catch(() => {});
|
|
60
|
+
|
|
61
|
+
// 2. Dynamic Context Injection
|
|
62
|
+
try {
|
|
63
|
+
// Fetch the injection from the Control Plane
|
|
64
|
+
const injectRes = await fetch(`${DASHBOARD_URL}/api/sdk/inject?route=${url.pathname}`, {
|
|
65
|
+
headers: { 'x-onto-key': ONTO_API_KEY },
|
|
66
|
+
// Set a strict timeout to keep edge fast
|
|
67
|
+
signal: AbortSignal.timeout(1500)
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
if (injectRes.ok) {
|
|
71
|
+
const { injection } = await injectRes.json();
|
|
72
|
+
|
|
73
|
+
if (injection) {
|
|
74
|
+
// To inject, we must fetch the local markdown and append
|
|
75
|
+
const localMdUrl = `${url.origin}/.onto${payloadPath}.md`;
|
|
76
|
+
const mdRes = await fetch(localMdUrl);
|
|
77
|
+
|
|
78
|
+
if (mdRes.ok) {
|
|
79
|
+
const baseMarkdown = await mdRes.text();
|
|
80
|
+
const finalMarkdown = `${baseMarkdown}\n\n---\n\n${injection}`;
|
|
81
|
+
|
|
82
|
+
return new NextResponse(finalMarkdown, {
|
|
83
|
+
headers: {
|
|
84
|
+
...botHeaders,
|
|
85
|
+
'X-Onto-Injected': 'true'
|
|
86
|
+
}
|
|
87
|
+
});
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
} catch (err) {
|
|
92
|
+
console.error('[Onto] Injection failed', err);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
// ------------------------------------------------
|
|
96
|
+
|
|
41
97
|
url.pathname = `/.onto${payloadPath}.md`;
|
|
42
98
|
|
|
43
99
|
// Rewrite implicitly serves the target URL transparently to the client.
|
|
44
|
-
|
|
100
|
+
const response = NextResponse.rewrite(url);
|
|
101
|
+
// Attach bot identification headers to the rewrite response
|
|
102
|
+
if (matched) {
|
|
103
|
+
response.headers.set('X-Onto-Bot', `${matched.name} (${matched.company})`);
|
|
104
|
+
}
|
|
105
|
+
return response;
|
|
45
106
|
}
|
|
46
107
|
|
|
47
108
|
return NextResponse.next();
|
|
48
109
|
}
|
|
110
|
+
|
|
111
|
+
// Re-export the bot registry for consumers who want to extend or inspect it
|
|
112
|
+
export { AI_BOT_USER_AGENTS, matchBot } from './bots';
|
|
113
|
+
export type { AiBot } from './bots';
|
|
114
|
+
|