@alete-ai/gate-ingest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,12 @@
1
'use strict';

const js = require('@mdream/js');
const plugins = require('@mdream/js/plugins');
const openredaction = require('openredaction');

/**
 * mdream plugin that injects bracketed marker strings when form-related and
 * navigation elements are entered/exited, so that the page's structural
 * footprint survives the HTML -> Markdown conversion.
 *
 * Handled tags: form, input, select, option, button, label, nav, a.
 * Any other tag returns undefined from both hooks.
 */
const structuralPlugin = plugins.createPlugin({
  onNodeEnter(element) {
    const attrs = element.attributes || {};

    switch (element.name) {
      case 'form':
        return '\n[FORM_START]\n';
      case 'input': {
        const type = attrs.type || 'text';
        const name = attrs.name || attrs.id || '';
        const placeholder = attrs.placeholder || '';
        return `[INPUT:${type}:${name}:${placeholder}]`;
      }
      case 'select':
        return '[SELECT_START]';
      case 'option':
        return ' (OPTION:';
      case 'button':
        return '[BUTTON:';
      case 'label':
        return 'LABEL[';
      case 'nav':
        return '\n[NAV_START]\n';
      case 'a':
        return `[LINK:${attrs.href || '#'}]`;
      default:
        return undefined;
    }
  },

  onNodeExit(element) {
    switch (element.name) {
      case 'form':
        return '\n[FORM_END]\n';
      case 'select':
        return '[SELECT_END]';
      case 'option':
        return ') ';
      case 'button':
        return ']';
      case 'label':
        return ']';
      case 'nav':
        return '\n[NAV_END]\n';
      default:
        return undefined;
    }
  },
});

/**
 * Strips every non-alphanumeric character and caps the length, producing a
 * suffix that can be glued onto a camelCase token.
 *
 * @param {string} value - raw text (attribute value, link text, heading, ...)
 * @param {number} maxLength - maximum number of characters to keep
 * @returns {string} alphanumeric-only suffix, possibly empty
 */
function toTokenSuffix(value, maxLength) {
  return value.replace(/[^a-zA-Z0-9]/g, '').slice(0, maxLength);
}

/**
 * Builds the replacer used for Markdown headings of a given level.
 * Inline links inside the heading are reduced to their text first.
 *
 * @param {number} level - heading level written into the token name
 * @returns {(match: string, content: string) => string} replacer for String#replace
 */
function headerReplacer(level) {
  return (_match, content) => {
    const withoutLinks = content.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
    return `sysHeader${level} ${toTokenSuffix(withoutLinks, 30)}`;
  };
}

/**
 * Maps structural Markdown artifacts (plugin markers, links, images, headings)
 * to plain alphanumeric camelCase tokens and discards everything else.
 *
 * @param {string} text - Markdown produced with `structuralPlugin` hooks
 * @returns {string} space-separated tokens; only words starting with
 *   "struct" or "sys", or capitalized words longer than 2 characters, are kept
 */
function mapToTokens(text) {
  // 1. Fixed markers emitted by structuralPlugin. The optional backslashes
  //    tolerate the converter escaping the square brackets.
  let processed = text
    .replace(/\\?\[FORM_START\\?\]/g, 'structFormStart')
    .replace(/\\?\[FORM_END\\?\]/g, 'structFormEnd')
    .replace(/\\?\[SELECT_START\\?\]/g, 'structSelectStart')
    .replace(/\\?\[SELECT_END\\?\]/g, 'structSelectEnd')
    .replace(/\\?\[NAV_START\\?\]/g, 'structNavStart')
    .replace(/\\?\[NAV_END\\?\]/g, 'structNavEnd');

  // 1.1 Label opener (its closing "]" is removed by the final punctuation pass).
  processed = processed.replace(/LABEL\\?\[/g, 'structLabel ');

  // 2. Attribute-carrying markers.
  processed = processed
    // [INPUT:type:name:placeholder] -> structInput<Type><name>; placeholder is dropped.
    .replace(
      /\\?\[INPUT:([^:]+):([^:]*):([^\\\]]*)\\?\]/g,
      (_match, type, name) =>
        `structInput${type.charAt(0).toUpperCase() + type.slice(1)}${toTokenSuffix(name, 20)}`,
    )
    // [LINK:url] -> structLinkElement (the URL itself is discarded).
    .replace(/\\?\[LINK:[^\\\]]+\\?\]/g, () => 'structLinkElement')
    // [BUTTON:text] -> structButton<text>
    .replace(
      /\\?\[BUTTON:([^\\\]]+)\\?\]/g,
      (_match, label) => `structButton${toTokenSuffix(label, 20)}`,
    );

  // 3. Standard Markdown artifacts.
  //    Images MUST be handled before links: the link pattern also matches the
  //    "[alt](url)" tail of "![alt](url)". With the previous order the image
  //    pattern never matched and the leftover "!structLinkElement..." word was
  //    thrown away by the final filter, so images disappeared from the output.
  processed = processed
    .replace(
      /!\[([^\]]*)\]\(([^)]+)\)/g,
      (_match, alt) => `structImage${toTokenSuffix(alt, 20)}`,
    )
    .replace(
      /\[([^\]]*)\]\(([^)]+)\)/g,
      (_match, content) => `structLinkElement${toTokenSuffix(content, 20)}`,
    );

  // 4. Headings (levels 1-3 only).
  processed = processed
    .replace(/^# (.*$)/gm, headerReplacer(1))
    .replace(/^## (.*$)/gm, headerReplacer(2))
    .replace(/^### (.*$)/gm, headerReplacer(3));

  // 5. Remove bare URLs, then turn leftover Markdown punctuation and colons into spaces.
  processed = processed
    .replace(/https?:\/\/[^\s]+/g, '')
    .replace(/[#*`_\[\]():]/g, ' ');

  // 6. Keep only generated tokens and capitalized words longer than 2 characters.
  return processed
    .split(/\s+/)
    .filter(
      (word) =>
        word.startsWith('struct') ||
        word.startsWith('sys') ||
        (/^[A-Z]/.test(word) && word.length > 2),
    )
    .join(' ')
    .trim();
}

/**
 * Returns the pattern type identifiers openredaction lists for a category.
 *
 * @param {string} category - openredaction category name
 * @returns {string[]} the `type` of every pattern in that category
 */
function patternTypesFor(category) {
  return openredaction.getPatternsByCategory(category).map((pattern) => pattern.type);
}

/**
 * Wrapper around an OpenRedaction engine. Name and address detection are
 * switched off explicitly; every other category is enabled unless its option
 * flag is exactly `false`.
 */
class Redactor {
  engine;

  /**
   * @param {object} [options]
   * @param {boolean} [options.redactPii] - EMAIL, PHONE_US, PHONE_UK,
   *   PHONE_INTERNATIONAL plus the "government" category
   * @param {boolean} [options.redactFinancials] - "financial" category
   * @param {boolean} [options.redactCredentials] - "credentials" category
   * @param {boolean} [options.redactMedical] - "healthcare" category
   * @param {boolean} [options.redactInfrastructure] - "network" category
   */
  constructor(options = {}) {
    const patterns = [];

    if (options.redactPii !== false) {
      patterns.push('EMAIL');
      patterns.push('PHONE_US', 'PHONE_UK', 'PHONE_INTERNATIONAL');
      patterns.push(...patternTypesFor('government'));
    }
    if (options.redactFinancials !== false) {
      patterns.push(...patternTypesFor('financial'));
    }
    if (options.redactCredentials !== false) {
      patterns.push(...patternTypesFor('credentials'));
    }
    if (options.redactMedical !== false) {
      patterns.push(...patternTypesFor('healthcare'));
    }
    if (options.redactInfrastructure !== false) {
      patterns.push(...patternTypesFor('network'));
    }
    // NOTE(review): options.customPlaceholders is declared in the typings but
    // is not read here — confirm whether it should be forwarded to the engine.

    this.engine = new openredaction.OpenRedaction({
      patterns,
      includeNames: false,
      includeAddresses: false,
      redactionMode: 'placeholder',
      enableContextAnalysis: true,
      enableFalsePositiveFilter: true,
      falsePositiveThreshold: 0.7,
      deterministic: true,
    });
  }

  /**
   * Redacts sensitive information from the given text.
   *
   * @param {string} text
   * @returns {Promise<string>} redacted text; falsy input is returned unchanged
   */
  async redact(text) {
    if (!text) {
      return text;
    }
    const result = await this.engine.detect(text);
    return result.redacted;
  }

  /**
   * Reports whether the engine detects anything in the text.
   *
   * @param {string} text
   * @returns {Promise<boolean>} false for falsy input, otherwise whether at
   *   least one detection was reported
   */
  async hasSensitiveInfo(text) {
    if (!text) {
      return false;
    }
    const result = await this.engine.detect(text);
    return result.detections.length > 0;
  }

  /**
   * Runs detection once and returns both the redacted text and the flag.
   *
   * @param {string} text
   * @returns {Promise<{redacted: string, hasSensitiveInfo: boolean}>}
   */
  async process(text) {
    if (!text) {
      return { redacted: text, hasSensitiveInfo: false };
    }
    const result = await this.engine.detect(text);
    return {
      redacted: result.redacted,
      hasSensitiveInfo: result.detections.length > 0,
    };
  }
}

/** String-valued labels exported for consumers; not used inside this module. */
const GateLabel = {
  SENSITIVE_PORTAL: 'sensitive_portal',
  DIGESTIBLE_ARTICLE: 'digestible_article',
  NOISE: 'noise',
  UNKNOWN: 'unknown',
};

const DEFAULTS = { SEMANTIC_TOKEN_CAP: 15e3 };

// Tokens assumed per whitespace-separated word when estimating size.
const TOKENS_PER_WORD = 1.33;

const TRUNCATION_NOTICE = '\n\n... [Content truncated due to token cap]';

let globalConfig = { defaultSemanticTokenCap: DEFAULTS.SEMANTIC_TOKEN_CAP };
let isInitialized = false;

/**
 * Merges `config` into the module-wide configuration, logs a one-line banner
 * and marks the module as initialized.
 *
 * @param {{defaultSemanticTokenCap?: number}} [config]
 * @returns {void}
 */
function initialize(config = {}) {
  globalConfig = { ...globalConfig, ...config };
  console.log(`\u{1F6E1}\uFE0F Alete Gate: Ingestion substrate initialized. Semantic cap: ${globalConfig.defaultSemanticTokenCap} tokens. Explore our ecosystem at https://alete.ai/`);
  isInitialized = true;
}

/**
 * Estimates the token count of a text as ceil(words * 1.33).
 *
 * @param {string} text
 * @returns {number} estimated tokens; 0 for empty or whitespace-only text
 */
function estimateTokens(text) {
  const trimmed = text.trim();
  // "".split(/\s+/) yields [""] (length 1), which used to report 2 tokens
  // for empty input.
  if (trimmed === '') {
    return 0;
  }
  return Math.ceil(trimmed.split(/\s+/).length * TOKENS_PER_WORD);
}

/**
 * Truncates text so that its estimated token count fits within `cap`.
 * The kept prefix retains its original whitespace, so Markdown line structure
 * (headings, paragraphs, lists) is not flattened onto a single line.
 *
 * @param {string} text
 * @param {number} cap - maximum number of estimated tokens
 * @returns {{truncated: string, isTruncated: boolean}}
 */
function truncateToCap(text, cap) {
  if (estimateTokens(text) <= cap) {
    return { truncated: text, isTruncated: false };
  }

  // Clamp at zero: a negative count passed to slice() would keep text instead
  // of dropping it.
  const wordsToKeep = Math.max(0, Math.floor(cap / TOKENS_PER_WORD));
  const trimmed = text.trim();

  // Walk word by word to find the offset just after the last word to keep.
  const wordPattern = /\S+/g;
  let end = 0;
  for (let kept = 0; kept < wordsToKeep; kept += 1) {
    const match = wordPattern.exec(trimmed);
    if (match === null) {
      break;
    }
    end = match.index + match[0].length;
  }

  return { truncated: trimmed.slice(0, end) + TRUNCATION_NOTICE, isTruncated: true };
}

/**
 * Converts raw HTML into structural tokens and semantic Markdown.
 *
 * @param {string} html - raw HTML document
 * @param {object} [options]
 * @param {boolean|object} [options.redact] - truthy enables redaction; an
 *   object is passed to the Redactor constructor, `true` uses its defaults
 * @param {number} [options.semanticTokenCap] - per-call cap; falls back to the
 *   global default, then to 15000
 * @returns {Promise<{structural: string, semantic: string, hasSensitiveInfo: boolean, metadata: object, isTruncated: boolean}>}
 */
async function processHtml(html, options = {}) {
  if (!isInitialized) {
    initialize();
  }

  // 1. Structural pass: Markdown carrying the plugin's markers.
  const structuralMd = js.htmlToMarkdown(html, { hooks: [structuralPlugin] });

  // 2. Semantic pass: minimal preset with link/image/svg/canvas wrappers blanked.
  let metadata = {};
  let semantic = js
    .htmlToMarkdown(
      html,
      js.withMinimalPreset({
        isolateMain: false,
        plugins: {
          tagOverrides: {
            a: { enter: '', exit: '' },
            img: { enter: '', exit: '' },
            svg: { enter: '', exit: '' },
            canvas: { enter: '', exit: '' },
          },
          frontmatter: {
            onExtract: (frontmatter) => {
              metadata = frontmatter;
            },
          },
        },
      }),
    )
    .trim();

  // 2.1 Fallback metadata taken from the Markdown body.
  // NOTE(review): these values are derived before redaction runs, so a
  // fallback description can contain text that is later redacted from
  // `semantic` — confirm whether metadata should be redacted as well.
  if (!metadata.title) {
    const h1Match = semantic.match(/^# (.*)$/m);
    if (h1Match) {
      metadata.title = h1Match[1].replace(/[#*`_]/g, '').trim();
    }
  }
  if (!metadata.description) {
    const bodyOnly = semantic.replace(/^# .*$/m, '').trim();
    if (bodyOnly) {
      const cleanBody = bodyOnly.replace(/[#*`_]/g, '').trim();
      metadata.description =
        cleanBody.slice(0, 150).replace(/\n/g, ' ') + (cleanBody.length > 150 ? '...' : '');
    }
  }

  // 2.2 Token cap (applied before redaction).
  const cap =
    options.semanticTokenCap ?? globalConfig.defaultSemanticTokenCap ?? DEFAULTS.SEMANTIC_TOKEN_CAP;
  const { truncated, isTruncated } = truncateToCap(semantic, cap);
  semantic = truncated;

  // 3. Optional redaction of the semantic text.
  let hasSensitiveInfo = false;
  if (options.redact) {
    const redactorOptions = typeof options.redact === 'object' ? options.redact : {};
    const result = await new Redactor(redactorOptions).process(semantic);
    semantic = result.redacted;
    hasSensitiveInfo = result.hasSensitiveInfo;
  }

  return {
    structural: mapToTokens(structuralMd),
    semantic,
    hasSensitiveInfo,
    metadata,
    isTruncated,
  };
}

exports.GateLabel = GateLabel;
exports.Redactor = Redactor;
exports.initialize = initialize;
exports.mapToTokens = mapToTokens;
exports.plugin = structuralPlugin;
exports.processHtml = processHtml;
12
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/config.ts","../src/token-mapper.ts","../src/sanitization/Redactor.ts","../src/index.ts"],"names":["structuralPlugin","createPlugin","element","tag","attrs","type","name","placeholder","mapToTokens","text","processed","_","cleanName","content","alt","word","Redactor","options","patterns","getPatternsByCategory","p","engineOptions","OpenRedaction","result","GateLabel","DEFAULTS","globalConfig","isInitialized","initialize","config","estimateTokens","wordCount","truncateToCap","cap","wordsToKeep","processHtml","html","structuralMd","htmlToMarkdown","metadata","semantic","withMinimalPreset","fm","h1Match","bodyOnly","cleanBody","truncated","isTruncated","hasSensitiveInfo","redactorOptions"],"mappings":"uHAOO,IAAMA,CAAAA,CAAmBC,oBAAAA,CAAa,CAC3C,WAAA,CAAYC,CAAAA,CAAS,CACnB,IAAMC,CAAAA,CAAMD,CAAAA,CAAQ,IAAA,CACdE,CAAAA,CAAQF,CAAAA,CAAQ,UAAA,EAAc,EAAC,CAErC,OAAQC,CAAAA,EACN,KAAK,OACH,OAAO;AAAA;AAAA,CAAA,CACT,KAAK,QAAS,CACZ,IAAME,EAAOD,CAAAA,CAAM,IAAA,EAAQ,OACrBE,CAAAA,CAAOF,CAAAA,CAAM,MAAQA,CAAAA,CAAM,EAAA,EAAM,GACjCG,CAAAA,CAAcH,CAAAA,CAAM,aAAe,EAAA,CACzC,OAAO,UAAUC,CAAI,CAAA,CAAA,EAAIC,CAAI,CAAA,CAAA,EAAIC,CAAW,GAC9C,CACA,KAAK,SACH,OAAO,gBAAA,CACT,KAAK,QAAA,CACH,OAAO,YACT,KAAK,QAAA,CACH,OAAO,UAAA,CACT,KAAK,QACH,OAAO,QAAA,CACT,KAAK,KAAA,CACH,OAAO;AAAA;AAAA,CAAA,CACT,KAAK,IAEH,OAAO,CAAA,MAAA,EADMH,EAAM,IAAA,EAAQ,GACP,IAEtB,QACE,MACJ,CACF,CAAA,CAEA,UAAA,CAAWF,EAAS,CAElB,OADYA,EAAQ,IAAA,EAElB,KAAK,MAAA,CACH,OAAO;AAAA;AAAA,CAAA,CACT,KAAK,QAAA,CACH,OAAO,eACT,KAAK,QAAA,CACH,OAAO,IAAA,CACT,KAAK,QAAA,CACH,OAAO,IACT,KAAK,OAAA,CACH,OAAO,GAAA,CACT,KAAK,MACH,OAAO;AAAA;AAAA,CAAA,CACT,QACE,MACJ,CACF,CACF,CAAC,ECtDM,SAASM,CAAAA,CAAYC,CAAAA,CAAsB,CAGhD,IAAIC,CAAAA,CAAYD,EACb,OAAA,CAAQ,uBAAA,CAAyB,iBAAiB,CAAA,CAClD,OAAA,CAAQ,qBAAA,CAAuB,eAAe,CAAA,CAC9C,OAAA,CAAQ,yBAAA,CAA2B,mBAAmB,CAAA,CACtD,OAAA,CAAQ,uBAAA,CAAyB,iBAAiB,CAAA,CAClD,QAAQ,sBAAA,CAAwB,gBAAgB,CAAA,CAChD,OAAA,CAAQ,oBAAA,CAAsB,cAAc,CAAA,CAG/C,OAAAC,CAAAA,CAAYA,CAAAA,CAAU,OAAA,CAAQ,aAAA,CAAe,cAAc,
CAAA,CAG3DA,CAAAA,CAAYA,CAAAA,CAET,QAAQ,6CAAA,CAA+C,CAACC,CAAAA,CAAGN,CAAAA,CAAMC,CAAAA,GAAS,CACzE,IAAMM,CAAAA,CAAYN,EAAK,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAAA,CAC/D,OAAO,CAAA,WAAA,EAAcD,CAAAA,CAAK,MAAA,CAAO,CAAC,CAAA,CAAE,WAAA,EAAY,CAAIA,CAAAA,CAAK,KAAA,CAAM,CAAC,CAAC,CAAA,EAAGO,CAAS,CAAA,CAC/E,CAAC,CAAA,CAEA,QAAQ,0BAAA,CAA4B,IAAM,mBAAmB,CAAA,CAE7D,OAAA,CAAQ,8BAAA,CAAgC,CAACD,CAAAA,CAAGF,CAAAA,GAEpC,CAAA,YAAA,EADOA,CAAAA,CAAK,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,EAAG,EAAE,CAChC,CAAA,CAC5B,CAAA,CAGHC,CAAAA,CAAYA,CAAAA,CAET,OAAA,CAAQ,0BAAA,CAA4B,CAACC,CAAAA,CAAGE,CAAAA,GAEhC,CAAA,iBAAA,EADOA,CAAAA,CAAQ,OAAA,CAAQ,eAAA,CAAiB,EAAE,EAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC9B,CAAA,CACjC,CAAA,CAEA,OAAA,CAAQ,2BAAA,CAA6B,CAACF,CAAAA,CAAGG,CAAAA,GAEjC,CAAA,WAAA,EADOA,CAAAA,CAAI,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,MAAM,CAAA,CAAG,EAAE,CAChC,CAAA,CAC3B,CAAA,CAGHJ,CAAAA,CAAYA,CAAAA,CACT,OAAA,CAAQ,YAAA,CAAc,CAACC,CAAAA,CAAGE,CAAAA,GAElB,CAAA,WAAA,EADOA,CAAAA,CAAQ,OAAA,CAAQ,wBAAA,CAA0B,IAAI,CAAA,CAAE,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC5E,CAAA,CAC3B,CAAA,CACA,OAAA,CAAQ,aAAA,CAAe,CAACF,CAAAA,CAAGE,CAAAA,GAEnB,cADOA,CAAAA,CAAQ,OAAA,CAAQ,wBAAA,CAA0B,IAAI,CAAA,CAAE,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC5E,CAAA,CAC3B,CAAA,CACA,OAAA,CAAQ,eAAgB,CAACF,CAAAA,CAAGE,CAAAA,GAEpB,CAAA,WAAA,EADOA,CAAAA,CAAQ,OAAA,CAAQ,wBAAA,CAA0B,IAAI,EAAE,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC5E,EAC3B,CAAA,CAGHH,CAAAA,CAAYA,CAAAA,CACT,OAAA,CAAQ,oBAAA,CAAsB,EAAE,CAAA,CAChC,OAAA,CAAQ,gBAAA,CAAkB,GAAG,CAAA,CAGzBA,CAAAA,CACJ,KAAA,CAAM,KAAK,CAAA,CACX,MAAA,CAAOK,GACCA,CAAAA,CAAK,UAAA,CAAW,QAAQ,CAAA,EACxBA,CAAAA,CAAK,UAAA,CAAW,KAAK,CAAA,EACpB,SAAS,IAAA,CAAKA,CAAI,CAAA,EAAKA,CAAAA,CAAK,MAAA,CAAS,CAC9C,CAAA,CACA,IAAA,CAAK,GAAG,CAAA,CACR,IAAA,EACL,CC7DO,IAAMC,CAAAA,CAAN,KAAe,CACZ,MAAA,CAER,WAAA,CAAYC,EAA2B,EAAC,CAAG,CACzC,IAAMC,CAAAA,CAAqB,EAAC,CAGxBD,CAAAA,CAAQ,YAAc,KAAA,GACxBC,CAAAA,CAAS,IAAA,CAAK,OAAO,CAAA,CACrB
A,CAAAA,CAAS,IAAA,CAAK,UAAA,CAAY,WAAY,qBAAqB,CAAA,CAC3DA,CAAAA,CAAS,IAAA,CAAK,GAAGC,mCAAAA,CAAsB,YAAY,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAAA,CAInEH,CAAAA,CAAQ,gBAAA,GAAqB,OAC/BC,CAAAA,CAAS,IAAA,CAAK,GAAGC,mCAAAA,CAAsB,WAAW,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAIlEH,CAAAA,CAAQ,iBAAA,GAAsB,KAAA,EAChCC,CAAAA,CAAS,KAAK,GAAGC,mCAAAA,CAAsB,aAAa,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAIpEH,CAAAA,CAAQ,aAAA,GAAkB,KAAA,EAC5BC,CAAAA,CAAS,IAAA,CAAK,GAAGC,oCAAsB,YAAY,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAInEH,EAAQ,oBAAA,GAAyB,KAAA,EACnCC,CAAAA,CAAS,IAAA,CAAK,GAAGC,mCAAAA,CAAsB,SAAS,CAAA,CAAE,IAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAGpE,IAAMC,CAAAA,CAAsC,CAC1C,QAAA,CAAAH,CAAAA,CAEA,YAAA,CAAc,KAAA,CACd,gBAAA,CAAkB,KAAA,CAElB,aAAA,CAAe,aAAA,CACf,sBAAuB,IAAA,CACvB,yBAAA,CAA2B,IAAA,CAC3B,sBAAA,CAAwB,EAAA,CACxB,aAAA,CAAe,IACjB,CAAA,CAEA,KAAK,MAAA,CAAS,IAAII,2BAAAA,CAAcD,CAAa,EAC/C,CAKA,MAAa,MAAA,CAAOZ,EAA+B,CACjD,OAAKA,CAAAA,EAAAA,CACU,MAAM,IAAA,CAAK,MAAA,CAAO,MAAA,CAAOA,CAAI,CAAA,EAC9B,QAChB,CAKA,MAAa,gBAAA,CAAiBA,CAAAA,CAAgC,CAC5D,OAAKA,GACU,MAAM,IAAA,CAAK,MAAA,CAAO,MAAA,CAAOA,CAAI,CAAA,EAC9B,UAAA,CAAW,MAAA,CAAS,EAFhB,KAGpB,CAKA,MAAa,OAAA,CAAQA,CAAAA,CAAwE,CAC3F,GAAI,CAACA,EAAM,OAAO,CAAE,QAAA,CAAUA,CAAAA,CAAM,gBAAA,CAAkB,KAAM,CAAA,CAC5D,IAAMc,CAAAA,CAAS,MAAM,IAAA,CAAK,MAAA,CAAO,MAAA,CAAOd,CAAI,CAAA,CAC5C,OAAO,CACL,QAAA,CAAUc,CAAAA,CAAO,QAAA,CACjB,gBAAA,CAAkBA,CAAAA,CAAO,UAAA,CAAW,MAAA,CAAS,CAC/C,CACF,CACF,ECzFO,IAAKC,CAAAA,CAAAA,CAAAA,CAAAA,GACVA,CAAAA,CAAA,gBAAA,CAAmB,kBAAA,CACnBA,EAAA,kBAAA,CAAqB,oBAAA,CACrBA,CAAAA,CAAA,KAAA,CAAQ,OAAA,CACRA,CAAAA,CAAA,OAAA,CAAU,SAAA,CAJAA,CAAAA,CAAAA,EAAAA,CAAAA,EAAA,EAAA,CAAA,CAuDNC,CAAAA,CAAW,CACf,kBAAA,CAAoB,IACtB,CAAA,CAEIC,CAAAA,CAA6B,CAC/B,uBAAA,CAAyBD,CAAAA,CAAS,kBACpC,CAAA,CAEIE,CAAAA,CAAgB,MAKb,SAASC,CAAAA,CAAWC,EAAuB,EAAC,CAAS,CAC1DH,CAAAA,CAAe,CAAE,GAAGA,CAAAA,CAAc,GAAGG,CAAO,CAAA,CAC5C,OAAA,CAAQ,GAAA,CAAI,CAAA,2EAAA,EAAkEH,CAAAA,CAAa,uBAAuB
,CAAA,mDAAA,CAAqD,CAAA,CACvKC,CAAAA,CAAgB,KAClB,CAMA,SAASG,CAAAA,CAAerB,CAAAA,CAAsB,CAC5C,IAAMsB,EAAYtB,CAAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,KAAK,CAAA,CAAE,MAAA,CAC3C,OAAO,KAAK,IAAA,CAAKsB,CAAAA,CAAY,IAAI,CACnC,CAKA,SAASC,CAAAA,CAAcvB,CAAAA,CAAcwB,EAA0D,CAE7F,GADeH,CAAAA,CAAerB,CAAI,CAAA,EACpBwB,CAAAA,CACZ,OAAO,CAAE,SAAA,CAAWxB,CAAAA,CAAM,WAAA,CAAa,KAAM,CAAA,CAI/C,IAAMyB,CAAAA,CAAc,IAAA,CAAK,MAAMD,CAAAA,CAAM,IAAI,CAAA,CAIzC,OAAO,CAAE,SAAA,CAHKxB,CAAAA,CAAK,IAAA,GAAO,KAAA,CAAM,KAAK,CAAA,CACb,KAAA,CAAM,CAAA,CAAGyB,CAAW,CAAA,CAAE,IAAA,CAAK,GAAG,CAAA,CAAI;;AAAA,wCAAA,CAAA,CAEtC,WAAA,CAAa,IAAK,CACxC,CAMA,eAAsBC,CAAAA,CAAYC,CAAAA,CAAcnB,CAAAA,CAA4B,EAAC,CAA6B,CACnGU,GACHC,CAAAA,EAAW,CAIb,IAAMS,CAAAA,CAAeC,iBAAAA,CAAeF,CAAAA,CAAM,CACxC,KAAA,CAAO,CAACpC,CAAgB,CAC1B,CAAC,CAAA,CAEGuC,EAAmC,EAAC,CACpCC,CAAAA,CAAWF,iBAAAA,CAAeF,CAAAA,CAAMK,oBAAAA,CAAkB,CACpD,WAAA,CAAa,KAAA,CACb,OAAA,CAAS,CACP,YAAA,CAAc,CACZ,EAAG,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAAA,CACzB,IAAK,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAAA,CAC3B,IAAK,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAAA,CAC3B,OAAQ,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAChC,EACA,WAAA,CAAa,CACX,SAAA,CAAYC,CAAAA,EAAO,CACjBH,CAAAA,CAAWG,EACb,CACF,CACF,CACF,CAAC,CAAC,CAAA,CAAE,MAAK,CAGT,GAAI,CAACH,CAAAA,CAAS,KAAA,CAAO,CACnB,IAAMI,CAAAA,CAAUH,CAAAA,CAAS,KAAA,CAAM,WAAW,CAAA,CACtCG,CAAAA,GAEFJ,EAAS,KAAA,CAAQI,CAAAA,CAAQ,CAAC,CAAA,CAAE,OAAA,CAAQ,SAAA,CAAW,EAAE,CAAA,CAAE,IAAA,EAAK,EAE5D,CACA,GAAI,CAACJ,EAAS,WAAA,CAAa,CAEzB,IAAMK,CAAAA,CAAWJ,CAAAA,CAAS,QAAQ,SAAA,CAAW,EAAE,CAAA,CAAE,IAAA,EAAK,CACtD,GAAII,EAAU,CAEZ,IAAMC,CAAAA,CAAYD,CAAAA,CAAS,OAAA,CAAQ,SAAA,CAAW,EAAE,CAAA,CAAE,IAAA,EAAK,CACvDL,CAAAA,CAAS,WAAA,CAAcM,CAAAA,CAAU,MAAM,CAAA,CAAG,GAAG,CAAA,CAAE,OAAA,CAAQ,KAAA,CAAO,GAAG,GAAKA,CAAAA,CAAU,MAAA,CAAS,GAAA,CAAM,KAAA,CAAQ,EAAA,EACzG,CACF,CAGA,IAAMZ,CAAAA,CAAMhB,CAAAA,CAAQ,gBAAA,EAAoBS,CAAAA,CAAa,uBAAA,EAA2BD,EAAS,kBAAA,CACnF,CAAE,SAAA,CAAAqB,CAAAA,CAAW,WAAA,CAAAC,CAAY,EAAIf,CAAAA,CAAcQ,CAAAA,CAAUP,CAAG,CAAA,CAC9DO,CAAAA,CAAWM,CAAAA,CAEX,
IAAIE,CAAAA,CAAmB,KAAA,CAGvB,GAAI/B,CAAAA,CAAQ,MAAA,CAAQ,CAClB,IAAMgC,CAAAA,CAAkB,OAAOhC,CAAAA,CAAQ,MAAA,EAAW,QAAA,CAAWA,CAAAA,CAAQ,OAAS,EAAC,CAEzEM,CAAAA,CAAS,MADE,IAAIP,CAAAA,CAASiC,CAAe,CAAA,CACf,OAAA,CAAQT,CAAQ,CAAA,CAC9CA,CAAAA,CAAWjB,CAAAA,CAAO,SAClByB,CAAAA,CAAmBzB,CAAAA,CAAO,iBAC5B,CAEA,OAAO,CACL,WAAYf,CAAAA,CAAY6B,CAAY,CAAA,CACpC,QAAA,CAAAG,CAAAA,CACA,gBAAA,CAAAQ,EACA,QAAA,CAAAT,CAAAA,CACA,WAAA,CAAAQ,CACF,CACF","file":"index.cjs","sourcesContent":["import { createPlugin } from '@mdream/js/plugins';\n\n/**\n * Custom mdream plugin to retain structural artifacts.\n * This preserves the \"structural footprint\" of sensitive portals\n * by injecting intermediate markers with attribute context.\n */\nexport const structuralPlugin = createPlugin({\n onNodeEnter(element) {\n const tag = element.name;\n const attrs = element.attributes || {};\n\n switch (tag) {\n case 'form':\n return '\\n[FORM_START]\\n';\n case 'input': {\n const type = attrs.type || 'text';\n const name = attrs.name || attrs.id || '';\n const placeholder = attrs.placeholder || '';\n return `[INPUT:${type}:${name}:${placeholder}]`;\n }\n case 'select':\n return '[SELECT_START]';\n case 'option':\n return ' (OPTION:';\n case 'button':\n return '[BUTTON:';\n case 'label':\n return 'LABEL[';\n case 'nav':\n return '\\n[NAV_START]\\n';\n case 'a': {\n const href = attrs.href || '#';\n return `[LINK:${href}]`;\n }\n default:\n return undefined;\n }\n },\n\n onNodeExit(element) {\n const tag = element.name;\n switch (tag) {\n case 'form':\n return '\\n[FORM_END]\\n';\n case 'select':\n return '[SELECT_END]';\n case 'option':\n return ') ';\n case 'button':\n return ']';\n case 'label':\n return ']';\n case 'nav':\n return '\\n[NAV_END]\\n';\n default:\n return undefined;\n }\n }\n});\n","/**\n * Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.\n * This prevents Apple's NLTokenizer from stripping critical punctuation.\n * We use camelCase because NLTokenizer splits snake_case 
(STRUCT_FORM_START -> STRUCT, FORM, START).\n */\nexport function mapToTokens(text: string): string {\n // 1. Process explicit markers from structuralPlugin\n // We handle potential escaping from mdream\n let processed = text\n .replace(/\\\\?\\[FORM_START\\\\?\\]/g, 'structFormStart')\n .replace(/\\\\?\\[FORM_END\\\\?\\]/g, 'structFormEnd')\n .replace(/\\\\?\\[SELECT_START\\\\?\\]/g, 'structSelectStart')\n .replace(/\\\\?\\[SELECT_END\\\\?\\]/g, 'structSelectEnd')\n .replace(/\\\\?\\[NAV_START\\\\?\\]/g, 'structNavStart')\n .replace(/\\\\?\\[NAV_END\\\\?\\]/g, 'structNavEnd');\n\n // 1.1 Process Label marker\n processed = processed.replace(/LABEL\\\\?\\[/g, 'structLabel ');\n\n // 2. Process attribute-based markers\n processed = processed\n // Inputs: [INPUT:type:name:placeholder] -> structInputType {type} {name}\n .replace(/\\\\?\\[INPUT:([^:]+):([^:]*):([^\\\\\\]]*)\\\\?\\]/g, (_, type, name) => {\n const cleanName = name.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structInput${type.charAt(0).toUpperCase() + type.slice(1)}${cleanName}`;\n })\n // Links: [LINK:url] -> structLink\n .replace(/\\\\?\\[LINK:[^\\\\\\]]+\\\\?\\]/g, () => 'structLinkElement')\n // Buttons: [BUTTON:text] -> structButton {text}\n .replace(/\\\\?\\[BUTTON:([^\\\\\\]]+)\\\\?\\]/g, (_, text) => {\n const clean = text.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structButton${clean}`;\n });\n\n // 3. Process Standard Markdown artifacts (if any remain) into clean tokens\n processed = processed\n // Links: [text](url) -> structLinkElement {text}\n .replace(/\\[([^\\]]*)\\]\\(([^)]+)\\)/g, (_, content) => {\n const clean = content.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structLinkElement${clean}`;\n })\n // Images: ![alt](url) -> structImage {alt}\n .replace(/!\\[([^\\]]*)\\]\\(([^)]+)\\)/g, (_, alt) => {\n const clean = alt.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structImage${clean}`;\n });\n\n // 4. 
Process Headers into clean tokens\n processed = processed\n .replace(/^# (.*$)/gm, (_, content) => {\n const clean = content.replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1').replace(/[^a-zA-Z0-9]/g, '').slice(0, 30);\n return `sysHeader1 ${clean}`;\n })\n .replace(/^## (.*$)/gm, (_, content) => {\n const clean = content.replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1').replace(/[^a-zA-Z0-9]/g, '').slice(0, 30);\n return `sysHeader2 ${clean}`;\n })\n .replace(/^### (.*$)/gm, (_, content) => {\n const clean = content.replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1').replace(/[^a-zA-Z0-9]/g, '').slice(0, 30);\n return `sysHeader3 ${clean}`;\n });\n\n // 5. Final cleaning: remove URLs and punctuation (including colons now)\n processed = processed\n .replace(/https?:\\/\\/[^\\s]+/g, '') // Remove URLs\n .replace(/[#*`_\\[\\]():]/g, ' '); // Remove remaining markdown chars + colons\n\n // 6. Aggressively strip remaining natural language noise\n return processed\n .split(/\\s+/)\n .filter(word => {\n return word.startsWith('struct') || \n word.startsWith('sys') || \n (/^[A-Z]/.test(word) && word.length > 2); // Keep capitalized words (titles, labels) > 2 chars\n })\n .join(' ')\n .trim();\n}\n","import { OpenRedaction, type OpenRedactionOptions, getPatternsByCategory } from 'openredaction';\n\nexport interface RedactorOptions {\n redactPii?: boolean;\n redactFinancials?: boolean;\n redactCredentials?: boolean;\n redactInfrastructure?: boolean;\n redactMedical?: boolean;\n customPlaceholders?: Record<string, string>;\n}\n\n/**\n * Sovereign wrapper for the PII Shield engine.\n * Adopts a \"Narrative-First\" strategy: Preserves names, dates, and locations\n * to maintain story coherence while redacting \"Toxic Identifiers\" (SSNs, Credit Cards, Secrets).\n */\nexport class Redactor {\n private engine: OpenRedaction;\n\n constructor(options: RedactorOptions = {}) {\n const patterns: string[] = [];\n \n // 1. 
Personal & Contact (excluding Names, Addresses, Dates)\n if (options.redactPii !== false) {\n patterns.push('EMAIL');\n patterns.push('PHONE_US', 'PHONE_UK', 'PHONE_INTERNATIONAL');\n patterns.push(...getPatternsByCategory('government').map(p => p.type));\n }\n\n // 2. Financial\n if (options.redactFinancials !== false) {\n patterns.push(...getPatternsByCategory('financial').map(p => p.type));\n }\n\n // 3. Credentials\n if (options.redactCredentials !== false) {\n patterns.push(...getPatternsByCategory('credentials').map(p => p.type));\n }\n\n // 4. Medical\n if (options.redactMedical !== false) {\n patterns.push(...getPatternsByCategory('healthcare').map(p => p.type));\n }\n\n // 5. Infrastructure\n if (options.redactInfrastructure !== false) {\n patterns.push(...getPatternsByCategory('network').map(p => p.type));\n }\n\n const engineOptions: OpenRedactionOptions = {\n patterns,\n // Narrative-First safety: Double-down on disabling entities\n includeNames: false,\n includeAddresses: false,\n \n redactionMode: 'placeholder',\n enableContextAnalysis: true,\n enableFalsePositiveFilter: true,\n falsePositiveThreshold: 0.7,\n deterministic: true,\n };\n\n this.engine = new OpenRedaction(engineOptions);\n }\n\n /**\n * Redacts sensitive information from the given text.\n */\n public async redact(text: string): Promise<string> {\n if (!text) return text;\n const result = await this.engine.detect(text);\n return result.redacted;\n }\n\n /**\n * Checks if the text contains any sensitive information without modifying it.\n */\n public async hasSensitiveInfo(text: string): Promise<boolean> {\n if (!text) return false;\n const result = await this.engine.detect(text);\n return result.detections.length > 0;\n }\n\n /**\n * Performs both detection and redaction in a single pass.\n */\n public async process(text: string): Promise<{ redacted: string, hasSensitiveInfo: boolean }> {\n if (!text) return { redacted: text, hasSensitiveInfo: false };\n const result = await 
this.engine.detect(text);\n return {\n redacted: result.redacted,\n hasSensitiveInfo: result.detections.length > 0\n };\n }\n}\n","import { htmlToMarkdown, withMinimalPreset } from '@mdream/js';\nimport { structuralPlugin } from './config.js';\nimport { mapToTokens } from './token-mapper.js';\nimport { Redactor, type RedactorOptions } from './sanitization/Redactor.js';\n\nexport enum GateLabel {\n SENSITIVE_PORTAL = 'sensitive_portal',\n DIGESTIBLE_ARTICLE = 'digestible_article',\n NOISE = 'noise',\n UNKNOWN = 'unknown',\n}\n\nexport interface IngestionResult {\n /**\n * Alphanumeric tokenized text for Apple MaxEnt classifier.\n * High structural fidelity, low natural language noise.\n */\n structural: string;\n \n /**\n * Clean, readable Markdown for LLM analysis.\n * Low structural noise, high semantic fidelity.\n */\n semantic: string;\n\n /**\n * Whether the semantic content contains sensitive PII that was redacted.\n */\n hasSensitiveInfo?: boolean;\n\n /**\n * Extracted metadata from the HTML (title, description, author, etc.)\n */\n metadata?: Record<string, string>;\n\n /**\n * Whether the semantic content was truncated due to the token cap.\n */\n isTruncated?: boolean;\n}\n\nexport interface IngestionOptions {\n /**\n * Redaction configuration. If true, uses default settings.\n */\n redact?: RedactorOptions | boolean;\n\n /**\n * Override the semantic token cap for this specific call.\n */\n semanticTokenCap?: number;\n}\n\nexport interface GlobalConfig {\n /**\n * The default token cap for semantic returns. 
Defaults to 15,000.\n */\n defaultSemanticTokenCap?: number;\n}\n\nconst DEFAULTS = {\n SEMANTIC_TOKEN_CAP: 15000,\n};\n\nlet globalConfig: GlobalConfig = {\n defaultSemanticTokenCap: DEFAULTS.SEMANTIC_TOKEN_CAP,\n};\n\nlet isInitialized = false;\n\n/**\n * Initializes the Alete Gate ingestion substrate with global configuration.\n */\nexport function initialize(config: GlobalConfig = {}): void {\n globalConfig = { ...globalConfig, ...config };\n console.log(`🛡️ Alete Gate: Ingestion substrate initialized. Semantic cap: ${globalConfig.defaultSemanticTokenCap} tokens. Explore our ecosystem at https://alete.ai/`);\n isInitialized = true;\n}\n\n/**\n * Estimates the number of tokens in a text string.\n * Uses the 1.33x multiplier (tokens per word) for a safe estimation.\n */\nfunction estimateTokens(text: string): number {\n const wordCount = text.trim().split(/\\s+/).length;\n return Math.ceil(wordCount * 1.33);\n}\n\n/**\n * Truncates text to fit within a token cap.\n */\nfunction truncateToCap(text: string, cap: number): { truncated: string; isTruncated: boolean } {\n const tokens = estimateTokens(text);\n if (tokens <= cap) {\n return { truncated: text, isTruncated: false };\n }\n\n // Calculate approximate words to keep\n const wordsToKeep = Math.floor(cap / 1.33);\n const words = text.trim().split(/\\s+/);\n const truncated = words.slice(0, wordsToKeep).join(' ') + '\\n\\n... [Content truncated due to token cap]';\n \n return { truncated, isTruncated: true };\n}\n\n/**\n * The unified ingestion pipeline for Alete Gate.\n * Converts raw HTML into both structural tokens and semantic Markdown.\n */\nexport async function processHtml(html: string, options: IngestionOptions = {}): Promise<IngestionResult> {\n if (!isInitialized) {\n initialize();\n }\n\n // 1. 
Generate Structural Markdown using the custom plugin\n const structuralMd = htmlToMarkdown(html, {\n hooks: [structuralPlugin]\n });\n\n let metadata: Record<string, string> = {};\n let semantic = htmlToMarkdown(html, withMinimalPreset({\n isolateMain: false,\n plugins: {\n tagOverrides: {\n a: { enter: '', exit: '' },\n img: { enter: '', exit: '' },\n svg: { enter: '', exit: '' },\n canvas: { enter: '', exit: '' },\n },\n frontmatter: {\n onExtract: (fm) => {\n metadata = fm;\n }\n }\n }\n })).trim();\n\n // 2.1 Fallback metadata if not found in head\n if (!metadata.title) {\n const h1Match = semantic.match(/^# (.*)$/m);\n if (h1Match) {\n // Clean markdown from title\n metadata.title = h1Match[1].replace(/[#*`_]/g, '').trim();\n }\n }\n if (!metadata.description) {\n // Take first 150 chars of semantic (excluding title)\n const bodyOnly = semantic.replace(/^# .*$/m, '').trim();\n if (bodyOnly) {\n // Clean markdown from description\n const cleanBody = bodyOnly.replace(/[#*`_]/g, '').trim();\n metadata.description = cleanBody.slice(0, 150).replace(/\\n/g, ' ') + (cleanBody.length > 150 ? '...' : '');\n }\n }\n\n // 2.2 Apply token cap\n const cap = options.semanticTokenCap ?? globalConfig.defaultSemanticTokenCap ?? DEFAULTS.SEMANTIC_TOKEN_CAP;\n const { truncated, isTruncated } = truncateToCap(semantic, cap);\n semantic = truncated;\n\n let hasSensitiveInfo = false;\n\n // 3. Redaction Pipeline\n if (options.redact) {\n const redactorOptions = typeof options.redact === 'object' ? options.redact : {};\n const redactor = new Redactor(redactorOptions);\n const result = await redactor.process(semantic);\n semantic = result.redacted;\n hasSensitiveInfo = result.hasSensitiveInfo;\n }\n\n return {\n structural: mapToTokens(structuralMd),\n semantic,\n hasSensitiveInfo,\n metadata,\n isTruncated\n };\n}\n\nexport { structuralPlugin as plugin } from './config.js';\nexport { mapToTokens } from './token-mapper.js';\nexport { Redactor, type RedactorOptions };\n"]}
@@ -0,0 +1,106 @@
1
+ import * as _mdream_js from '@mdream/js';
2
+
3
+ interface RedactorOptions {
4
+ redactPii?: boolean; // unless false: EMAIL, PHONE_US, PHONE_UK, PHONE_INTERNATIONAL and all "government" category patterns
5
+ redactFinancials?: boolean; // unless false: all "financial" category patterns
6
+ redactCredentials?: boolean; // unless false: all "credentials" category patterns
7
+ redactInfrastructure?: boolean; // unless false: all "network" category patterns
8
+ redactMedical?: boolean; // unless false: all "healthcare" category patterns
9
+ customPlaceholders?: Record<string, string>; // NOTE(review): not read by the bundled Redactor constructor — confirm intended use
10
+ }
11
+ /**
12
+ * Sovereign wrapper for the PII Shield engine.
13
+ * Adopts a "Narrative-First" strategy: Preserves names, dates, and locations
14
+ * to maintain story coherence while redacting "Toxic Identifiers" (SSNs, Credit Cards, Secrets). Name and address detection are explicitly disabled on the engine.
15
+ */
16
+ declare class Redactor {
17
+ private engine; // OpenRedaction instance configured by the constructor
18
+ constructor(options?: RedactorOptions); // every category is enabled unless its flag is exactly false
19
+ /**
20
+ * Redacts sensitive information from the given text. Falsy input (e.g. an empty string) is returned unchanged.
21
+ */
22
+ redact(text: string): Promise<string>;
23
+ /**
24
+ * Checks if the text contains any sensitive information without modifying it. Resolves to false for falsy input.
25
+ */
26
+ hasSensitiveInfo(text: string): Promise<boolean>;
27
+ /**
28
+ * Performs both detection and redaction in a single pass. Falsy input resolves to { redacted: text, hasSensitiveInfo: false }.
29
+ */
30
+ process(text: string): Promise<{
31
+ redacted: string;
32
+ hasSensitiveInfo: boolean;
33
+ }>;
34
+ }
35
+
36
+ /**
37
+ * Custom mdream plugin to retain structural artifacts.
38
+ * This preserves the "structural footprint" of sensitive portals
39
+ * by injecting intermediate markers with attribute context for form, input, select, option, button, label, nav and a elements.
40
+ */
41
+ declare const structuralPlugin: _mdream_js.TransformPlugin; // exported publicly under the name "plugin"
42
+
43
+ /**
43
+ * Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.
44
+ * This prevents Apple's NLTokenizer from stripping critical punctuation.
45
+ * We use camelCase because NLTokenizer splits snake_case (STRUCT_FORM_START -> STRUCT, FORM, START).
46
+ */
47
+ declare function mapToTokens(text: string): string; // returns space-separated words: only those starting with "struct" or "sys", or capitalized words longer than 2 characters
49
+
50
+ declare enum GateLabel { // string-valued labels; exported, but not referenced by the package's own bundled code
51
+ SENSITIVE_PORTAL = "sensitive_portal",
52
+ DIGESTIBLE_ARTICLE = "digestible_article",
53
+ NOISE = "noise",
54
+ UNKNOWN = "unknown"
55
+ }
56
+ interface IngestionResult {
57
+ /**
58
+ * Alphanumeric tokenized text for Apple MaxEnt classifier.
59
+ * High structural fidelity, low natural language noise. Produced by passing the plugin-annotated Markdown through mapToTokens.
60
+ */
61
+ structural: string;
62
+ /**
63
+ * Clean, readable Markdown for LLM analysis.
64
+ * Low structural noise, high semantic fidelity. Truncated to the token cap first, then redacted when redaction is enabled.
65
+ */
66
+ semantic: string;
67
+ /**
68
+ * Whether the semantic content contains sensitive PII that was redacted. Always false when the redact option is not enabled.
69
+ */
70
+ hasSensitiveInfo?: boolean;
71
+ /**
72
+ * Extracted metadata from the HTML (title, description, author, etc.). A missing title/description is derived from the first H1 / first 150 characters of the semantic Markdown, before truncation and redaction.
73
+ */
74
+ metadata?: Record<string, string>;
75
+ /**
76
+ * Whether the semantic content was truncated due to the token cap.
77
+ */
78
+ isTruncated?: boolean;
79
+ }
80
+ interface IngestionOptions {
81
+ /**
82
+ * Redaction configuration. If true, uses default settings (every redaction category enabled). Omitted or false skips redaction.
83
+ */
84
+ redact?: RedactorOptions | boolean;
85
+ /**
86
+ * Override the semantic token cap for this specific call. When omitted, GlobalConfig.defaultSemanticTokenCap applies.
87
+ */
88
+ semanticTokenCap?: number;
89
+ }
90
+ interface GlobalConfig {
91
+ /**
92
+ * The default token cap for semantic returns. Defaults to 15,000. Tokens are estimated as 1.33 per whitespace-separated word.
93
+ */
94
+ defaultSemanticTokenCap?: number;
95
+ }
96
+ /**
97
+ * Initializes the Alete Gate ingestion substrate with global configuration. Merges config into the current global settings and logs a one-line banner via console.log; processHtml calls it with no arguments on first use if it has not been called.
98
+ */
99
+ declare function initialize(config?: GlobalConfig): void;
100
+ /**
101
+ * The unified ingestion pipeline for Alete Gate.
102
+ * Converts raw HTML into both structural tokens and semantic Markdown. The semantic text is truncated to the token cap before optional redaction.
103
+ */
104
+ declare function processHtml(html: string, options?: IngestionOptions): Promise<IngestionResult>;
105
+
106
+ export { GateLabel, type GlobalConfig, type IngestionOptions, type IngestionResult, Redactor, type RedactorOptions, initialize, mapToTokens, structuralPlugin as plugin, processHtml };
@@ -0,0 +1,106 @@
1
+ import * as _mdream_js from '@mdream/js';
2
+
3
+ /** Category switches for Redactor; each is enabled unless set to false. */ interface RedactorOptions {
4
+ /** Email, US/UK/international phone and "government" category patterns. */ redactPii?: boolean;
5
+ /** Patterns in the "financial" category. */ redactFinancials?: boolean;
6
+ /** Patterns in the "credentials" category. */ redactCredentials?: boolean;
7
+ /** Patterns in the "network" category. */ redactInfrastructure?: boolean;
8
+ /** Patterns in the "healthcare" category. */ redactMedical?: boolean;
9
+ /** NOTE(review): not read by the Redactor constructor in the bundled source - confirm intended use. */ customPlaceholders?: Record<string, string>;
10
+ }
11
+ /**
12
+ * Sovereign wrapper for the PII Shield engine (an OpenRedaction instance).
13
+ * Adopts a "Narrative-First" strategy: Preserves names, dates, and locations
14
+ * to maintain story coherence while redacting "Toxic Identifiers" (SSNs, Credit Cards, Secrets).
+ * The engine is configured with includeNames and includeAddresses disabled and
+ * placeholder-style, deterministic redaction.
15
+ */
16
+ declare class Redactor {
17
+ /** The configured engine; every method delegates to its detect call. */ private engine;
18
+ /** @param options Category switches; all categories are enabled by default. */ constructor(options?: RedactorOptions);
19
+ /**
20
+ * Redacts sensitive information from the given text.
+ * Falsy input (e.g. an empty string) is returned as-is without running the engine.
+ *
+ * @param text Text to scan.
+ * @returns The text with detected spans replaced by placeholders.
21
+ */
22
+ redact(text: string): Promise<string>;
23
+ /**
24
+ * Checks if the text contains any sensitive information without modifying it.
+ * Falsy input yields false without running the engine.
+ *
+ * @param text Text to scan.
+ * @returns true when the engine reports at least one detection.
25
+ */
26
+ hasSensitiveInfo(text: string): Promise<boolean>;
27
+ /**
28
+ * Performs both detection and redaction in a single pass.
+ * Falsy input is returned unchanged with hasSensitiveInfo set to false.
+ *
+ * @param text Text to scan.
+ * @returns The redacted text plus whether anything was detected.
29
+ */
30
+ process(text: string): Promise<{
31
+ redacted: string;
32
+ hasSensitiveInfo: boolean;
33
+ }>;
34
+ }
35
+
36
+ /**
37
+ * Custom mdream plugin to retain structural artifacts.
38
+ * This preserves the "structural footprint" of sensitive portals
39
+ * by injecting intermediate markers with attribute context.
+ * Markers are emitted for form, input, select, option, button, label, nav and a
+ * elements (e.g. [FORM_START], [INPUT:type:name:placeholder], [LINK:href]).
+ * Exported from the package under the name plugin.
40
+ */
41
+ declare const structuralPlugin: _mdream_js.TransformPlugin;
42
+
43
+ /**
44
+ * Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.
45
+ * This prevents Apple's NLTokenizer from stripping critical punctuation.
46
+ * We use camelCase because NLTokenizer splits snake_case (STRUCT_FORM_START -> STRUCT, FORM, START).
+ *
+ * After mapping, URLs and Markdown punctuation are removed and only words that
+ * start with "struct" or "sys", or capitalized words longer than two
+ * characters, are kept.
+ *
+ * NOTE(review): in the bundled implementation the link pattern runs before the
+ * image pattern, so the bracket part of an image is consumed as a link and
+ * structImage is never emitted; the leftover "!"-prefixed word is then dropped
+ * by the final filter. Confirm whether that is intended.
+ *
+ * @param text Markdown rendered with the structural plugin.
+ * @returns The surviving tokens joined by single spaces.
47
+ */
48
+ declare function mapToTokens(text: string): string;
49
+
50
+ /**
+ * String-valued labels for classifying an ingested page.
+ * NOTE(review): the pipeline source embedded in this package's source map only
+ * declares and exports this enum; processHtml never returns a label. Presumably
+ * it is consumed by a downstream classifier - confirm.
+ */ declare enum GateLabel {
51
+ SENSITIVE_PORTAL = "sensitive_portal",
52
+ DIGESTIBLE_ARTICLE = "digestible_article",
53
+ NOISE = "noise",
54
+ UNKNOWN = "unknown"
55
+ }
56
+ /** Result object resolved by processHtml. */ interface IngestionResult {
57
+ /**
58
+ * Alphanumeric tokenized text for Apple MaxEnt classifier.
59
+ * High structural fidelity, low natural language noise.
+ * Produced by running mapToTokens over the plugin-annotated Markdown.
60
+ */
61
+ structural: string;
62
+ /**
63
+ * Clean, readable Markdown for LLM analysis.
64
+ * Low structural noise, high semantic fidelity.
+ * Truncated to the token cap first, then redacted when redaction is requested.
65
+ */
66
+ semantic: string;
67
+ /**
68
+ * Whether the redactor reported at least one detection in the (possibly
+ * truncated) semantic content. processHtml sets this to false when redaction
+ * was not requested.
69
+ */
70
+ hasSensitiveInfo?: boolean;
71
+ /**
72
+ * Extracted metadata from the HTML (title, description, author, etc.)
+ * When absent, processHtml fills title from the first "# " heading and
+ * description from the first 150 characters of the remaining text.
73
+ */
74
+ metadata?: Record<string, string>;
75
+ /**
76
+ * Whether the semantic content was truncated due to the token cap.
+ * Tokens are estimated as ceil(word count * 1.33).
77
+ */
78
+ isTruncated?: boolean;
79
+ }
80
+ /** Per-call options accepted by processHtml. */ interface IngestionOptions {
81
+ /**
82
+ * Redaction configuration. If true, uses default settings.
+ * An object is passed to the Redactor constructor; a falsy value skips redaction.
83
+ */
84
+ redact?: RedactorOptions | boolean;
85
+ /**
86
+ * Override the semantic token cap for this specific call.
+ * When null or undefined, GlobalConfig.defaultSemanticTokenCap is used.
87
+ */
88
+ semanticTokenCap?: number;
89
+ }
90
+ /** Package-wide settings applied through initialize. */ interface GlobalConfig {
91
+ /**
92
+ * The default token cap for semantic returns. Defaults to 15,000.
+ * A per-call IngestionOptions.semanticTokenCap takes precedence.
93
+ */
94
+ defaultSemanticTokenCap?: number;
95
+ }
96
+ /**
97
+ * Initializes the Alete Gate ingestion substrate with global configuration.
+ * Merges config over the current global settings, writes a one-line banner with
+ * console.log, and marks the pipeline as initialized. processHtml calls this
+ * with no arguments on first use if it has not been called yet.
+ *
+ * @param config Settings to merge; omitted keys keep their current values.
98
+ */
99
+ declare function initialize(config?: GlobalConfig): void;
100
+ /**
101
+ * The unified ingestion pipeline for Alete Gate.
102
+ * Converts raw HTML into both structural tokens and semantic Markdown.
+ * The HTML is rendered twice: once with the structural plugin (then mapped to
+ * tokens) and once with the minimal preset with a, img, svg and canvas output
+ * suppressed. Missing title/description metadata is derived from the Markdown,
+ * the text is truncated to the token cap, and redaction runs last if requested.
+ *
+ * @param html Raw HTML to ingest.
+ * @param options Redaction and token-cap overrides for this call.
+ * @returns The structural tokens, semantic Markdown, metadata and flags.
103
+ */
104
+ declare function processHtml(html: string, options?: IngestionOptions): Promise<IngestionResult>;
105
+
106
+ export { GateLabel, type GlobalConfig, type IngestionOptions, type IngestionResult, Redactor, type RedactorOptions, initialize, mapToTokens, structuralPlugin as plugin, processHtml };
package/dist/index.js ADDED
@@ -0,0 +1,12 @@
1
+ /*
+ * Minified ESM build of @alete-ai/gate-ingest. NOTE(review): this looks like
+ * generated output (the package build script is tsup and the source map lists
+ * ../src/config.ts, ../src/token-mapper.ts, ../src/sanitization/Redactor.ts and
+ * ../src/index.ts) - change the sources and rebuild rather than editing here.
+ *
+ * Minified name -> export (see the export list at the end of the bundle):
+ *   p -> plugin (structuralPlugin)   d -> mapToTokens   o -> Redactor
+ *   _ -> GateLabel                   I -> initialize    w -> processHtml
+ * Internal helpers, per the source map names:
+ *   b = estimateTokens (ceil of word count times 1.33), C = truncateToCap,
+ *   T = DEFAULTS, l = globalConfig, h = isInitialized.
+ *
+ * Review notes on behaviour visible in this bundle:
+ *  - d: the link replace runs before the image replace, so the link pattern
+ *    consumes the bracket part of an image and structImage is never produced.
+ *  - C: truncation splits on whitespace and re-joins with single spaces, so the
+ *    kept Markdown loses its line breaks.
+ *  - I: writes a banner with console.log; w calls I() on first use.
+ *  - w: the token cap is applied before redaction, so hasSensitiveInfo only
+ *    reflects the truncated text.
+ */ import {htmlToMarkdown,withMinimalPreset}from'@mdream/js';import {createPlugin}from'@mdream/js/plugins';import {getPatternsByCategory,OpenRedaction}from'openredaction';var p=createPlugin({onNodeEnter(a){let e=a.name,t=a.attributes||{};switch(e){case "form":return `
2
+ [FORM_START]
3
+ `;case "input":{let r=t.type||"text",n=t.name||t.id||"",c=t.placeholder||"";return `[INPUT:${r}:${n}:${c}]`}case "select":return "[SELECT_START]";case "option":return " (OPTION:";case "button":return "[BUTTON:";case "label":return "LABEL[";case "nav":return `
4
+ [NAV_START]
5
+ `;case "a":return `[LINK:${t.href||"#"}]`;default:return}},onNodeExit(a){switch(a.name){case "form":return `
6
+ [FORM_END]
7
+ `;case "select":return "[SELECT_END]";case "option":return ") ";case "button":return "]";case "label":return "]";case "nav":return `
8
+ [NAV_END]
9
+ `;default:return}}});function d(a){let e=a.replace(/\\?\[FORM_START\\?\]/g,"structFormStart").replace(/\\?\[FORM_END\\?\]/g,"structFormEnd").replace(/\\?\[SELECT_START\\?\]/g,"structSelectStart").replace(/\\?\[SELECT_END\\?\]/g,"structSelectEnd").replace(/\\?\[NAV_START\\?\]/g,"structNavStart").replace(/\\?\[NAV_END\\?\]/g,"structNavEnd");return e=e.replace(/LABEL\\?\[/g,"structLabel "),e=e.replace(/\\?\[INPUT:([^:]+):([^:]*):([^\\\]]*)\\?\]/g,(t,r,n)=>{let c=n.replace(/[^a-zA-Z0-9]/g,"").slice(0,20);return `structInput${r.charAt(0).toUpperCase()+r.slice(1)}${c}`}).replace(/\\?\[LINK:[^\\\]]+\\?\]/g,()=>"structLinkElement").replace(/\\?\[BUTTON:([^\\\]]+)\\?\]/g,(t,r)=>`structButton${r.replace(/[^a-zA-Z0-9]/g,"").slice(0,20)}`),e=e.replace(/\[([^\]]*)\]\(([^)]+)\)/g,(t,r)=>`structLinkElement${r.replace(/[^a-zA-Z0-9]/g,"").slice(0,20)}`).replace(/!\[([^\]]*)\]\(([^)]+)\)/g,(t,r)=>`structImage${r.replace(/[^a-zA-Z0-9]/g,"").slice(0,20)}`),e=e.replace(/^# (.*$)/gm,(t,r)=>`sysHeader1 ${r.replace(/\[([^\]]+)\]\([^)]+\)/g,"$1").replace(/[^a-zA-Z0-9]/g,"").slice(0,30)}`).replace(/^## (.*$)/gm,(t,r)=>`sysHeader2 ${r.replace(/\[([^\]]+)\]\([^)]+\)/g,"$1").replace(/[^a-zA-Z0-9]/g,"").slice(0,30)}`).replace(/^### (.*$)/gm,(t,r)=>`sysHeader3 ${r.replace(/\[([^\]]+)\]\([^)]+\)/g,"$1").replace(/[^a-zA-Z0-9]/g,"").slice(0,30)}`),e=e.replace(/https?:\/\/[^\s]+/g,"").replace(/[#*`_\[\]():]/g," "),e.split(/\s+/).filter(t=>t.startsWith("struct")||t.startsWith("sys")||/^[A-Z]/.test(t)&&t.length>2).join(" ").trim()}var o=class{engine;constructor(e={}){let 
t=[];e.redactPii!==false&&(t.push("EMAIL"),t.push("PHONE_US","PHONE_UK","PHONE_INTERNATIONAL"),t.push(...getPatternsByCategory("government").map(n=>n.type))),e.redactFinancials!==false&&t.push(...getPatternsByCategory("financial").map(n=>n.type)),e.redactCredentials!==false&&t.push(...getPatternsByCategory("credentials").map(n=>n.type)),e.redactMedical!==false&&t.push(...getPatternsByCategory("healthcare").map(n=>n.type)),e.redactInfrastructure!==false&&t.push(...getPatternsByCategory("network").map(n=>n.type));let r={patterns:t,includeNames:false,includeAddresses:false,redactionMode:"placeholder",enableContextAnalysis:true,enableFalsePositiveFilter:true,falsePositiveThreshold:.7,deterministic:true};this.engine=new OpenRedaction(r);}async redact(e){return e&&(await this.engine.detect(e)).redacted}async hasSensitiveInfo(e){return e?(await this.engine.detect(e)).detections.length>0:false}async process(e){if(!e)return {redacted:e,hasSensitiveInfo:false};let t=await this.engine.detect(e);return {redacted:t.redacted,hasSensitiveInfo:t.detections.length>0}}};var _=(n=>(n.SENSITIVE_PORTAL="sensitive_portal",n.DIGESTIBLE_ARTICLE="digestible_article",n.NOISE="noise",n.UNKNOWN="unknown",n))(_||{}),T={SEMANTIC_TOKEN_CAP:15e3},l={defaultSemanticTokenCap:T.SEMANTIC_TOKEN_CAP},h=false;function I(a={}){l={...l,...a},console.log(`\u{1F6E1}\uFE0F Alete Gate: Ingestion substrate initialized. Semantic cap: ${l.defaultSemanticTokenCap} tokens. Explore our ecosystem at https://alete.ai/`),h=true;}function b(a){let e=a.trim().split(/\s+/).length;return Math.ceil(e*1.33)}function C(a,e){if(b(a)<=e)return {truncated:a,isTruncated:false};let r=Math.floor(e/1.33);return {truncated:a.trim().split(/\s+/).slice(0,r).join(" ")+`
10
+
11
+ ... [Content truncated due to token cap]`,isTruncated:true}}async function w(a,e={}){h||I();let t=htmlToMarkdown(a,{hooks:[p]}),r={},n=htmlToMarkdown(a,withMinimalPreset({isolateMain:false,plugins:{tagOverrides:{a:{enter:"",exit:""},img:{enter:"",exit:""},svg:{enter:"",exit:""},canvas:{enter:"",exit:""}},frontmatter:{onExtract:s=>{r=s;}}}})).trim();if(!r.title){let s=n.match(/^# (.*)$/m);s&&(r.title=s[1].replace(/[#*`_]/g,"").trim());}if(!r.description){let s=n.replace(/^# .*$/m,"").trim();if(s){let u=s.replace(/[#*`_]/g,"").trim();r.description=u.slice(0,150).replace(/\n/g," ")+(u.length>150?"...":"");}}let c=e.semanticTokenCap??l.defaultSemanticTokenCap??T.SEMANTIC_TOKEN_CAP,{truncated:E,isTruncated:N}=C(n,c);n=E;let g=false;if(e.redact){let s=typeof e.redact=="object"?e.redact:{},f=await new o(s).process(n);n=f.redacted,g=f.hasSensitiveInfo;}return {structural:d(t),semantic:n,hasSensitiveInfo:g,metadata:r,isTruncated:N}}export{_ as GateLabel,o as Redactor,I as initialize,d as mapToTokens,p as plugin,w as processHtml};//# sourceMappingURL=index.js.map
12
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/config.ts","../src/token-mapper.ts","../src/sanitization/Redactor.ts","../src/index.ts"],"names":["structuralPlugin","createPlugin","element","tag","attrs","type","name","placeholder","mapToTokens","text","processed","_","cleanName","content","alt","word","Redactor","options","patterns","getPatternsByCategory","p","engineOptions","OpenRedaction","result","GateLabel","DEFAULTS","globalConfig","isInitialized","initialize","config","estimateTokens","wordCount","truncateToCap","cap","wordsToKeep","processHtml","html","structuralMd","htmlToMarkdown","metadata","semantic","withMinimalPreset","fm","h1Match","bodyOnly","cleanBody","truncated","isTruncated","hasSensitiveInfo","redactorOptions"],"mappings":"wKAOO,IAAMA,CAAAA,CAAmBC,YAAAA,CAAa,CAC3C,WAAA,CAAYC,CAAAA,CAAS,CACnB,IAAMC,CAAAA,CAAMD,CAAAA,CAAQ,IAAA,CACdE,CAAAA,CAAQF,CAAAA,CAAQ,UAAA,EAAc,EAAC,CAErC,OAAQC,CAAAA,EACN,KAAK,OACH,OAAO;AAAA;AAAA,CAAA,CACT,KAAK,QAAS,CACZ,IAAME,EAAOD,CAAAA,CAAM,IAAA,EAAQ,OACrBE,CAAAA,CAAOF,CAAAA,CAAM,MAAQA,CAAAA,CAAM,EAAA,EAAM,GACjCG,CAAAA,CAAcH,CAAAA,CAAM,aAAe,EAAA,CACzC,OAAO,UAAUC,CAAI,CAAA,CAAA,EAAIC,CAAI,CAAA,CAAA,EAAIC,CAAW,GAC9C,CACA,KAAK,SACH,OAAO,gBAAA,CACT,KAAK,QAAA,CACH,OAAO,YACT,KAAK,QAAA,CACH,OAAO,UAAA,CACT,KAAK,QACH,OAAO,QAAA,CACT,KAAK,KAAA,CACH,OAAO;AAAA;AAAA,CAAA,CACT,KAAK,IAEH,OAAO,CAAA,MAAA,EADMH,EAAM,IAAA,EAAQ,GACP,IAEtB,QACE,MACJ,CACF,CAAA,CAEA,UAAA,CAAWF,EAAS,CAElB,OADYA,EAAQ,IAAA,EAElB,KAAK,MAAA,CACH,OAAO;AAAA;AAAA,CAAA,CACT,KAAK,QAAA,CACH,OAAO,eACT,KAAK,QAAA,CACH,OAAO,IAAA,CACT,KAAK,QAAA,CACH,OAAO,IACT,KAAK,OAAA,CACH,OAAO,GAAA,CACT,KAAK,MACH,OAAO;AAAA;AAAA,CAAA,CACT,QACE,MACJ,CACF,CACF,CAAC,ECtDM,SAASM,CAAAA,CAAYC,CAAAA,CAAsB,CAGhD,IAAIC,CAAAA,CAAYD,EACb,OAAA,CAAQ,uBAAA,CAAyB,iBAAiB,CAAA,CAClD,OAAA,CAAQ,qBAAA,CAAuB,eAAe,CAAA,CAC9C,OAAA,CAAQ,yBAAA,CAA2B,mBAAmB,CAAA,CACtD,OAAA,CAAQ,uBAAA,CAAyB,iBAAiB,CAAA,CAClD,QAAQ,sBAAA,CAAwB,gBAAgB,CAAA,CAChD,OAAA,CAAQ,oBAAA,CAAsB,cAAc,CAAA,CAG/C,OAAAC,CAAAA,CAAYA,CAAAA,CAAU,OAAA,CAAQ,aAAA,CAAe,cAAc,C
AAA,CAG3DA,CAAAA,CAAYA,CAAAA,CAET,QAAQ,6CAAA,CAA+C,CAACC,CAAAA,CAAGN,CAAAA,CAAMC,CAAAA,GAAS,CACzE,IAAMM,CAAAA,CAAYN,EAAK,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAAA,CAC/D,OAAO,CAAA,WAAA,EAAcD,CAAAA,CAAK,MAAA,CAAO,CAAC,CAAA,CAAE,WAAA,EAAY,CAAIA,CAAAA,CAAK,KAAA,CAAM,CAAC,CAAC,CAAA,EAAGO,CAAS,CAAA,CAC/E,CAAC,CAAA,CAEA,QAAQ,0BAAA,CAA4B,IAAM,mBAAmB,CAAA,CAE7D,OAAA,CAAQ,8BAAA,CAAgC,CAACD,CAAAA,CAAGF,CAAAA,GAEpC,CAAA,YAAA,EADOA,CAAAA,CAAK,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,EAAG,EAAE,CAChC,CAAA,CAC5B,CAAA,CAGHC,CAAAA,CAAYA,CAAAA,CAET,OAAA,CAAQ,0BAAA,CAA4B,CAACC,CAAAA,CAAGE,CAAAA,GAEhC,CAAA,iBAAA,EADOA,CAAAA,CAAQ,OAAA,CAAQ,eAAA,CAAiB,EAAE,EAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC9B,CAAA,CACjC,CAAA,CAEA,OAAA,CAAQ,2BAAA,CAA6B,CAACF,CAAAA,CAAGG,CAAAA,GAEjC,CAAA,WAAA,EADOA,CAAAA,CAAI,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,MAAM,CAAA,CAAG,EAAE,CAChC,CAAA,CAC3B,CAAA,CAGHJ,CAAAA,CAAYA,CAAAA,CACT,OAAA,CAAQ,YAAA,CAAc,CAACC,CAAAA,CAAGE,CAAAA,GAElB,CAAA,WAAA,EADOA,CAAAA,CAAQ,OAAA,CAAQ,wBAAA,CAA0B,IAAI,CAAA,CAAE,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC5E,CAAA,CAC3B,CAAA,CACA,OAAA,CAAQ,aAAA,CAAe,CAACF,CAAAA,CAAGE,CAAAA,GAEnB,cADOA,CAAAA,CAAQ,OAAA,CAAQ,wBAAA,CAA0B,IAAI,CAAA,CAAE,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC5E,CAAA,CAC3B,CAAA,CACA,OAAA,CAAQ,eAAgB,CAACF,CAAAA,CAAGE,CAAAA,GAEpB,CAAA,WAAA,EADOA,CAAAA,CAAQ,OAAA,CAAQ,wBAAA,CAA0B,IAAI,EAAE,OAAA,CAAQ,eAAA,CAAiB,EAAE,CAAA,CAAE,KAAA,CAAM,CAAA,CAAG,EAAE,CAC5E,EAC3B,CAAA,CAGHH,CAAAA,CAAYA,CAAAA,CACT,OAAA,CAAQ,oBAAA,CAAsB,EAAE,CAAA,CAChC,OAAA,CAAQ,gBAAA,CAAkB,GAAG,CAAA,CAGzBA,CAAAA,CACJ,KAAA,CAAM,KAAK,CAAA,CACX,MAAA,CAAOK,GACCA,CAAAA,CAAK,UAAA,CAAW,QAAQ,CAAA,EACxBA,CAAAA,CAAK,UAAA,CAAW,KAAK,CAAA,EACpB,SAAS,IAAA,CAAKA,CAAI,CAAA,EAAKA,CAAAA,CAAK,MAAA,CAAS,CAC9C,CAAA,CACA,IAAA,CAAK,GAAG,CAAA,CACR,IAAA,EACL,CC7DO,IAAMC,CAAAA,CAAN,KAAe,CACZ,MAAA,CAER,WAAA,CAAYC,EAA2B,EAAC,CAAG,CACzC,IAAMC,CAAAA,CAAqB,EAAC,CAGxBD,CAAAA,CAAQ,YAAc,KAAA,GACxBC,CAAAA,CAAS,IAAA,CAAK,OAAO,CAAA,CACrBA
,CAAAA,CAAS,IAAA,CAAK,UAAA,CAAY,WAAY,qBAAqB,CAAA,CAC3DA,CAAAA,CAAS,IAAA,CAAK,GAAGC,qBAAAA,CAAsB,YAAY,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAAA,CAInEH,CAAAA,CAAQ,gBAAA,GAAqB,OAC/BC,CAAAA,CAAS,IAAA,CAAK,GAAGC,qBAAAA,CAAsB,WAAW,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAIlEH,CAAAA,CAAQ,iBAAA,GAAsB,KAAA,EAChCC,CAAAA,CAAS,KAAK,GAAGC,qBAAAA,CAAsB,aAAa,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAIpEH,CAAAA,CAAQ,aAAA,GAAkB,KAAA,EAC5BC,CAAAA,CAAS,IAAA,CAAK,GAAGC,sBAAsB,YAAY,CAAA,CAAE,GAAA,CAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAInEH,EAAQ,oBAAA,GAAyB,KAAA,EACnCC,CAAAA,CAAS,IAAA,CAAK,GAAGC,qBAAAA,CAAsB,SAAS,CAAA,CAAE,IAAIC,CAAAA,EAAKA,CAAAA,CAAE,IAAI,CAAC,CAAA,CAGpE,IAAMC,CAAAA,CAAsC,CAC1C,QAAA,CAAAH,CAAAA,CAEA,YAAA,CAAc,KAAA,CACd,gBAAA,CAAkB,KAAA,CAElB,aAAA,CAAe,aAAA,CACf,sBAAuB,IAAA,CACvB,yBAAA,CAA2B,IAAA,CAC3B,sBAAA,CAAwB,EAAA,CACxB,aAAA,CAAe,IACjB,CAAA,CAEA,KAAK,MAAA,CAAS,IAAII,aAAAA,CAAcD,CAAa,EAC/C,CAKA,MAAa,MAAA,CAAOZ,EAA+B,CACjD,OAAKA,CAAAA,EAAAA,CACU,MAAM,IAAA,CAAK,MAAA,CAAO,MAAA,CAAOA,CAAI,CAAA,EAC9B,QAChB,CAKA,MAAa,gBAAA,CAAiBA,CAAAA,CAAgC,CAC5D,OAAKA,GACU,MAAM,IAAA,CAAK,MAAA,CAAO,MAAA,CAAOA,CAAI,CAAA,EAC9B,UAAA,CAAW,MAAA,CAAS,EAFhB,KAGpB,CAKA,MAAa,OAAA,CAAQA,CAAAA,CAAwE,CAC3F,GAAI,CAACA,EAAM,OAAO,CAAE,QAAA,CAAUA,CAAAA,CAAM,gBAAA,CAAkB,KAAM,CAAA,CAC5D,IAAMc,CAAAA,CAAS,MAAM,IAAA,CAAK,MAAA,CAAO,MAAA,CAAOd,CAAI,CAAA,CAC5C,OAAO,CACL,QAAA,CAAUc,CAAAA,CAAO,QAAA,CACjB,gBAAA,CAAkBA,CAAAA,CAAO,UAAA,CAAW,MAAA,CAAS,CAC/C,CACF,CACF,ECzFO,IAAKC,CAAAA,CAAAA,CAAAA,CAAAA,GACVA,CAAAA,CAAA,gBAAA,CAAmB,kBAAA,CACnBA,EAAA,kBAAA,CAAqB,oBAAA,CACrBA,CAAAA,CAAA,KAAA,CAAQ,OAAA,CACRA,CAAAA,CAAA,OAAA,CAAU,SAAA,CAJAA,CAAAA,CAAAA,EAAAA,CAAAA,EAAA,EAAA,CAAA,CAuDNC,CAAAA,CAAW,CACf,kBAAA,CAAoB,IACtB,CAAA,CAEIC,CAAAA,CAA6B,CAC/B,uBAAA,CAAyBD,CAAAA,CAAS,kBACpC,CAAA,CAEIE,CAAAA,CAAgB,MAKb,SAASC,CAAAA,CAAWC,EAAuB,EAAC,CAAS,CAC1DH,CAAAA,CAAe,CAAE,GAAGA,CAAAA,CAAc,GAAGG,CAAO,CAAA,CAC5C,OAAA,CAAQ,GAAA,CAAI,CAAA,2EAAA,EAAkEH,CAAAA,CAAa,uBAAuB,C
AAA,mDAAA,CAAqD,CAAA,CACvKC,CAAAA,CAAgB,KAClB,CAMA,SAASG,CAAAA,CAAerB,CAAAA,CAAsB,CAC5C,IAAMsB,EAAYtB,CAAAA,CAAK,IAAA,EAAK,CAAE,KAAA,CAAM,KAAK,CAAA,CAAE,MAAA,CAC3C,OAAO,KAAK,IAAA,CAAKsB,CAAAA,CAAY,IAAI,CACnC,CAKA,SAASC,CAAAA,CAAcvB,CAAAA,CAAcwB,EAA0D,CAE7F,GADeH,CAAAA,CAAerB,CAAI,CAAA,EACpBwB,CAAAA,CACZ,OAAO,CAAE,SAAA,CAAWxB,CAAAA,CAAM,WAAA,CAAa,KAAM,CAAA,CAI/C,IAAMyB,CAAAA,CAAc,IAAA,CAAK,MAAMD,CAAAA,CAAM,IAAI,CAAA,CAIzC,OAAO,CAAE,SAAA,CAHKxB,CAAAA,CAAK,IAAA,GAAO,KAAA,CAAM,KAAK,CAAA,CACb,KAAA,CAAM,CAAA,CAAGyB,CAAW,CAAA,CAAE,IAAA,CAAK,GAAG,CAAA,CAAI;;AAAA,wCAAA,CAAA,CAEtC,WAAA,CAAa,IAAK,CACxC,CAMA,eAAsBC,CAAAA,CAAYC,CAAAA,CAAcnB,CAAAA,CAA4B,EAAC,CAA6B,CACnGU,GACHC,CAAAA,EAAW,CAIb,IAAMS,CAAAA,CAAeC,cAAAA,CAAeF,CAAAA,CAAM,CACxC,KAAA,CAAO,CAACpC,CAAgB,CAC1B,CAAC,CAAA,CAEGuC,EAAmC,EAAC,CACpCC,CAAAA,CAAWF,cAAAA,CAAeF,CAAAA,CAAMK,iBAAAA,CAAkB,CACpD,WAAA,CAAa,KAAA,CACb,OAAA,CAAS,CACP,YAAA,CAAc,CACZ,EAAG,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAAA,CACzB,IAAK,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAAA,CAC3B,IAAK,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAAA,CAC3B,OAAQ,CAAE,KAAA,CAAO,EAAA,CAAI,IAAA,CAAM,EAAG,CAChC,EACA,WAAA,CAAa,CACX,SAAA,CAAYC,CAAAA,EAAO,CACjBH,CAAAA,CAAWG,EACb,CACF,CACF,CACF,CAAC,CAAC,CAAA,CAAE,MAAK,CAGT,GAAI,CAACH,CAAAA,CAAS,KAAA,CAAO,CACnB,IAAMI,CAAAA,CAAUH,CAAAA,CAAS,KAAA,CAAM,WAAW,CAAA,CACtCG,CAAAA,GAEFJ,EAAS,KAAA,CAAQI,CAAAA,CAAQ,CAAC,CAAA,CAAE,OAAA,CAAQ,SAAA,CAAW,EAAE,CAAA,CAAE,IAAA,EAAK,EAE5D,CACA,GAAI,CAACJ,EAAS,WAAA,CAAa,CAEzB,IAAMK,CAAAA,CAAWJ,CAAAA,CAAS,QAAQ,SAAA,CAAW,EAAE,CAAA,CAAE,IAAA,EAAK,CACtD,GAAII,EAAU,CAEZ,IAAMC,CAAAA,CAAYD,CAAAA,CAAS,OAAA,CAAQ,SAAA,CAAW,EAAE,CAAA,CAAE,IAAA,EAAK,CACvDL,CAAAA,CAAS,WAAA,CAAcM,CAAAA,CAAU,MAAM,CAAA,CAAG,GAAG,CAAA,CAAE,OAAA,CAAQ,KAAA,CAAO,GAAG,GAAKA,CAAAA,CAAU,MAAA,CAAS,GAAA,CAAM,KAAA,CAAQ,EAAA,EACzG,CACF,CAGA,IAAMZ,CAAAA,CAAMhB,CAAAA,CAAQ,gBAAA,EAAoBS,CAAAA,CAAa,uBAAA,EAA2BD,EAAS,kBAAA,CACnF,CAAE,SAAA,CAAAqB,CAAAA,CAAW,WAAA,CAAAC,CAAY,EAAIf,CAAAA,CAAcQ,CAAAA,CAAUP,CAAG,CAAA,CAC9DO,CAAAA,CAAWM,CAAAA,CAEX,IAAI
E,CAAAA,CAAmB,KAAA,CAGvB,GAAI/B,CAAAA,CAAQ,MAAA,CAAQ,CAClB,IAAMgC,CAAAA,CAAkB,OAAOhC,CAAAA,CAAQ,MAAA,EAAW,QAAA,CAAWA,CAAAA,CAAQ,OAAS,EAAC,CAEzEM,CAAAA,CAAS,MADE,IAAIP,CAAAA,CAASiC,CAAe,CAAA,CACf,OAAA,CAAQT,CAAQ,CAAA,CAC9CA,CAAAA,CAAWjB,CAAAA,CAAO,SAClByB,CAAAA,CAAmBzB,CAAAA,CAAO,iBAC5B,CAEA,OAAO,CACL,WAAYf,CAAAA,CAAY6B,CAAY,CAAA,CACpC,QAAA,CAAAG,CAAAA,CACA,gBAAA,CAAAQ,EACA,QAAA,CAAAT,CAAAA,CACA,WAAA,CAAAQ,CACF,CACF","file":"index.js","sourcesContent":["import { createPlugin } from '@mdream/js/plugins';\n\n/**\n * Custom mdream plugin to retain structural artifacts.\n * This preserves the \"structural footprint\" of sensitive portals\n * by injecting intermediate markers with attribute context.\n */\nexport const structuralPlugin = createPlugin({\n onNodeEnter(element) {\n const tag = element.name;\n const attrs = element.attributes || {};\n\n switch (tag) {\n case 'form':\n return '\\n[FORM_START]\\n';\n case 'input': {\n const type = attrs.type || 'text';\n const name = attrs.name || attrs.id || '';\n const placeholder = attrs.placeholder || '';\n return `[INPUT:${type}:${name}:${placeholder}]`;\n }\n case 'select':\n return '[SELECT_START]';\n case 'option':\n return ' (OPTION:';\n case 'button':\n return '[BUTTON:';\n case 'label':\n return 'LABEL[';\n case 'nav':\n return '\\n[NAV_START]\\n';\n case 'a': {\n const href = attrs.href || '#';\n return `[LINK:${href}]`;\n }\n default:\n return undefined;\n }\n },\n\n onNodeExit(element) {\n const tag = element.name;\n switch (tag) {\n case 'form':\n return '\\n[FORM_END]\\n';\n case 'select':\n return '[SELECT_END]';\n case 'option':\n return ') ';\n case 'button':\n return ']';\n case 'label':\n return ']';\n case 'nav':\n return '\\n[NAV_END]\\n';\n default:\n return undefined;\n }\n }\n});\n","/**\n * Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.\n * This prevents Apple's NLTokenizer from stripping critical punctuation.\n * We use camelCase because NLTokenizer splits snake_case 
(STRUCT_FORM_START -> STRUCT, FORM, START).\n */\nexport function mapToTokens(text: string): string {\n // 1. Process explicit markers from structuralPlugin\n // We handle potential escaping from mdream\n let processed = text\n .replace(/\\\\?\\[FORM_START\\\\?\\]/g, 'structFormStart')\n .replace(/\\\\?\\[FORM_END\\\\?\\]/g, 'structFormEnd')\n .replace(/\\\\?\\[SELECT_START\\\\?\\]/g, 'structSelectStart')\n .replace(/\\\\?\\[SELECT_END\\\\?\\]/g, 'structSelectEnd')\n .replace(/\\\\?\\[NAV_START\\\\?\\]/g, 'structNavStart')\n .replace(/\\\\?\\[NAV_END\\\\?\\]/g, 'structNavEnd');\n\n // 1.1 Process Label marker\n processed = processed.replace(/LABEL\\\\?\\[/g, 'structLabel ');\n\n // 2. Process attribute-based markers\n processed = processed\n // Inputs: [INPUT:type:name:placeholder] -> structInputType {type} {name}\n .replace(/\\\\?\\[INPUT:([^:]+):([^:]*):([^\\\\\\]]*)\\\\?\\]/g, (_, type, name) => {\n const cleanName = name.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structInput${type.charAt(0).toUpperCase() + type.slice(1)}${cleanName}`;\n })\n // Links: [LINK:url] -> structLink\n .replace(/\\\\?\\[LINK:[^\\\\\\]]+\\\\?\\]/g, () => 'structLinkElement')\n // Buttons: [BUTTON:text] -> structButton {text}\n .replace(/\\\\?\\[BUTTON:([^\\\\\\]]+)\\\\?\\]/g, (_, text) => {\n const clean = text.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structButton${clean}`;\n });\n\n // 3. Process Standard Markdown artifacts (if any remain) into clean tokens\n processed = processed\n // Links: [text](url) -> structLinkElement {text}\n .replace(/\\[([^\\]]*)\\]\\(([^)]+)\\)/g, (_, content) => {\n const clean = content.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structLinkElement${clean}`;\n })\n // Images: ![alt](url) -> structImage {alt}\n .replace(/!\\[([^\\]]*)\\]\\(([^)]+)\\)/g, (_, alt) => {\n const clean = alt.replace(/[^a-zA-Z0-9]/g, '').slice(0, 20);\n return `structImage${clean}`;\n });\n\n // 4. 
Process Headers into clean tokens\n processed = processed\n .replace(/^# (.*$)/gm, (_, content) => {\n const clean = content.replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1').replace(/[^a-zA-Z0-9]/g, '').slice(0, 30);\n return `sysHeader1 ${clean}`;\n })\n .replace(/^## (.*$)/gm, (_, content) => {\n const clean = content.replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1').replace(/[^a-zA-Z0-9]/g, '').slice(0, 30);\n return `sysHeader2 ${clean}`;\n })\n .replace(/^### (.*$)/gm, (_, content) => {\n const clean = content.replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1').replace(/[^a-zA-Z0-9]/g, '').slice(0, 30);\n return `sysHeader3 ${clean}`;\n });\n\n // 5. Final cleaning: remove URLs and punctuation (including colons now)\n processed = processed\n .replace(/https?:\\/\\/[^\\s]+/g, '') // Remove URLs\n .replace(/[#*`_\\[\\]():]/g, ' '); // Remove remaining markdown chars + colons\n\n // 6. Aggressively strip remaining natural language noise\n return processed\n .split(/\\s+/)\n .filter(word => {\n return word.startsWith('struct') || \n word.startsWith('sys') || \n (/^[A-Z]/.test(word) && word.length > 2); // Keep capitalized words (titles, labels) > 2 chars\n })\n .join(' ')\n .trim();\n}\n","import { OpenRedaction, type OpenRedactionOptions, getPatternsByCategory } from 'openredaction';\n\nexport interface RedactorOptions {\n redactPii?: boolean;\n redactFinancials?: boolean;\n redactCredentials?: boolean;\n redactInfrastructure?: boolean;\n redactMedical?: boolean;\n customPlaceholders?: Record<string, string>;\n}\n\n/**\n * Sovereign wrapper for the PII Shield engine.\n * Adopts a \"Narrative-First\" strategy: Preserves names, dates, and locations\n * to maintain story coherence while redacting \"Toxic Identifiers\" (SSNs, Credit Cards, Secrets).\n */\nexport class Redactor {\n private engine: OpenRedaction;\n\n constructor(options: RedactorOptions = {}) {\n const patterns: string[] = [];\n \n // 1. 
Personal & Contact (excluding Names, Addresses, Dates)\n if (options.redactPii !== false) {\n patterns.push('EMAIL');\n patterns.push('PHONE_US', 'PHONE_UK', 'PHONE_INTERNATIONAL');\n patterns.push(...getPatternsByCategory('government').map(p => p.type));\n }\n\n // 2. Financial\n if (options.redactFinancials !== false) {\n patterns.push(...getPatternsByCategory('financial').map(p => p.type));\n }\n\n // 3. Credentials\n if (options.redactCredentials !== false) {\n patterns.push(...getPatternsByCategory('credentials').map(p => p.type));\n }\n\n // 4. Medical\n if (options.redactMedical !== false) {\n patterns.push(...getPatternsByCategory('healthcare').map(p => p.type));\n }\n\n // 5. Infrastructure\n if (options.redactInfrastructure !== false) {\n patterns.push(...getPatternsByCategory('network').map(p => p.type));\n }\n\n const engineOptions: OpenRedactionOptions = {\n patterns,\n // Narrative-First safety: Double-down on disabling entities\n includeNames: false,\n includeAddresses: false,\n \n redactionMode: 'placeholder',\n enableContextAnalysis: true,\n enableFalsePositiveFilter: true,\n falsePositiveThreshold: 0.7,\n deterministic: true,\n };\n\n this.engine = new OpenRedaction(engineOptions);\n }\n\n /**\n * Redacts sensitive information from the given text.\n */\n public async redact(text: string): Promise<string> {\n if (!text) return text;\n const result = await this.engine.detect(text);\n return result.redacted;\n }\n\n /**\n * Checks if the text contains any sensitive information without modifying it.\n */\n public async hasSensitiveInfo(text: string): Promise<boolean> {\n if (!text) return false;\n const result = await this.engine.detect(text);\n return result.detections.length > 0;\n }\n\n /**\n * Performs both detection and redaction in a single pass.\n */\n public async process(text: string): Promise<{ redacted: string, hasSensitiveInfo: boolean }> {\n if (!text) return { redacted: text, hasSensitiveInfo: false };\n const result = await 
this.engine.detect(text);\n return {\n redacted: result.redacted,\n hasSensitiveInfo: result.detections.length > 0\n };\n }\n}\n","import { htmlToMarkdown, withMinimalPreset } from '@mdream/js';\nimport { structuralPlugin } from './config.js';\nimport { mapToTokens } from './token-mapper.js';\nimport { Redactor, type RedactorOptions } from './sanitization/Redactor.js';\n\nexport enum GateLabel {\n SENSITIVE_PORTAL = 'sensitive_portal',\n DIGESTIBLE_ARTICLE = 'digestible_article',\n NOISE = 'noise',\n UNKNOWN = 'unknown',\n}\n\nexport interface IngestionResult {\n /**\n * Alphanumeric tokenized text for Apple MaxEnt classifier.\n * High structural fidelity, low natural language noise.\n */\n structural: string;\n \n /**\n * Clean, readable Markdown for LLM analysis.\n * Low structural noise, high semantic fidelity.\n */\n semantic: string;\n\n /**\n * Whether the semantic content contains sensitive PII that was redacted.\n */\n hasSensitiveInfo?: boolean;\n\n /**\n * Extracted metadata from the HTML (title, description, author, etc.)\n */\n metadata?: Record<string, string>;\n\n /**\n * Whether the semantic content was truncated due to the token cap.\n */\n isTruncated?: boolean;\n}\n\nexport interface IngestionOptions {\n /**\n * Redaction configuration. If true, uses default settings.\n */\n redact?: RedactorOptions | boolean;\n\n /**\n * Override the semantic token cap for this specific call.\n */\n semanticTokenCap?: number;\n}\n\nexport interface GlobalConfig {\n /**\n * The default token cap for semantic returns. 
Defaults to 15,000.\n */\n defaultSemanticTokenCap?: number;\n}\n\nconst DEFAULTS = {\n SEMANTIC_TOKEN_CAP: 15000,\n};\n\nlet globalConfig: GlobalConfig = {\n defaultSemanticTokenCap: DEFAULTS.SEMANTIC_TOKEN_CAP,\n};\n\nlet isInitialized = false;\n\n/**\n * Initializes the Alete Gate ingestion substrate with global configuration.\n */\nexport function initialize(config: GlobalConfig = {}): void {\n globalConfig = { ...globalConfig, ...config };\n console.log(`🛡️ Alete Gate: Ingestion substrate initialized. Semantic cap: ${globalConfig.defaultSemanticTokenCap} tokens. Explore our ecosystem at https://alete.ai/`);\n isInitialized = true;\n}\n\n/**\n * Estimates the number of tokens in a text string.\n * Uses the 1.33x multiplier (tokens per word) for a safe estimation.\n */\nfunction estimateTokens(text: string): number {\n const wordCount = text.trim().split(/\\s+/).length;\n return Math.ceil(wordCount * 1.33);\n}\n\n/**\n * Truncates text to fit within a token cap.\n */\nfunction truncateToCap(text: string, cap: number): { truncated: string; isTruncated: boolean } {\n const tokens = estimateTokens(text);\n if (tokens <= cap) {\n return { truncated: text, isTruncated: false };\n }\n\n // Calculate approximate words to keep\n const wordsToKeep = Math.floor(cap / 1.33);\n const words = text.trim().split(/\\s+/);\n const truncated = words.slice(0, wordsToKeep).join(' ') + '\\n\\n... [Content truncated due to token cap]';\n \n return { truncated, isTruncated: true };\n}\n\n/**\n * The unified ingestion pipeline for Alete Gate.\n * Converts raw HTML into both structural tokens and semantic Markdown.\n */\nexport async function processHtml(html: string, options: IngestionOptions = {}): Promise<IngestionResult> {\n if (!isInitialized) {\n initialize();\n }\n\n // 1. 
Generate Structural Markdown using the custom plugin\n const structuralMd = htmlToMarkdown(html, {\n hooks: [structuralPlugin]\n });\n\n let metadata: Record<string, string> = {};\n let semantic = htmlToMarkdown(html, withMinimalPreset({\n isolateMain: false,\n plugins: {\n tagOverrides: {\n a: { enter: '', exit: '' },\n img: { enter: '', exit: '' },\n svg: { enter: '', exit: '' },\n canvas: { enter: '', exit: '' },\n },\n frontmatter: {\n onExtract: (fm) => {\n metadata = fm;\n }\n }\n }\n })).trim();\n\n // 2.1 Fallback metadata if not found in head\n if (!metadata.title) {\n const h1Match = semantic.match(/^# (.*)$/m);\n if (h1Match) {\n // Clean markdown from title\n metadata.title = h1Match[1].replace(/[#*`_]/g, '').trim();\n }\n }\n if (!metadata.description) {\n // Take first 150 chars of semantic (excluding title)\n const bodyOnly = semantic.replace(/^# .*$/m, '').trim();\n if (bodyOnly) {\n // Clean markdown from description\n const cleanBody = bodyOnly.replace(/[#*`_]/g, '').trim();\n metadata.description = cleanBody.slice(0, 150).replace(/\\n/g, ' ') + (cleanBody.length > 150 ? '...' : '');\n }\n }\n\n // 2.2 Apply token cap\n const cap = options.semanticTokenCap ?? globalConfig.defaultSemanticTokenCap ?? DEFAULTS.SEMANTIC_TOKEN_CAP;\n const { truncated, isTruncated } = truncateToCap(semantic, cap);\n semantic = truncated;\n\n let hasSensitiveInfo = false;\n\n // 3. Redaction Pipeline\n if (options.redact) {\n const redactorOptions = typeof options.redact === 'object' ? options.redact : {};\n const redactor = new Redactor(redactorOptions);\n const result = await redactor.process(semantic);\n semantic = result.redacted;\n hasSensitiveInfo = result.hasSensitiveInfo;\n }\n\n return {\n structural: mapToTokens(structuralMd),\n semantic,\n hasSensitiveInfo,\n metadata,\n isTruncated\n };\n}\n\nexport { structuralPlugin as plugin } from './config.js';\nexport { mapToTokens } from './token-mapper.js';\nexport { Redactor, type RedactorOptions };\n"]}
package/package.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "name": "@alete-ai/gate-ingest",
3
+ "version": "0.1.0",
4
+ "description": "Unified ingestion and token-mapping pipeline for the Alete PrivacyGatekeeper.",
5
+ "keywords": [
6
+ "edge",
7
+ "crawler",
8
+ "scraper",
9
+ "firecrawl",
10
+ "crawl4ai",
11
+ "analysis"
12
+ ],
13
+ "type": "module",
14
+ "main": "./dist/index.js",
15
+ "module": "./dist/index.js",
16
+ "types": "./dist/index.d.ts",
17
+ "browser": "./dist/index.browser.js",
18
+ "exports": {
19
+ ".": {
20
+ "types": "./dist/index.d.ts",
21
+ "browser": "./dist/index.browser.js",
22
+ "import": "./dist/index.js",
23
+ "require": "./dist/index.cjs"
24
+ }
25
+ },
26
+ "dependencies": {
27
+ "@mdream/js": "^1.1.1",
28
+ "openredaction": "^1.1.2"
29
+ },
30
+ "devDependencies": {
31
+ "tsup": "^8.0.0",
32
+ "typescript": "^5.0.0"
33
+ },
34
+ "scripts": {
35
+ "build": "tsup",
36
+ "dev": "tsup --watch",
37
+ "test": "vitest run"
38
+ }
39
+ }
package/src/config.ts ADDED
@@ -0,0 +1,60 @@
1
+ import { createPlugin } from '@mdream/js/plugins';
2
+
3
+ /**
4
+ * Custom mdream plugin to retain structural artifacts (forms, inputs, selects, options, buttons, labels, navs and links).
5
+ * This preserves the "structural footprint" of sensitive portals
6
+ * by injecting intermediate markers with attribute context: onNodeEnter returns the opening marker for a tag, onNodeExit the closing one, and both return undefined for every other tag.
7
+ */
8
+ export const structuralPlugin = createPlugin({
9
+ onNodeEnter(element) {
10
+ const tag = element.name;
11
+ const attrs = element.attributes || {}; // fall back to an empty object so the attribute reads below never throw
12
+
13
+ switch (tag) {
14
+ case 'form':
15
+ return '\n[FORM_START]\n'; // paired with '\n[FORM_END]\n' in onNodeExit
16
+ case 'input': { // emitted as one complete marker; onNodeExit has no 'input' branch
17
+ const type = attrs.type || 'text'; // a missing or empty type is reported as 'text'
18
+ const name = attrs.name || attrs.id || ''; // prefer name, then id, then an empty string
19
+ const placeholder = attrs.placeholder || '';
20
+ return `[INPUT:${type}:${name}:${placeholder}]`; // attribute values are interpolated as-is, without escaping ':' or ']'
21
+ }
22
+ case 'select':
23
+ return '[SELECT_START]'; // paired with '[SELECT_END]' in onNodeExit
24
+ case 'option':
25
+ return ' (OPTION:'; // closed by ') ' in onNodeExit
26
+ case 'button':
27
+ return '[BUTTON:'; // closed by ']' in onNodeExit
28
+ case 'label':
29
+ return 'LABEL['; // closed by ']' in onNodeExit
30
+ case 'nav':
31
+ return '\n[NAV_START]\n'; // paired with '\n[NAV_END]\n' in onNodeExit
32
+ case 'a': { // opening marker only; onNodeExit has no 'a' branch
33
+ const href = attrs.href || '#'; // a missing or empty href is reported as '#'
34
+ return `[LINK:${href}]`;
35
+ }
36
+ default:
37
+ return undefined; // no marker for other tags; presumably mdream then applies its default handling (TODO confirm)
38
+ }
39
+ },
40
+
41
+ onNodeExit(element) {
42
+ const tag = element.name;
43
+ switch (tag) {
44
+ case 'form':
45
+ return '\n[FORM_END]\n';
46
+ case 'select':
47
+ return '[SELECT_END]';
48
+ case 'option':
49
+ return ') '; // closes the ' (OPTION:' marker opened in onNodeEnter
50
+ case 'button':
51
+ return ']'; // closes '[BUTTON:'
52
+ case 'label':
53
+ return ']'; // closes 'LABEL['
54
+ case 'nav':
55
+ return '\n[NAV_END]\n';
56
+ default:
57
+ return undefined; // 'input' and 'a' land here: their markers were fully emitted on enter
58
+ }
59
+ }
60
+ });
@@ -0,0 +1,100 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { processHtml } from './index.js';
3
+
4
+ describe('gate-ingest pipeline', () => { // exercises processHtml(): structural tokens, semantic markdown, metadata and the token cap
5
+ it('processes a login form into structural tokens', async () => {
6
+ const html = `
7
+ <html>
8
+ <body>
9
+ <nav>
10
+ <a href="/home">Home</a>
11
+ </nav>
12
+ <form action="/login" method="POST">
13
+ <label for="username">Username:</label>
14
+ <input type="text" name="username" id="username" placeholder="Enter username">
15
+ <label for="password">Password:</label>
16
+ <input type="password" name="password" id="password">
17
+ <button type="submit">Login</button>
18
+ </form>
19
+ </body>
20
+ </html>
21
+ `;
22
+
23
+ const { structural, semantic } = await processHtml(html);
24
+
25
+ // Check structural tokens
26
+ expect(structural).toContain('structNavStart');
27
+ expect(structural).toContain('structLinkElement');
28
+ expect(structural).toContain('structNavEnd');
29
+ expect(structural).toContain('structFormStart');
30
+ expect(structural).toContain('structLabel Username');
31
+ expect(structural).toContain('structInputTextusername');
32
+ expect(structural).toContain('structLabel Password');
33
+ expect(structural).toContain('structButtonLogin');
34
+ expect(structural).toContain('structFormEnd');
35
+
36
+ // Check semantic markdown - the semantic pass runs without the structural plugin,
37
+ // so neither the raw plugin marker nor its mapped token may leak into the semantic output.
38
+ expect(semantic).not.toMatch(/\[FORM_START\]|structFormStart/);
39
+ });
40
+
41
+ it('processes a news article into clean markdown', async () => {
42
+ const html = `
43
+ <html>
44
+ <body>
45
+ <h1>Breaking News</h1>
46
+ <p>This is a <strong>major</strong> story about privacy.</p>
47
+ <div class="sidebar">
48
+ <a href="/related">Related article</a>
49
+ </div>
50
+ </body>
51
+ </html>
52
+ `;
53
+
54
+ const { structural, semantic } = await processHtml(html);
55
+
56
+ // Check structural tokens
57
+ expect(structural).toContain('sysHeader1 BreakingNews'); // heading text is reduced to alphanumerics by the token mapper
58
+
59
+ // Check semantic markdown
60
+ expect(semantic).toContain('# Breaking News');
61
+ expect(semantic).toContain('This is a **major** story about privacy.');
62
+ });
63
+
64
+ it('extracts page metadata', async () => {
65
+ const html = `
66
+ <html>
67
+ <head>
68
+ <title>Privacy Policy</title>
69
+ <meta name="description" content="Our commitment to your data.">
70
+ </head>
71
+ <body>
72
+ <p>We respect your privacy.</p>
73
+ </body>
74
+ </html>
75
+ `;
76
+
77
+ const { metadata } = await processHtml(html);
78
+
79
+ expect(metadata).toBeDefined();
80
+ expect(metadata?.title).toBe('Privacy Policy');
81
+ expect(metadata?.description).toBe('Our commitment to your data.');
82
+ });
83
+
84
+ it('respects the semantic token cap', async () => {
85
+ // Generate a long text with varied content
86
+ const words = ['apple', 'banana', 'cherry', 'date', 'elderberry', 'fig', 'grape', 'honeydew'];
87
+ const longText = Array(100).fill(0).map((_, i) => words[i % words.length]).join(' ');
88
+ const html = `<html><body><main><p>${longText}</p></main></body></html>`;
89
+
90
+ // 1.33 * 100 = 133 tokens. Cap at 50 tokens. (assumes truncateToCap estimates ~1.33 tokens per word - TODO confirm)
91
+ const { semantic, isTruncated } = await processHtml(html, { semanticTokenCap: 50 });
92
+
93
+ expect(isTruncated).toBe(true);
94
+ expect(semantic).toContain('... [Content truncated due to token cap]');
95
+ // 50 / 1.33 = 37.5 words
96
+ const wordCount = semantic.split('...')[0].trim().split(/\s+/).length; // count only the words before the truncation notice
97
+ expect(wordCount).toBeLessThanOrEqual(40);
98
+ });
99
+ });
100
+