npm - @alete-ai/gate-ingest - Versions diffs - 0.1.0 - Mend

@alete-ai/gate-ingest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/LICENSE +622 -0
package/dist/chunk-G7HIHPC6.js +2 -0
package/dist/chunk-G7HIHPC6.js.map +1 -0
package/dist/empty-shim-IFLK4AY7.js +2 -0
package/dist/empty-shim-IFLK4AY7.js.map +1 -0
package/dist/index.browser.d.ts +106 -0
package/dist/index.browser.js +416 -0
package/dist/index.browser.js.map +1 -0
package/dist/index.cjs +12 -0
package/dist/index.cjs.map +1 -0
package/dist/index.d.cts +106 -0
package/dist/index.d.ts +106 -0
package/dist/index.js +12 -0
package/dist/index.js.map +1 -0
package/package.json +39 -0
package/src/config.ts +60 -0
package/src/index.test.ts +100 -0
package/src/index.ts +183 -0
package/src/platform/empty-shim.ts +34 -0
package/src/sanitization/Redactor.ts +95 -0
package/src/token-mapper.test.ts +32 -0
package/src/token-mapper.ts +78 -0
package/tsconfig.json +14 -0
package/tsup.config.ts +48 -0

package/src/index.ts ADDED Viewed

@@ -0,0 +1,183 @@
+import { htmlToMarkdown, withMinimalPreset } from '@mdream/js';
+import { structuralPlugin } from './config.js';
+import { mapToTokens } from './token-mapper.js';
+import { Redactor, type RedactorOptions } from './sanitization/Redactor.js';
+/**
+ * String-valued labels exported by this module.
+ * NOTE(review): no code visible in this file reads these values; presumably they
+ * are the classes a downstream classifier assigns to a page — confirm against the consumer.
+ */
+export enum GateLabel {
+  SENSITIVE_PORTAL = 'sensitive_portal',
+  DIGESTIBLE_ARTICLE = 'digestible_article',
+  NOISE = 'noise',
+  UNKNOWN = 'unknown',
+}
+export interface IngestionResult {
+  /**
+   * Alphanumeric tokenized text for Apple MaxEnt classifier.
+   * High structural fidelity, low natural language noise.
+   */
+  structural: string;
+  /**
+   * Clean, readable Markdown for LLM analysis.
+   * Low structural noise, high semantic fidelity.
+   */
+  semantic: string;
+  /**
+   * Whether the semantic content contains sensitive PII that was redacted.
+   */
+  hasSensitiveInfo?: boolean;
+  /**
+   * Extracted metadata from the HTML (title, description, author, etc.)
+   */
+  metadata?: Record<string, string>;
+  /**
+   * Whether the semantic content was truncated due to the token cap.
+   */
+  isTruncated?: boolean;
+}
+export interface IngestionOptions {
+  /**
+   * Redaction configuration. If true, uses default settings.
+   */
+  redact?: RedactorOptions | boolean;
+  /**
+   * Override the semantic token cap for this specific call.
+   */
+  semanticTokenCap?: number;
+}
+export interface GlobalConfig {
+  /**
+   * The default token cap for semantic returns. Defaults to 15,000.
+   */
+  defaultSemanticTokenCap?: number;
+}
+// Built-in fallback values, used when neither the caller nor initialize() supplies one.
+const DEFAULTS = {
+  SEMANTIC_TOKEN_CAP: 15000,
+};
+// Module-wide configuration; initialize() replaces it with a merged copy.
+let globalConfig: GlobalConfig = {
+  defaultSemanticTokenCap: DEFAULTS.SEMANTIC_TOKEN_CAP,
+};
+// Set to true by initialize(); processHtml() checks it to initialize lazily on first use.
+let isInitialized = false;
+/**
+ * Applies global configuration for the Alete Gate ingestion substrate.
+ *
+ * Keys present in `config` override the current global values; keys that are
+ * absent keep their existing value. A banner reporting the effective semantic
+ * cap is logged, and the module is marked as initialized.
+ *
+ * @param config - Overrides for the global configuration. Defaults to no overrides.
+ */
+export function initialize(config: GlobalConfig = {}): void {
+  const merged: GlobalConfig = Object.assign({}, globalConfig, config);
+  globalConfig = merged;
+  console.log(`🛡️ Alete Gate: Ingestion substrate initialized. Semantic cap: ${merged.defaultSemanticTokenCap} tokens. Explore our ecosystem at https://alete.ai/`);
+  isInitialized = true;
+}
+/**
+ * Estimates the number of tokens in a text string.
+ * Uses the 1.33x multiplier (tokens per word) for a safe estimation.
+ *
+ * @param text - Text to measure; words are runs of non-whitespace characters.
+ * @returns The word count multiplied by 1.33 and rounded up, or 0 when the
+ *          text is empty or contains only whitespace.
+ */
+function estimateTokens(text: string): number {
+  const trimmed = text.trim();
+  // Splitting an empty string yields [''], which would count as one word.
+  if (trimmed === '') {
+    return 0;
+  }
+  const wordCount = trimmed.split(/\s+/).length;
+  return Math.ceil(wordCount * 1.33);
+}
+/**
+ * Truncates text to fit within a token cap.
+ *
+ * The text is cut after the last word that fits the cap, keeping the original
+ * whitespace (line breaks, indentation) between the retained words so that
+ * Markdown structure survives truncation. A notice is appended to cut text.
+ *
+ * @param text - Text to truncate.
+ * @param cap - Maximum estimated token count; values below zero keep no words.
+ * @returns The (possibly shortened) text and whether truncation happened.
+ */
+function truncateToCap(text: string, cap: number): { truncated: string; isTruncated: boolean } {
+  const tokens = estimateTokens(text);
+  if (tokens <= cap) {
+    return { truncated: text, isTruncated: false };
+  }
+  // Calculate approximate words to keep; clamp so a negative cap cannot turn
+  // into a negative slice index that keeps most of the text.
+  const wordsToKeep = Math.max(0, Math.floor(cap / 1.33));
+  const trimmed = text.trim();
+  // Walk word by word to find the offset just past the last retained word.
+  const wordPattern = /\S+/g;
+  let end = 0;
+  for (let kept = 0; kept < wordsToKeep; kept++) {
+    const match = wordPattern.exec(trimmed);
+    if (match === null) {
+      break;
+    }
+    end = match.index + match[0].length;
+  }
+  const truncated = trimmed.slice(0, end) + '\n\n... [Content truncated due to token cap]';
+  return { truncated, isTruncated: true };
+}
+/**
+ * The unified ingestion pipeline for Alete Gate.
+ * Converts raw HTML into both structural tokens and semantic Markdown.
+ *
+ * Initializes the module with default configuration on first use if
+ * `initialize()` has not been called yet.
+ *
+ * @param html - Raw HTML to ingest.
+ * @param options - Per-call options: redaction settings and a token-cap override.
+ * @returns Structural tokens, semantic Markdown (capped and, if requested,
+ *          redacted), extracted metadata, and the truncation/sensitivity flags.
+ */
+export async function processHtml(html: string, options: IngestionOptions = {}): Promise<IngestionResult> {
+  if (!isInitialized) {
+    initialize();
+  }
+  // 1. Generate Structural Markdown using the custom plugin
+  const structuralMd = htmlToMarkdown(html, {
+    hooks: [structuralPlugin]
+  });
+  // 2. Generate Semantic Markdown; links, images, svg and canvas are emitted as empty strings
+  let metadata: Record<string, string> = {};
+  let semantic = htmlToMarkdown(html, withMinimalPreset({
+    isolateMain: false,
+    plugins: {
+      tagOverrides: {
+        a: { enter: '', exit: '' },
+        img: { enter: '', exit: '' },
+        svg: { enter: '', exit: '' },
+        canvas: { enter: '', exit: '' },
+      },
+      frontmatter: {
+        onExtract: (fm) => {
+          metadata = fm;
+        }
+      }
+    }
+  })).trim();
+  // 2.1 Fallback metadata if not found in head.
+  // Fields derived from the body are remembered so they can be redacted below:
+  // they are copies of semantic text and must not bypass the redaction step.
+  const bodyDerivedKeys: string[] = [];
+  if (!metadata.title) {
+    const h1Match = semantic.match(/^# (.*)$/m);
+    if (h1Match) {
+      // Clean markdown from title
+      metadata.title = h1Match[1].replace(/[#*`_]/g, '').trim();
+      bodyDerivedKeys.push('title');
+    }
+  }
+  if (!metadata.description) {
+    // Take first 150 chars of semantic (excluding title)
+    const bodyOnly = semantic.replace(/^# .*$/m, '').trim();
+    if (bodyOnly) {
+      // Clean markdown from description
+      const cleanBody = bodyOnly.replace(/[#*`_]/g, '').trim();
+      metadata.description = cleanBody.slice(0, 150).replace(/\n/g, ' ') + (cleanBody.length > 150 ? '...' : '');
+      bodyDerivedKeys.push('description');
+    }
+  }
+  // 2.2 Apply token cap (per-call override, then global config, then built-in default)
+  const cap = options.semanticTokenCap ?? globalConfig.defaultSemanticTokenCap ?? DEFAULTS.SEMANTIC_TOKEN_CAP;
+  const { truncated, isTruncated } = truncateToCap(semantic, cap);
+  semantic = truncated;
+  let hasSensitiveInfo = false;
+  // 3. Redaction Pipeline
+  if (options.redact) {
+    // `redact: true` means default redactor settings
+    const redactorOptions = typeof options.redact === 'object' ? options.redact : {};
+    const redactor = new Redactor(redactorOptions);
+    const result = await redactor.process(semantic);
+    semantic = result.redacted;
+    // The flag describes the semantic content only, as documented on IngestionResult.
+    hasSensitiveInfo = result.hasSensitiveInfo;
+    // Redact the metadata fields that were copied from the unredacted body.
+    for (const key of bodyDerivedKeys) {
+      metadata[key] = await redactor.redact(metadata[key]);
+    }
+  }
+  return {
+    structural: mapToTokens(structuralMd),
+    semantic,
+    hasSensitiveInfo,
+    metadata,
+    isTruncated
+  };
+}
+export { structuralPlugin as plugin } from './config.js';
+export { mapToTokens } from './token-mapper.js';
+export { Redactor, type RedactorOptions };

package/src/platform/empty-shim.ts ADDED Viewed

@@ -0,0 +1,34 @@
+// Common Node.js shims for browser-side bundling.
+// Each export is a stand-in with a fixed, inert result, so code that imports
+// Node built-ins can be bundled for the browser.
+export const parseHTML = () => ({ document: null });
+export const createHash = () => ({ update: () => ({ digest: () => '' }) });
+export const Worker = class {};
+export const cpus = () => [];
+export const Readable = class {};
+// Path shims ('/'-separated paths only)
+export const join = (...args: string[]) => args.join('/');
+// Like join: segments are concatenated, not normalized or made absolute.
+export const resolve = (...args: string[]) => args.join('/');
+// A path directly under the root ('/x') has the root as its directory, not '.'.
+export const dirname = (p: string) =>
+  p.split('/').slice(0, -1).join('/') || (p.startsWith('/') ? '/' : '.');
+export const basename = (p: string) => p.split('/').pop() || '';
+// Only the last path segment is inspected, so dots in directory names are
+// ignored; a leading dot (dotfile) is not treated as an extension.
+export const extname = (p: string) => {
+  const base = basename(p);
+  const dotIndex = base.lastIndexOf('.');
+  return dotIndex > 0 ? base.slice(dotIndex) : '';
+};
+// FS shims
+export const readFileSync = () => '';
+export const existsSync = () => false;
+export const promises = {
+  readFile: async () => '',
+  writeFile: async () => {},
+};
+// Default export mirrors the named shims for `import x from '...'` consumers.
+export default {
+  join, resolve, dirname, basename, extname,
+  readFileSync, existsSync,
+  promises,
+  createHash,
+  Worker,
+  cpus,
+  Readable
+};

package/src/sanitization/Redactor.ts ADDED Viewed

@@ -0,0 +1,95 @@
+import { OpenRedaction, type OpenRedactionOptions, getPatternsByCategory } from 'openredaction';
+/**
+ * Category switches for the Redactor. Every `redact*` flag is treated as
+ * enabled unless it is explicitly set to `false`.
+ */
+export interface RedactorOptions {
+  /** Email, phone (US / UK / international) and 'government' category patterns. */
+  redactPii?: boolean;
+  /** Patterns from the engine's 'financial' category. */
+  redactFinancials?: boolean;
+  /** Patterns from the engine's 'credentials' category. */
+  redactCredentials?: boolean;
+  /** Patterns from the engine's 'network' category. */
+  redactInfrastructure?: boolean;
+  /** Patterns from the engine's 'healthcare' category. */
+  redactMedical?: boolean;
+  /**
+   * NOTE(review): not read by the Redactor constructor in this file — confirm
+   * whether it is meant to be forwarded to the engine.
+   */
+  customPlaceholders?: Record<string, string>;
+}
+/**
+ * Redaction facade over the OpenRedaction engine.
+ *
+ * Follows a "Narrative-First" approach: name and address detection are switched
+ * off so the text stays readable, while contact details, government IDs,
+ * financial data, credentials, healthcare and network identifiers are replaced
+ * with placeholders.
+ */
+export class Redactor {
+  private engine: OpenRedaction;
+  /**
+   * Builds the engine with the pattern types of every enabled category.
+   * A category is enabled unless its option is explicitly `false`.
+   */
+  constructor(options: RedactorOptions = {}) {
+    const isEnabled = (flag: boolean | undefined): boolean => flag !== false;
+    const typesOf = (defs: ReadonlyArray<{ type: string }>): string[] => defs.map(def => def.type);
+    const selected: string[] = [];
+    // Personal & contact data (names, addresses and dates are deliberately left out)
+    if (isEnabled(options.redactPii)) {
+      selected.push(
+        'EMAIL',
+        'PHONE_US',
+        'PHONE_UK',
+        'PHONE_INTERNATIONAL',
+        ...typesOf(getPatternsByCategory('government')),
+      );
+    }
+    // Financial data
+    if (isEnabled(options.redactFinancials)) {
+      selected.push(...typesOf(getPatternsByCategory('financial')));
+    }
+    // Credentials
+    if (isEnabled(options.redactCredentials)) {
+      selected.push(...typesOf(getPatternsByCategory('credentials')));
+    }
+    // Medical data
+    if (isEnabled(options.redactMedical)) {
+      selected.push(...typesOf(getPatternsByCategory('healthcare')));
+    }
+    // Infrastructure data
+    if (isEnabled(options.redactInfrastructure)) {
+      selected.push(...typesOf(getPatternsByCategory('network')));
+    }
+    const engineOptions: OpenRedactionOptions = {
+      patterns: selected,
+      // Name/address entities are disabled explicitly as well as being absent from the pattern list
+      includeNames: false,
+      includeAddresses: false,
+      redactionMode: 'placeholder',
+      enableContextAnalysis: true,
+      enableFalsePositiveFilter: true,
+      falsePositiveThreshold: 0.7,
+      deterministic: true,
+    };
+    this.engine = new OpenRedaction(engineOptions);
+  }
+  /**
+   * Returns the text with detected sensitive information redacted.
+   * Empty input is returned as-is without invoking the engine.
+   */
+  public async redact(text: string): Promise<string> {
+    if (!text) return text;
+    const { redacted } = await this.engine.detect(text);
+    return redacted;
+  }
+  /**
+   * Reports whether the engine detects anything in the text; the text itself is not returned.
+   * Empty input yields `false` without invoking the engine.
+   */
+  public async hasSensitiveInfo(text: string): Promise<boolean> {
+    if (!text) return false;
+    const { detections } = await this.engine.detect(text);
+    return detections.length > 0;
+  }
+  /**
+   * Runs one detection pass and returns both the redacted text and whether
+   * anything was detected.
+   */
+  public async process(text: string): Promise<{ redacted: string, hasSensitiveInfo: boolean }> {
+    if (!text) return { redacted: text, hasSensitiveInfo: false };
+    const { redacted, detections } = await this.engine.detect(text);
+    return { redacted, hasSensitiveInfo: detections.length > 0 };
+  }
+}

package/src/token-mapper.test.ts ADDED Viewed

@@ -0,0 +1,32 @@
+import { mapToTokens } from './token-mapper'
+import { describe, expect, it } from 'vitest'
+// Behavioural checks for mapToTokens: marker mapping, header mapping and noise stripping.
+describe('token-mapper', () => {
+  it('should map [FORM_START] to structFormStart', () => {
+    expect(mapToTokens('\n[FORM_START]\n')).toContain('structFormStart')
+  })
+  it('should map [LINK:url] to structLinkElement', () => {
+    expect(mapToTokens('[LINK:https://example.com]Click me')).toContain('structLinkElement')
+  })
+  it('should map [INPUT:type:name:placeholder] to structInputType', () => {
+    // The token is built from the capitalized type followed by the cleaned name.
+    expect(mapToTokens('[INPUT:text:username:Enter name]')).toContain('structInputTextusername')
+  })
+  it('should map headers to sysHeader', () => {
+    const tokens = mapToTokens('# Main Title\n## Sub Title')
+    for (const expected of ['sysHeader1 MainTitle', 'sysHeader2 SubTitle']) {
+      expect(tokens).toContain(expected)
+    }
+  })
+  it('should strip natural language noise', () => {
+    // Only the struct-prefixed token survives; lowercase words are dropped.
+    const tokens = mapToTokens('this is some normal text structLinkElement and more text')
+    expect(tokens).toBe('structLinkElement')
+  })
+})

package/src/token-mapper.ts ADDED Viewed

@@ -0,0 +1,78 @@
+/**
+ * Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.
+ * This prevents Apple's NLTokenizer from stripping critical punctuation.
+ * We use camelCase because NLTokenizer splits snake_case (STRUCT_FORM_START -> STRUCT, FORM, START).
+ *
+ * @param text - Structural Markdown, optionally containing the bracket markers
+ *               ([FORM_START], [INPUT:...], [LINK:...], [BUTTON:...], ...).
+ * @returns Space-separated tokens: `struct*` / `sys*` tokens plus capitalized
+ *          words longer than two characters; everything else is dropped.
+ */
+export function mapToTokens(text: string): string {
+  // Keeps ASCII letters and digits only and caps the length.
+  const alnum = (value: string, maxLength: number): string =>
+    value.replace(/[^a-zA-Z0-9]/g, '').slice(0, maxLength);
+  // 1. Process explicit markers from structuralPlugin
+  // We handle potential escaping from mdream
+  let processed = text
+    .replace(/\\?\[FORM_START\\?\]/g, 'structFormStart')
+    .replace(/\\?\[FORM_END\\?\]/g, 'structFormEnd')
+    .replace(/\\?\[SELECT_START\\?\]/g, 'structSelectStart')
+    .replace(/\\?\[SELECT_END\\?\]/g, 'structSelectEnd')
+    .replace(/\\?\[NAV_START\\?\]/g, 'structNavStart')
+    .replace(/\\?\[NAV_END\\?\]/g, 'structNavEnd');
+  // 1.1 Process Label marker
+  processed = processed.replace(/LABEL\\?\[/g, 'structLabel ');
+  // 2. Process attribute-based markers
+  processed = processed
+    // Inputs: [INPUT:type:name:placeholder] -> structInput{Type}{name} (placeholder is discarded)
+    .replace(/\\?\[INPUT:([^:]+):([^:]*):([^\\\]]*)\\?\]/g, (_, type: string, name: string) => {
+      return `structInput${type.charAt(0).toUpperCase() + type.slice(1)}${alnum(name, 20)}`;
+    })
+    // Links: [LINK:url] -> structLinkElement
+    .replace(/\\?\[LINK:[^\\\]]+\\?\]/g, () => 'structLinkElement')
+    // Buttons: [BUTTON:text] -> structButton{text}
+    .replace(/\\?\[BUTTON:([^\\\]]+)\\?\]/g, (_, label: string) => `structButton${alnum(label, 20)}`);
+  // 3. Process Standard Markdown artifacts (if any remain) into clean tokens.
+  // Images must be handled before links: the link pattern also matches the
+  // "[alt](url)" tail of "![alt](url)" and would leave a stray "!" behind.
+  processed = processed
+    // Images: ![alt](url) -> structImage{alt}
+    .replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_, alt: string) => `structImage${alnum(alt, 20)}`)
+    // Links: [text](url) -> structLinkElement{text}
+    .replace(/\[([^\]]*)\]\(([^)]+)\)/g, (_, content: string) => `structLinkElement${alnum(content, 20)}`);
+  // 4. Process level 1-3 headers into clean tokens: "## Title" -> "sysHeader2 Title"
+  processed = processed.replace(/^(#{1,3}) (.*$)/gm, (_, hashes: string, content: string) => {
+    const clean = alnum(content.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'), 30);
+    return `sysHeader${hashes.length} ${clean}`;
+  });
+  // 5. Final cleaning: remove URLs and punctuation (including colons now)
+  processed = processed
+    .replace(/https?:\/\/[^\s]+/g, '') // Remove URLs
+    .replace(/[#*`_\[\]():]/g, ' '); // Remove remaining markdown chars + colons
+  // 6. Aggressively strip remaining natural language noise
+  return processed
+    .split(/\s+/)
+    .filter(word => {
+      return word.startsWith('struct') ||
+             word.startsWith('sys') ||
+             (/^[A-Z]/.test(word) && word.length > 2); // Keep capitalized words (titles, labels) > 2 chars
+    })
+    .join(' ')
+    .trim();
+}

package/tsconfig.json ADDED Viewed

@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ESNext",
+    "module": "NodeNext",
+    "moduleResolution": "NodeNext",
+    "declaration": true,
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "outDir": "dist"
+  },
+  "include": ["src/**/*"]
+}

package/tsup.config.ts ADDED Viewed

@@ -0,0 +1,48 @@
+import { defineConfig } from 'tsup';
+import path from 'path';
+import { fileURLToPath } from 'url';
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+// Output settings common to both builds.
+const sharedOutput = {
+  dts: true,
+  sourcemap: true,
+  minify: true,
+  treeshake: true,
+};
+// Node built-ins that the browser bundle redirects to the empty shim.
+const shimmedBuiltins = ['fs', 'path', 'url', 'crypto', 'os', 'stream', 'worker_threads', 'fs/promises'];
+export default defineConfig([
+  // Node build: ESM + CJS, dependencies left external.
+  {
+    ...sharedOutput,
+    entry: ['src/index.ts'],
+    format: ['esm', 'cjs'],
+    clean: true,
+    platform: 'node',
+    external: ['@mdream/js', 'openredaction'],
+  },
+  // Browser build: single ESM bundle with dependencies inlined.
+  {
+    ...sharedOutput,
+    entry: {
+      'index.browser': 'src/index.ts'
+    },
+    format: ['esm'],
+    // The first build already cleaned the output directory
+    clean: false,
+    platform: 'browser',
+    noExternal: ['@mdream/js', 'openredaction'],
+    define: {
+      'process.versions.node': 'undefined',
+      'process.platform': '"browser"',
+    },
+    esbuildOptions(options) {
+      const shimPath = path.resolve(__dirname, 'src/platform/empty-shim.ts');
+      options.alias = Object.fromEntries(shimmedBuiltins.map(name => [name, shimPath]));
+    }
+  }
+]);