@alete-ai/gate-ingest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,183 @@
1
+ import { htmlToMarkdown, withMinimalPreset } from '@mdream/js';
2
+ import { structuralPlugin } from './config.js';
3
+ import { mapToTokens } from './token-mapper.js';
4
+ import { Redactor, type RedactorOptions } from './sanitization/Redactor.js';
5
+
6
/**
 * String-valued labels exported by the package.
 * Each member's runtime value is the lower-case snake_case form of its name.
 * NOTE(review): nothing in this file assigns these labels — presumably they
 * are consumed by a downstream classifier; confirm against callers.
 */
export enum GateLabel {
  SENSITIVE_PORTAL = 'sensitive_portal',
  DIGESTIBLE_ARTICLE = 'digestible_article',
  NOISE = 'noise',
  UNKNOWN = 'unknown',
}
12
+
13
/**
 * Output of the ingestion pipeline: two renderings of the same HTML plus
 * bookkeeping about what happened to the semantic rendering.
 */
export interface IngestionResult {
  /**
   * Alphanumeric token stream intended for the Apple MaxEnt classifier.
   * Keeps page structure, drops most natural-language text.
   */
  structural: string;

  /**
   * Readable Markdown intended for LLM analysis.
   * Keeps the meaning of the page, drops most structural noise.
   */
  semantic: string;

  /**
   * True when redaction ran on the semantic content and found sensitive PII.
   */
  hasSensitiveInfo?: boolean;

  /**
   * Metadata extracted from the HTML (title, description, author, etc.).
   */
  metadata?: Record<string, string>;

  /**
   * True when the semantic content was cut short to respect the token cap.
   */
  isTruncated?: boolean;
}
41
+
42
/**
 * Per-call options accepted by the ingestion pipeline.
 */
export interface IngestionOptions {
  /**
   * Turns redaction on. Pass `true` for the default redactor settings, or a
   * `RedactorOptions` object to configure it. Omitted or falsy: no redaction.
   */
  redact?: RedactorOptions | boolean;

  /**
   * Token cap for the semantic output of this call only; takes precedence
   * over the globally configured default.
   */
  semanticTokenCap?: number;
}
53
+
54
/**
 * Process-wide settings for the ingestion pipeline.
 */
export interface GlobalConfig {
  /**
   * Token cap applied to semantic output when a call does not supply its own.
   * Defaults to 15,000.
   */
  defaultSemanticTokenCap?: number;
}
60
+
61
/** Built-in fallbacks used when neither the caller nor `initialize` sets a value. */
const DEFAULTS = {
  SEMANTIC_TOKEN_CAP: 15000,
};

/** Module-wide configuration; replaced (never mutated in place) by `initialize`. */
let globalConfig: GlobalConfig = {
  defaultSemanticTokenCap: DEFAULTS.SEMANTIC_TOKEN_CAP,
};

/** Set once `initialize` has run, whether explicitly or lazily. */
let isInitialized = false;

/**
 * Initializes the Alete Gate ingestion substrate with global configuration.
 *
 * Settings are merged over the current configuration, so repeated calls
 * accumulate. Keys whose value is `undefined` are ignored rather than
 * clobbering the value already in place (previously
 * `initialize({ defaultSemanticTokenCap: undefined })` erased the default and
 * logged "undefined tokens").
 *
 * @param config Partial configuration to apply; omitted keys keep their current value.
 */
export function initialize(config: GlobalConfig = {}): void {
  // Drop explicitly-undefined entries so they cannot overwrite real values.
  const provided = Object.fromEntries(
    Object.entries(config).filter(([, value]) => value !== undefined),
  );
  globalConfig = { ...globalConfig, ...provided };
  console.log(`🛡️ Alete Gate: Ingestion substrate initialized. Semantic cap: ${globalConfig.defaultSemanticTokenCap} tokens. Explore our ecosystem at https://alete.ai/`);
  isInitialized = true;
}
79
+
80
/** Estimated tokens per whitespace-delimited word (deliberately on the high side). */
const TOKENS_PER_WORD = 1.33;

/**
 * Estimates the number of tokens in a text string.
 * Counts whitespace-delimited words and multiplies by 1.33 (tokens per word),
 * rounding up.
 *
 * @param text Text to measure.
 * @returns The estimated token count; 0 for empty or whitespace-only text.
 */
function estimateTokens(text: string): number {
  // match() counts only real words; split() on an empty string would yield
  // [''] and report one phantom word.
  const words = text.match(/\S+/g);
  return words === null ? 0 : Math.ceil(words.length * TOKENS_PER_WORD);
}

/**
 * Truncates text to fit within a token cap.
 *
 * When the estimate is within the cap the text is returned untouched.
 * Otherwise the text is trimmed, cut after the last word that fits, and a
 * truncation notice is appended. Whitespace between the kept words (including
 * newlines) is preserved so Markdown structure survives the cut.
 *
 * @param text Text to cap.
 * @param cap  Maximum estimated tokens; values at or below zero keep no words.
 * @returns The (possibly shortened) text and whether it was shortened.
 */
function truncateToCap(text: string, cap: number): { truncated: string; isTruncated: boolean } {
  if (estimateTokens(text) <= cap) {
    return { truncated: text, isTruncated: false };
  }

  // Clamp at zero: a negative count passed to slice() would otherwise mean
  // "drop the last k words" and keep almost everything.
  const wordsToKeep = Math.max(0, Math.floor(cap / TOKENS_PER_WORD));
  const trimmed = text.trim();

  // Walk the words to find where the last kept word ends in the original text.
  let cutAt = 0;
  if (wordsToKeep > 0) {
    const wordPattern = /\S+/g;
    let kept = 0;
    let match: RegExpExecArray | null;
    while (kept < wordsToKeep && (match = wordPattern.exec(trimmed)) !== null) {
      cutAt = match.index + match[0].length;
      kept += 1;
    }
  }

  const truncated = trimmed.slice(0, cutAt) + '\n\n... [Content truncated due to token cap]';
  return { truncated, isTruncated: true };
}
105
+
106
/**
 * The unified ingestion pipeline for Alete Gate.
 * Converts raw HTML into both structural tokens and semantic Markdown.
 *
 * Steps, in order:
 *  1. Render the HTML with the structural plugin and map it to tokens.
 *  2. Render the HTML with the minimal preset (links, images, svg and canvas
 *     suppressed) to get the semantic Markdown, capturing frontmatter metadata.
 *  3. Fill in a missing `title` / `description` from the Markdown body.
 *  4. Apply the token cap to the semantic Markdown.
 *  5. If `options.redact` is set, redact the semantic Markdown and any
 *     metadata values that were derived from the body in step 3.
 *
 * Lazily calls `initialize()` with defaults on first use.
 *
 * @param html    Raw HTML document or fragment.
 * @param options Per-call overrides (redaction, token cap).
 * @returns Structural tokens, semantic Markdown, metadata and status flags.
 */
export async function processHtml(html: string, options: IngestionOptions = {}): Promise<IngestionResult> {
  if (!isInitialized) {
    initialize();
  }

  // 1. Generate Structural Markdown using the custom plugin
  const structuralMd = htmlToMarkdown(html, {
    hooks: [structuralPlugin]
  });

  // 2. Generate Semantic Markdown; the frontmatter hook hands us head metadata.
  let extracted: Record<string, string> = {};
  let semantic = htmlToMarkdown(html, withMinimalPreset({
    isolateMain: false,
    plugins: {
      tagOverrides: {
        a: { enter: '', exit: '' },
        img: { enter: '', exit: '' },
        svg: { enter: '', exit: '' },
        canvas: { enter: '', exit: '' },
      },
      frontmatter: {
        onExtract: (fm) => {
          extracted = fm;
        }
      }
    }
  })).trim();

  // Work on a copy so the fallbacks below never mutate the object owned by
  // the converter (spreading also tolerates a null/undefined payload).
  const metadata: Record<string, string> = { ...extracted };

  // 2.1 Fallback metadata if not found in head. Remember which keys came from
  // the body: that text is subject to redaction, head metadata is not.
  const bodyDerivedKeys: string[] = [];
  if (!metadata.title) {
    const h1Match = semantic.match(/^# (.*)$/m);
    if (h1Match) {
      // Clean markdown from title
      metadata.title = (h1Match[1] ?? '').replace(/[#*`_]/g, '').trim();
      bodyDerivedKeys.push('title');
    }
  }
  if (!metadata.description) {
    // Take first 150 chars of semantic (excluding title)
    const bodyOnly = semantic.replace(/^# .*$/m, '').trim();
    if (bodyOnly) {
      // Clean markdown from description
      const cleanBody = bodyOnly.replace(/[#*`_]/g, '').trim();
      metadata.description = cleanBody.slice(0, 150).replace(/\n/g, ' ') + (cleanBody.length > 150 ? '...' : '');
      bodyDerivedKeys.push('description');
    }
  }

  // 2.2 Apply token cap
  const cap = options.semanticTokenCap ?? globalConfig.defaultSemanticTokenCap ?? DEFAULTS.SEMANTIC_TOKEN_CAP;
  const { truncated, isTruncated } = truncateToCap(semantic, cap);
  semantic = truncated;

  let hasSensitiveInfo = false;

  // 3. Redaction Pipeline
  if (options.redact) {
    const redactorOptions = typeof options.redact === 'object' ? options.redact : {};
    const redactor = new Redactor(redactorOptions);
    const result = await redactor.process(semantic);
    semantic = result.redacted;
    hasSensitiveInfo = result.hasSensitiveInfo;

    // The fallback title/description were copied out of the body before it
    // was redacted; scrub them too so they cannot leak what `semantic` hides.
    // NOTE(review): metadata extracted from <head> is returned as-is.
    for (const key of bodyDerivedKeys) {
      const value = metadata[key];
      if (value) {
        metadata[key] = await redactor.redact(value);
      }
    }
  }

  return {
    structural: mapToTokens(structuralMd),
    semantic,
    hasSensitiveInfo,
    metadata,
    isTruncated
  };
}
180
+
181
+ export { structuralPlugin as plugin } from './config.js';
182
+ export { mapToTokens } from './token-mapper.js';
183
+ export { Redactor, type RedactorOptions };
@@ -0,0 +1,34 @@
1
// Common Node.js shims for browser-side bundling.
// Every export is an inert stand-in: it exists so that imports of Node
// built-ins resolve in a browser bundle, not to reproduce Node behaviour.
// NOTE(review): presumably this is the file the browser build aliases
// fs/path/url/crypto/os/stream/worker_threads to — confirm the path.

/** Stub HTML parser: always reports that no document is available. */
export const parseHTML = () => ({ document: null });
/** Stub hash: `update()` chains and `digest()` always yields an empty string. */
export const createHash = () => ({ update: () => ({ digest: () => '' }) });
/** Placeholder class so `new Worker()` / `instanceof` do not throw on lookup. */
export const Worker = class {};
/** Reports zero CPUs. */
export const cpus = () => [];
/** Placeholder stream class. */
export const Readable = class {};

// Path shims (POSIX-style separators only; no normalisation is performed)

/** Joins segments with `/`. */
export const join = (...args: string[]) => args.join('/');
/** Same as `join`; does not resolve against a working directory. */
export const resolve = (...args: string[]) => args.join('/');
/** Everything before the last `/`, or `.` when there is none. */
export const dirname = (p: string) => p.split('/').slice(0, -1).join('/') || '.';
/** Everything after the last `/`. */
export const basename = (p: string) => p.split('/').pop() || '';
/**
 * Extension of the final path segment, including the leading dot.
 * Only the last segment is inspected, so dots in directory names are ignored,
 * and a segment that merely starts with a dot (e.g. `.bashrc`) has no
 * extension — matching Node's `path.extname`.
 */
export const extname = (p: string) => {
  const base = basename(p);
  const dot = base.lastIndexOf('.');
  return dot > 0 ? base.slice(dot) : '';
};

// FS shims: reads yield empty content, nothing ever exists, writes are no-ops.
export const readFileSync = () => '';
export const existsSync = () => false;
export const promises = {
  readFile: async () => '',
  writeFile: async () => {},
};

export default {
  join, resolve, dirname, basename, extname,
  readFileSync, existsSync,
  promises,
  createHash,
  Worker,
  cpus,
  Readable
};
@@ -0,0 +1,95 @@
1
+ import { OpenRedaction, type OpenRedactionOptions, getPatternsByCategory } from 'openredaction';
2
+
3
/**
 * Category switches for the redactor. Every boolean is opt-out: a category is
 * redacted unless its flag is explicitly `false`.
 */
export interface RedactorOptions {
  /** Email addresses, phone numbers (US / UK / international) and government identifiers. */
  redactPii?: boolean;
  /** Patterns in the engine's `financial` category. */
  redactFinancials?: boolean;
  /** Patterns in the engine's `credentials` category. */
  redactCredentials?: boolean;
  /** Patterns in the engine's `network` category. */
  redactInfrastructure?: boolean;
  /** Patterns in the engine's `healthcare` category. */
  redactMedical?: boolean;
  /**
   * NOTE(review): declared but not read by the Redactor constructor in this
   * file — setting it currently appears to have no effect; confirm intent.
   */
  customPlaceholders?: Record<string, string>;
}
11
+
12
/**
 * Sovereign wrapper for the PII Shield engine.
 *
 * Follows a "Narrative-First" strategy: names, dates and locations are left
 * in place so the text still reads coherently, while "Toxic Identifiers"
 * (SSNs, credit cards, secrets) are replaced with placeholders.
 */
export class Redactor {
  private engine: OpenRedaction;

  /**
   * Builds the pattern list from the enabled categories and creates the engine.
   *
   * @param options Category switches; each category is on unless set to `false`.
   *                `customPlaceholders` is not consulted here.
   */
  constructor(options: RedactorOptions = {}) {
    // Pattern type names registered under one engine category.
    const typesIn = (category: Parameters<typeof getPatternsByCategory>[0]): string[] =>
      getPatternsByCategory(category).map(pattern => pattern.type);

    const selected: string[] = [];

    // 1. Personal & Contact (excluding Names, Addresses, Dates)
    if (options.redactPii !== false) {
      selected.push('EMAIL', 'PHONE_US', 'PHONE_UK', 'PHONE_INTERNATIONAL', ...typesIn('government'));
    }
    // 2. Financial
    if (options.redactFinancials !== false) {
      selected.push(...typesIn('financial'));
    }
    // 3. Credentials
    if (options.redactCredentials !== false) {
      selected.push(...typesIn('credentials'));
    }
    // 4. Medical
    if (options.redactMedical !== false) {
      selected.push(...typesIn('healthcare'));
    }
    // 5. Infrastructure
    if (options.redactInfrastructure !== false) {
      selected.push(...typesIn('network'));
    }

    // NOTE(review): with every category disabled `patterns` is an empty
    // array — confirm how the engine treats that (nothing vs. its defaults).
    const engineOptions: OpenRedactionOptions = {
      patterns: selected,
      // Narrative-First safety: names and addresses are switched off
      // explicitly, in addition to being absent from the pattern list.
      includeNames: false,
      includeAddresses: false,

      redactionMode: 'placeholder',
      enableContextAnalysis: true,
      enableFalsePositiveFilter: true,
      falsePositiveThreshold: 0.7,
      deterministic: true,
    };

    this.engine = new OpenRedaction(engineOptions);
  }

  /**
   * Redacts sensitive information from the given text.
   *
   * @returns The redacted text; falsy input is returned unchanged.
   */
  public async redact(text: string): Promise<string> {
    const outcome = await this.process(text);
    return outcome.redacted;
  }

  /**
   * Checks if the text contains any sensitive information without modifying it.
   *
   * @returns `true` when the engine reports at least one detection.
   */
  public async hasSensitiveInfo(text: string): Promise<boolean> {
    const outcome = await this.process(text);
    return outcome.hasSensitiveInfo;
  }

  /**
   * Performs both detection and redaction in a single pass.
   *
   * @returns The redacted text and whether anything was detected. Falsy
   *          input short-circuits without calling the engine.
   */
  public async process(text: string): Promise<{ redacted: string, hasSensitiveInfo: boolean }> {
    if (!text) {
      return { redacted: text, hasSensitiveInfo: false };
    }
    const { redacted, detections } = await this.engine.detect(text);
    return { redacted, hasSensitiveInfo: detections.length > 0 };
  }
}
@@ -0,0 +1,32 @@
1
+ import { mapToTokens } from './token-mapper'
2
+ import { describe, expect, it } from 'vitest'
3
+
4
// Unit tests for mapToTokens: explicit structural markers, headers, and the
// final noise filter.
describe('token-mapper', () => {
  it('should map [FORM_START] to structFormStart', () => {
    const tokens = mapToTokens('\n[FORM_START]\n')
    expect(tokens).toContain('structFormStart')
  })

  it('should map [LINK:url] to structLinkElement', () => {
    const tokens = mapToTokens('[LINK:https://example.com]Click me')
    expect(tokens).toContain('structLinkElement')
  })

  it('should map [INPUT:type:name:placeholder] to structInputType', () => {
    // Type is capitalised and the field name is appended; the placeholder is dropped.
    const tokens = mapToTokens('[INPUT:text:username:Enter name]')
    expect(tokens).toContain('structInputTextusername')
  })

  it('should map headers to sysHeader', () => {
    const tokens = mapToTokens('# Main Title\n## Sub Title')
    expect(tokens).toContain('sysHeader1 MainTitle')
    expect(tokens).toContain('sysHeader2 SubTitle')
  })

  it('should strip natural language noise', () => {
    // Lower-case prose words are removed; only the struct token survives.
    const tokens = mapToTokens('this is some normal text structLinkElement and more text')
    expect(tokens).toBe('structLinkElement')
  })
})
@@ -0,0 +1,78 @@
1
/** Keeps only ASCII letters and digits, then caps the length. */
function toAlphanumeric(value: string, maxLength: number): string {
  return value.replace(/[^a-zA-Z0-9]/g, '').slice(0, maxLength);
}

/**
 * Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.
 * This prevents Apple's NLTokenizer from stripping critical punctuation.
 * We use camelCase because NLTokenizer splits snake_case (STRUCT_FORM_START -> STRUCT, FORM, START).
 *
 * Pipeline:
 *  1. Bracketed markers (`[FORM_START]`, `[INPUT:…]`, `[LINK:…]`, `[BUTTON:…]`, …)
 *     become `struct*` tokens; an optional backslash before either bracket is tolerated.
 *  2. Leftover Markdown images and links become `structImage*` / `structLinkElement*`.
 *  3. `#`, `##`, `###` headers become `sysHeader1..3` followed by the squashed title.
 *  4. URLs and Markdown punctuation are removed.
 *  5. Only words starting with `struct`, `sys`, or a capital letter (length > 2) are kept.
 *
 * @param text Structural Markdown.
 * @returns Space-separated tokens; an empty string when nothing survives.
 */
export function mapToTokens(text: string): string {
  // 1. Process explicit markers from structuralPlugin
  // We handle potential escaping from mdream
  let processed = text
    .replace(/\\?\[FORM_START\\?\]/g, 'structFormStart')
    .replace(/\\?\[FORM_END\\?\]/g, 'structFormEnd')
    .replace(/\\?\[SELECT_START\\?\]/g, 'structSelectStart')
    .replace(/\\?\[SELECT_END\\?\]/g, 'structSelectEnd')
    .replace(/\\?\[NAV_START\\?\]/g, 'structNavStart')
    .replace(/\\?\[NAV_END\\?\]/g, 'structNavEnd');

  // 1.1 Process Label marker
  processed = processed.replace(/LABEL\\?\[/g, 'structLabel ');

  // 2. Process attribute-based markers
  processed = processed
    // Inputs: [INPUT:type:name:placeholder] -> structInput{Type}{name}
    .replace(/\\?\[INPUT:([^:]+):([^:]*):([^\\\]]*)\\?\]/g, (_, type: string, name: string) => {
      // Sanitise the type as well: e.g. "datetime-local" would otherwise
      // leave a hyphen inside the token and get split by the tokenizer.
      const cleanType = type.replace(/[^a-zA-Z0-9]/g, '');
      const cleanName = toAlphanumeric(name, 20);
      return `structInput${cleanType.charAt(0).toUpperCase() + cleanType.slice(1)}${cleanName}`;
    })
    // Links: [LINK:url] -> structLinkElement
    .replace(/\\?\[LINK:[^\\\]]+\\?\]/g, () => 'structLinkElement')
    // Buttons: [BUTTON:text] -> structButton{text}
    .replace(/\\?\[BUTTON:([^\\\]]+)\\?\]/g, (_, label: string) => `structButton${toAlphanumeric(label, 20)}`);

  // 3. Process Standard Markdown artifacts (if any remain) into clean tokens.
  // Images MUST go first: the link pattern also matches the "[alt](url)" tail
  // of "![alt](url)", which would leave "!structLinkElement…" — a word the
  // final filter discards — so images would vanish from the output.
  processed = processed
    // Images: ![alt](url) -> structImage{alt}
    .replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_, alt: string) => `structImage${toAlphanumeric(alt, 20)}`)
    // Links: [text](url) -> structLinkElement{text}
    .replace(/\[([^\]]*)\]\(([^)]+)\)/g, (_, content: string) => `structLinkElement${toAlphanumeric(content, 20)}`);

  // 4. Process Headers (levels 1-3) into clean tokens
  processed = processed.replace(/^(#{1,3}) (.*)$/gm, (_, hashes: string, content: string) => {
    const title = toAlphanumeric(content.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'), 30);
    return `sysHeader${hashes.length} ${title}`;
  });

  // 5. Final cleaning: remove URLs and punctuation (including colons now)
  processed = processed
    .replace(/https?:\/\/[^\s]+/g, '') // Remove URLs
    .replace(/[#*`_\[\]():]/g, ' '); // Remove remaining markdown chars + colons

  // 6. Aggressively strip remaining natural language noise
  return processed
    .split(/\s+/)
    .filter(word =>
      word.startsWith('struct') ||
      word.startsWith('sys') ||
      (/^[A-Z]/.test(word) && word.length > 2)) // Keep capitalized words (titles, labels) > 2 chars
    .join(' ')
    .trim();
}
package/tsconfig.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ESNext",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "declaration": true,
7
+ "strict": true,
8
+ "esModuleInterop": true,
9
+ "skipLibCheck": true,
10
+ "forceConsistentCasingInFileNames": true,
11
+ "outDir": "dist"
12
+ },
13
+ "include": ["src/**/*"]
14
+ }
package/tsup.config.ts ADDED
@@ -0,0 +1,48 @@
1
+ import { defineConfig } from 'tsup';
2
+ import path from 'path';
3
+ import { fileURLToPath } from 'url';
4
+
5
// ESM has no __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Two builds from the same entry point:
 *  - a Node build (ESM + CJS) that leaves `@mdream/js` and `openredaction` external;
 *  - a browser ESM build (`index.browser`) that bundles both and redirects
 *    Node built-ins to a local shim module.
 */
export default defineConfig([
  // Node build
  {
    entry: ['src/index.ts'],
    format: ['esm', 'cjs'],
    dts: true,
    clean: true,
    sourcemap: true,
    minify: true,
    treeshake: true,
    platform: 'node',
    external: ['@mdream/js', 'openredaction'],
  },
  // Browser build
  {
    entry: {
      'index.browser': 'src/index.ts'
    },
    format: ['esm'],
    dts: true,
    // Must not wipe the output directory written by the Node build above.
    clean: false,
    sourcemap: true,
    minify: true,
    treeshake: true,
    platform: 'browser',
    noExternal: ['@mdream/js', 'openredaction'],
    define: {
      'process.versions.node': 'undefined',
      'process.platform': '"browser"',
    },
    esbuildOptions(options) {
      // Every Node built-in below resolves to the same shim file.
      const shimPath = path.resolve(__dirname, 'src/platform/empty-shim.ts');
      const shimmedModules = [
        'fs',
        'path',
        'url',
        'crypto',
        'os',
        'stream',
        'worker_threads',
        'fs/promises',
      ];
      options.alias = Object.fromEntries(
        shimmedModules.map((moduleName) => [moduleName, shimPath]),
      );
    }
  }
]);