@alete-ai/gate-ingest 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +622 -0
- package/dist/chunk-G7HIHPC6.js +2 -0
- package/dist/chunk-G7HIHPC6.js.map +1 -0
- package/dist/empty-shim-IFLK4AY7.js +2 -0
- package/dist/empty-shim-IFLK4AY7.js.map +1 -0
- package/dist/index.browser.d.ts +106 -0
- package/dist/index.browser.js +416 -0
- package/dist/index.browser.js.map +1 -0
- package/dist/index.cjs +12 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +106 -0
- package/dist/index.d.ts +106 -0
- package/dist/index.js +12 -0
- package/dist/index.js.map +1 -0
- package/package.json +39 -0
- package/src/config.ts +60 -0
- package/src/index.test.ts +100 -0
- package/src/index.ts +183 -0
- package/src/platform/empty-shim.ts +34 -0
- package/src/sanitization/Redactor.ts +95 -0
- package/src/token-mapper.test.ts +32 -0
- package/src/token-mapper.ts +78 -0
- package/tsconfig.json +14 -0
- package/tsup.config.ts +48 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import * as _mdream_js from '@mdream/js';
|
|
2
|
+
|
|
3
|
+
interface RedactorOptions {
|
|
4
|
+
redactPii?: boolean;
|
|
5
|
+
redactFinancials?: boolean;
|
|
6
|
+
redactCredentials?: boolean;
|
|
7
|
+
redactInfrastructure?: boolean;
|
|
8
|
+
redactMedical?: boolean;
|
|
9
|
+
customPlaceholders?: Record<string, string>;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Sovereign wrapper for the PII Shield engine.
|
|
13
|
+
* Adopts a "Narrative-First" strategy: preserves names, dates, and locations
|
|
14
|
+
* to maintain story coherence while redacting "Toxic Identifiers" (SSNs, Credit Cards, Secrets).
|
|
15
|
+
*/
|
|
16
|
+
declare class Redactor {
|
|
17
|
+
private engine;
|
|
18
|
+
constructor(options?: RedactorOptions);
|
|
19
|
+
/**
|
|
20
|
+
* Redacts sensitive information from the given text.
|
|
21
|
+
*/
|
|
22
|
+
redact(text: string): Promise<string>;
|
|
23
|
+
/**
|
|
24
|
+
* Checks if the text contains any sensitive information without modifying it.
|
|
25
|
+
*/
|
|
26
|
+
hasSensitiveInfo(text: string): Promise<boolean>;
|
|
27
|
+
/**
|
|
28
|
+
* Performs both detection and redaction in a single pass.
|
|
29
|
+
*/
|
|
30
|
+
process(text: string): Promise<{
|
|
31
|
+
redacted: string;
|
|
32
|
+
hasSensitiveInfo: boolean;
|
|
33
|
+
}>;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Custom mdream plugin to retain structural artifacts.
|
|
38
|
+
* This preserves the "structural footprint" of sensitive portals
|
|
39
|
+
* by injecting intermediate markers with attribute context.
|
|
40
|
+
*/
|
|
41
|
+
declare const structuralPlugin: _mdream_js.TransformPlugin;
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Maps structural Markdown/HTML artifacts to explicit alphanumeric tokens.
|
|
45
|
+
* This prevents Apple's NLTokenizer from stripping critical punctuation.
|
|
46
|
+
* We use camelCase because NLTokenizer splits snake_case (STRUCT_FORM_START -> STRUCT, FORM, START).
|
|
47
|
+
*/
|
|
48
|
+
declare function mapToTokens(text: string): string;
|
|
49
|
+
|
|
50
|
+
declare enum GateLabel {
|
|
51
|
+
SENSITIVE_PORTAL = "sensitive_portal",
|
|
52
|
+
DIGESTIBLE_ARTICLE = "digestible_article",
|
|
53
|
+
NOISE = "noise",
|
|
54
|
+
UNKNOWN = "unknown"
|
|
55
|
+
}
|
|
56
|
+
interface IngestionResult {
|
|
57
|
+
/**
|
|
58
|
+
* Alphanumeric, tokenized text for Apple's MaxEnt classifier.
|
|
59
|
+
* High structural fidelity, low natural language noise.
|
|
60
|
+
*/
|
|
61
|
+
structural: string;
|
|
62
|
+
/**
|
|
63
|
+
* Clean, readable Markdown for LLM analysis.
|
|
64
|
+
* Low structural noise, high semantic fidelity.
|
|
65
|
+
*/
|
|
66
|
+
semantic: string;
|
|
67
|
+
/**
|
|
68
|
+
* Whether sensitive information was found in the semantic content and redacted.
|
|
69
|
+
*/
|
|
70
|
+
hasSensitiveInfo?: boolean;
|
|
71
|
+
/**
|
|
72
|
+
* Extracted metadata from the HTML (title, description, author, etc.)
|
|
73
|
+
*/
|
|
74
|
+
metadata?: Record<string, string>;
|
|
75
|
+
/**
|
|
76
|
+
* Whether the semantic content was truncated due to the token cap.
|
|
77
|
+
*/
|
|
78
|
+
isTruncated?: boolean;
|
|
79
|
+
}
|
|
80
|
+
interface IngestionOptions {
|
|
81
|
+
/**
|
|
82
|
+
* Redaction configuration. If true, uses default settings.
|
|
83
|
+
*/
|
|
84
|
+
redact?: RedactorOptions | boolean;
|
|
85
|
+
/**
|
|
86
|
+
* Override the semantic token cap for this specific call.
|
|
87
|
+
*/
|
|
88
|
+
semanticTokenCap?: number;
|
|
89
|
+
}
|
|
90
|
+
interface GlobalConfig {
|
|
91
|
+
/**
|
|
92
|
+
* The default token cap applied to the semantic content. Defaults to 15,000.
|
|
93
|
+
*/
|
|
94
|
+
defaultSemanticTokenCap?: number;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Initializes the Alete Gate ingestion substrate with global configuration.
|
|
98
|
+
*/
|
|
99
|
+
declare function initialize(config?: GlobalConfig): void;
|
|
100
|
+
/**
|
|
101
|
+
* The unified ingestion pipeline for Alete Gate.
|
|
102
|
+
* Converts raw HTML into both structural tokens and semantic Markdown.
|
|
103
|
+
*/
|
|
104
|
+
declare function processHtml(html: string, options?: IngestionOptions): Promise<IngestionResult>;
|
|
105
|
+
|
|
106
|
+
export { GateLabel, type GlobalConfig, type IngestionOptions, type IngestionResult, Redactor, type RedactorOptions, initialize, mapToTokens, structuralPlugin as plugin, processHtml };
|