@framers/agentos 0.1.54 → 0.1.56
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/extensions/ExtensionManager.d.ts +1 -0
- package/dist/extensions/ExtensionManager.d.ts.map +1 -1
- package/dist/extensions/ExtensionManager.js +8 -0
- package/dist/extensions/ExtensionManager.js.map +1 -1
- package/dist/extensions/ISharedServiceRegistry.d.ts +35 -0
- package/dist/extensions/ISharedServiceRegistry.d.ts.map +1 -0
- package/dist/extensions/ISharedServiceRegistry.js +2 -0
- package/dist/extensions/ISharedServiceRegistry.js.map +1 -0
- package/dist/extensions/SharedServiceRegistry.d.ts +15 -0
- package/dist/extensions/SharedServiceRegistry.d.ts.map +1 -0
- package/dist/extensions/SharedServiceRegistry.js +63 -0
- package/dist/extensions/SharedServiceRegistry.js.map +1 -0
- package/dist/extensions/index.d.ts +3 -0
- package/dist/extensions/index.d.ts.map +1 -1
- package/dist/extensions/index.js +4 -0
- package/dist/extensions/index.js.map +1 -1
- package/dist/extensions/manifest.d.ts +2 -0
- package/dist/extensions/manifest.d.ts.map +1 -1
- package/dist/extensions/packs/pii-redaction/EntityMerger.d.ts +127 -0
- package/dist/extensions/packs/pii-redaction/EntityMerger.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/EntityMerger.js +263 -0
- package/dist/extensions/packs/pii-redaction/EntityMerger.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/PiiDetectionPipeline.d.ts +199 -0
- package/dist/extensions/packs/pii-redaction/PiiDetectionPipeline.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/PiiDetectionPipeline.js +456 -0
- package/dist/extensions/packs/pii-redaction/PiiDetectionPipeline.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/PiiRedactionGuardrail.d.ts +121 -0
- package/dist/extensions/packs/pii-redaction/PiiRedactionGuardrail.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/PiiRedactionGuardrail.js +271 -0
- package/dist/extensions/packs/pii-redaction/PiiRedactionGuardrail.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/RedactionEngine.d.ts +61 -0
- package/dist/extensions/packs/pii-redaction/RedactionEngine.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/RedactionEngine.js +207 -0
- package/dist/extensions/packs/pii-redaction/RedactionEngine.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/index.d.ts +90 -0
- package/dist/extensions/packs/pii-redaction/index.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/index.js +195 -0
- package/dist/extensions/packs/pii-redaction/index.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/IEntityRecognizer.d.ts +151 -0
- package/dist/extensions/packs/pii-redaction/recognizers/IEntityRecognizer.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/IEntityRecognizer.js +14 -0
- package/dist/extensions/packs/pii-redaction/recognizers/IEntityRecognizer.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/LlmJudgeRecognizer.d.ts +177 -0
- package/dist/extensions/packs/pii-redaction/recognizers/LlmJudgeRecognizer.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/LlmJudgeRecognizer.js +420 -0
- package/dist/extensions/packs/pii-redaction/recognizers/LlmJudgeRecognizer.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NerModelRecognizer.d.ts +145 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NerModelRecognizer.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NerModelRecognizer.js +299 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NerModelRecognizer.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NlpPrefilterRecognizer.d.ts +102 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NlpPrefilterRecognizer.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NlpPrefilterRecognizer.js +228 -0
- package/dist/extensions/packs/pii-redaction/recognizers/NlpPrefilterRecognizer.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/RegexRecognizer.d.ts +103 -0
- package/dist/extensions/packs/pii-redaction/recognizers/RegexRecognizer.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/recognizers/RegexRecognizer.js +275 -0
- package/dist/extensions/packs/pii-redaction/recognizers/RegexRecognizer.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiRedactTool.d.ts +118 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiRedactTool.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiRedactTool.js +152 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiRedactTool.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiScanTool.d.ts +98 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiScanTool.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiScanTool.js +153 -0
- package/dist/extensions/packs/pii-redaction/tools/PiiScanTool.js.map +1 -0
- package/dist/extensions/packs/pii-redaction/types.d.ts +332 -0
- package/dist/extensions/packs/pii-redaction/types.d.ts.map +1 -0
- package/dist/extensions/packs/pii-redaction/types.js +83 -0
- package/dist/extensions/packs/pii-redaction/types.js.map +1 -0
- package/dist/extensions/types.d.ts +5 -0
- package/dist/extensions/types.d.ts.map +1 -1
- package/dist/extensions/types.js.map +1 -1
- package/package.json +11 -1
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file NerModelRecognizer.ts
|
|
3
|
+
* @description Tier 3 NER-model recogniser that uses a HuggingFace
|
|
4
|
+
* Transformers pipeline for high-accuracy named-entity recognition.
|
|
5
|
+
*
|
|
6
|
+
* This recogniser loads a pre-trained BERT-style NER model via the
|
|
7
|
+
* `@huggingface/transformers` library and maps BIO-tagged outputs
|
|
8
|
+
* (B-PER, I-PER, B-LOC, I-LOC, B-ORG, I-ORG, B-MISC, I-MISC) to the
|
|
9
|
+
* pipeline's {@link PiiEntityType} values.
|
|
10
|
+
*
|
|
11
|
+
* The model is lazily loaded through the {@link ISharedServiceRegistry} so
|
|
12
|
+
* that only one instance exists per agent, and it is shared across any
|
|
13
|
+
* extensions that need NER capabilities.
|
|
14
|
+
*
|
|
15
|
+
* @module pii-redaction/recognizers
|
|
16
|
+
*/
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Service identity for the shared NER pipeline
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
|
|
21
|
+
* Stable service ID for the HuggingFace NER pipeline stored in the shared
|
|
22
|
+
* service registry.
|
|
23
|
+
*/
|
|
24
|
+
// Key passed to `services.getOrCreate()` in `recognize()`; every caller using this ID resolves the same pipeline entry.
const NER_PIPELINE_SERVICE_ID = 'agentos:nlp:ner-pipeline';
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// BIO label → PiiEntityType mapping
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
/**
|
|
29
|
+
* Maps BERT NER BIO-tag prefixes (without the B-/I- prefix) to our
|
|
30
|
+
* canonical {@link PiiEntityType} values.
|
|
31
|
+
*
|
|
32
|
+
* | NER label | PiiEntityType |
|
|
33
|
+
* |-----------|-----------------|
|
|
34
|
+
* | PER | PERSON |
|
|
35
|
+
* | LOC | LOCATION |
|
|
36
|
+
* | ORG | ORGANIZATION |
|
|
37
|
+
* | MISC | UNKNOWN_PII |
|
|
38
|
+
*/
|
|
39
|
+
const NER_LABEL_MAP = {
|
|
40
|
+
PER: 'PERSON',
|
|
41
|
+
LOC: 'LOCATION',
|
|
42
|
+
ORG: 'ORGANIZATION',
|
|
43
|
+
MISC: 'UNKNOWN_PII',
|
|
44
|
+
};
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
// NerModelRecognizer
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
/**
|
|
49
|
+
* Tier 3 entity recogniser that runs a HuggingFace BERT NER model for
|
|
50
|
+
* high-accuracy named-entity recognition.
|
|
51
|
+
*
|
|
52
|
+
* ### How it works
|
|
53
|
+
* 1. On first `recognize()` call, the `@huggingface/transformers` library is
|
|
54
|
+
* loaded and a `token-classification` pipeline is created via the shared
|
|
55
|
+
* service registry.
|
|
56
|
+
* 2. The pipeline tokenises the input and runs it through the NER model,
|
|
57
|
+
* returning BIO-tagged token predictions.
|
|
58
|
+
* 3. Contiguous BIO tokens are merged: a `B-PER` followed by `I-PER` tokens
|
|
59
|
+
* becomes a single PERSON entity. The final score is the average of the
|
|
60
|
+
* constituent token scores.
|
|
61
|
+
* 4. Merged entities are mapped to {@link PiiEntity} objects.
|
|
62
|
+
*
|
|
63
|
+
* ### Graceful degradation
|
|
64
|
+
* If `@huggingface/transformers` is not installed or the model fails to load,
|
|
65
|
+
* the recogniser sets `unavailable = true` and returns empty arrays on all
|
|
66
|
+
* subsequent calls, ensuring the pipeline degrades without crashing.
|
|
67
|
+
*
|
|
68
|
+
* @example
|
|
69
|
+
* ```ts
|
|
70
|
+
* const registry = new SharedServiceRegistry();
|
|
71
|
+
* const recognizer = new NerModelRecognizer(registry);
|
|
72
|
+
* const entities = await recognizer.recognize('John Smith lives in London');
|
|
73
|
+
* // entities: [{ entityType: 'PERSON', text: 'John Smith', ... },
|
|
74
|
+
* // { entityType: 'LOCATION', text: 'London', ... }]
|
|
75
|
+
* ```
|
|
76
|
+
*/
|
|
77
|
+
export class NerModelRecognizer {
|
|
78
|
+
/**
|
|
79
|
+
* Construct a new NerModelRecognizer.
|
|
80
|
+
*
|
|
81
|
+
* @param services - Shared service registry for lazy-loading the
|
|
82
|
+
* HuggingFace NER pipeline.
|
|
83
|
+
*/
|
|
84
|
+
constructor(services) {
|
|
85
|
+
/** @inheritdoc */
|
|
86
|
+
this.name = 'NerModelRecognizer';
|
|
87
|
+
/** @inheritdoc */
|
|
88
|
+
this.supportedEntities = [
|
|
89
|
+
'PERSON',
|
|
90
|
+
'LOCATION',
|
|
91
|
+
'ORGANIZATION',
|
|
92
|
+
'UNKNOWN_PII', // Mapped from MISC entities
|
|
93
|
+
];
|
|
94
|
+
/**
|
|
95
|
+
* When `true`, the transformers library or model failed to load and all
|
|
96
|
+
* future calls will return empty arrays.
|
|
97
|
+
*/
|
|
98
|
+
this.unavailable = false;
|
|
99
|
+
this.services = services;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Scan the input text for named entities using a BERT NER model.
|
|
103
|
+
*
|
|
104
|
+
* BIO-tagged tokens are merged into contiguous entity spans and mapped
|
|
105
|
+
* to {@link PiiEntity} objects.
|
|
106
|
+
*
|
|
107
|
+
* @param input - Raw text to analyse.
|
|
108
|
+
* @param options - Optional filtering and context hints.
|
|
109
|
+
* @returns Array of detected {@link PiiEntity} objects.
|
|
110
|
+
*/
|
|
111
|
+
async recognize(input, options) {
|
|
112
|
+
// If the model previously failed to load, bail out immediately.
|
|
113
|
+
if (this.unavailable)
|
|
114
|
+
return [];
|
|
115
|
+
// Determine which entity types the caller wants.
|
|
116
|
+
const wantedTypes = this.resolveWantedTypes(options?.entityTypes);
|
|
117
|
+
if (wantedTypes.size === 0)
|
|
118
|
+
return [];
|
|
119
|
+
// Lazily load the NER pipeline via the shared service registry.
|
|
120
|
+
let pipeline;
|
|
121
|
+
try {
|
|
122
|
+
pipeline = await this.services.getOrCreate(NER_PIPELINE_SERVICE_ID, async () => {
|
|
123
|
+
// Dynamic import so @huggingface/transformers is optional.
|
|
124
|
+
const transformers = await import('@huggingface/transformers');
|
|
125
|
+
// Create a token-classification pipeline with a small NER model.
|
|
126
|
+
const pipe = await transformers.pipeline('token-classification', 'Xenova/bert-base-NER');
|
|
127
|
+
return pipe;
|
|
128
|
+
});
|
|
129
|
+
}
|
|
130
|
+
catch {
|
|
131
|
+
// Transformers not installed or model download failed.
|
|
132
|
+
this.unavailable = true;
|
|
133
|
+
return [];
|
|
134
|
+
}
|
|
135
|
+
// Run the NER pipeline on the input text.
|
|
136
|
+
let tokens;
|
|
137
|
+
try {
|
|
138
|
+
tokens = await pipeline(input);
|
|
139
|
+
}
|
|
140
|
+
catch {
|
|
141
|
+
// Runtime inference error — degrade gracefully for this call.
|
|
142
|
+
return [];
|
|
143
|
+
}
|
|
144
|
+
// Merge BIO tokens into contiguous entity spans.
|
|
145
|
+
const merged = this.mergeBioTokens(tokens);
|
|
146
|
+
// Map merged spans to PiiEntity objects, filtering by wanted types.
|
|
147
|
+
return this.mapToEntities(merged, wantedTypes);
|
|
148
|
+
}
|
|
149
|
+
/** @inheritdoc */
|
|
150
|
+
async dispose() {
|
|
151
|
+
// The pipeline is owned by the shared service registry and will be
|
|
152
|
+
// cleaned up when the registry is released.
|
|
153
|
+
}
|
|
154
|
+
// -----------------------------------------------------------------------
|
|
155
|
+
// Private helpers
|
|
156
|
+
// -----------------------------------------------------------------------
|
|
157
|
+
/**
|
|
158
|
+
* Determines which of our supported entity types the caller wants.
|
|
159
|
+
*
|
|
160
|
+
* @param entityTypes - Optional entity-type filter from the caller.
|
|
161
|
+
* @returns Set of wanted types intersected with our supported types.
|
|
162
|
+
*/
|
|
163
|
+
resolveWantedTypes(entityTypes) {
|
|
164
|
+
if (!entityTypes || entityTypes.length === 0) {
|
|
165
|
+
return new Set(this.supportedEntities);
|
|
166
|
+
}
|
|
167
|
+
const supported = new Set(this.supportedEntities);
|
|
168
|
+
return new Set(entityTypes.filter((t) => supported.has(t)));
|
|
169
|
+
}
|
|
170
|
+
/**
|
|
171
|
+
* Merges BIO-tagged tokens into contiguous entity spans.
|
|
172
|
+
*
|
|
173
|
+
* The BIO tagging scheme works as follows:
|
|
174
|
+
* - `B-XXX` — Beginning of a new entity of type XXX.
|
|
175
|
+
* - `I-XXX` — Inside/continuation of the current entity of type XXX.
|
|
176
|
+
* - `O` — Outside any entity (ignored).
|
|
177
|
+
*
|
|
178
|
+
* A `B-PER` followed by one or more `I-PER` tokens produces a single
|
|
179
|
+
* merged span. When a `B-XXX` appears while another entity is open,
|
|
180
|
+
* the previous entity is flushed and a new one begins.
|
|
181
|
+
*
|
|
182
|
+
* @param tokens - Raw BIO-tagged token array from the NER pipeline.
|
|
183
|
+
* @returns Array of merged entity spans with aggregated metadata.
|
|
184
|
+
*/
|
|
185
|
+
mergeBioTokens(tokens) {
|
|
186
|
+
const result = [];
|
|
187
|
+
let current = null;
|
|
188
|
+
for (const token of tokens) {
|
|
189
|
+
const { tag, label } = this.parseBioLabel(token.entity);
|
|
190
|
+
// Skip 'O' (outside) tokens.
|
|
191
|
+
if (tag === 'O' || !label) {
|
|
192
|
+
// Flush any in-progress entity.
|
|
193
|
+
if (current) {
|
|
194
|
+
result.push(current);
|
|
195
|
+
current = null;
|
|
196
|
+
}
|
|
197
|
+
continue;
|
|
198
|
+
}
|
|
199
|
+
if (tag === 'B') {
|
|
200
|
+
// Beginning of a new entity — flush the previous one if any.
|
|
201
|
+
if (current)
|
|
202
|
+
result.push(current);
|
|
203
|
+
current = {
|
|
204
|
+
label,
|
|
205
|
+
text: token.word,
|
|
206
|
+
start: token.start,
|
|
207
|
+
end: token.end,
|
|
208
|
+
scores: [token.score],
|
|
209
|
+
};
|
|
210
|
+
}
|
|
211
|
+
else if (tag === 'I' && current && current.label === label) {
|
|
212
|
+
// Continuation of the current entity — extend the span.
|
|
213
|
+
current.text += token.word.startsWith('##')
|
|
214
|
+
? token.word.slice(2) // WordPiece sub-token: strip '##' prefix
|
|
215
|
+
: ` ${token.word}`; // Regular token: add space separator
|
|
216
|
+
current.end = token.end;
|
|
217
|
+
current.scores.push(token.score);
|
|
218
|
+
}
|
|
219
|
+
else {
|
|
220
|
+
// I-tag without a matching B-tag or different label: treat as
|
|
221
|
+
// a new entity beginning (common with imperfect models).
|
|
222
|
+
if (current)
|
|
223
|
+
result.push(current);
|
|
224
|
+
current = {
|
|
225
|
+
label,
|
|
226
|
+
text: token.word,
|
|
227
|
+
start: token.start,
|
|
228
|
+
end: token.end,
|
|
229
|
+
scores: [token.score],
|
|
230
|
+
};
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
// Flush the last entity if still open.
|
|
234
|
+
if (current)
|
|
235
|
+
result.push(current);
|
|
236
|
+
return result;
|
|
237
|
+
}
|
|
238
|
+
/**
|
|
239
|
+
* Parses a BIO label string like `'B-PER'` or `'I-LOC'` into its
|
|
240
|
+
* tag component (`'B'`, `'I'`, `'O'`) and entity label (`'PER'`, `'LOC'`).
|
|
241
|
+
*
|
|
242
|
+
* @param bioLabel - The raw BIO label from the NER model.
|
|
243
|
+
* @returns Parsed tag and label.
|
|
244
|
+
*/
|
|
245
|
+
parseBioLabel(bioLabel) {
|
|
246
|
+
if (bioLabel === 'O')
|
|
247
|
+
return { tag: 'O', label: null };
|
|
248
|
+
const dashIdx = bioLabel.indexOf('-');
|
|
249
|
+
if (dashIdx === -1) {
|
|
250
|
+
// Non-standard label without B-/I- prefix — treat as beginning.
|
|
251
|
+
return { tag: 'B', label: bioLabel };
|
|
252
|
+
}
|
|
253
|
+
return {
|
|
254
|
+
tag: bioLabel.slice(0, dashIdx),
|
|
255
|
+
label: bioLabel.slice(dashIdx + 1),
|
|
256
|
+
};
|
|
257
|
+
}
|
|
258
|
+
/**
|
|
259
|
+
* Maps merged entity spans to {@link PiiEntity} objects, filtering by
|
|
260
|
+
* the set of wanted entity types.
|
|
261
|
+
*
|
|
262
|
+
* The score for each entity is the arithmetic mean of its constituent
|
|
263
|
+
* token scores, reflecting the model's average confidence across the
|
|
264
|
+
* full span.
|
|
265
|
+
*
|
|
266
|
+
* @param merged - Array of merged BIO entity spans.
|
|
267
|
+
* @param wantedTypes - Set of entity types the caller is interested in.
|
|
268
|
+
* @returns Filtered array of {@link PiiEntity} objects.
|
|
269
|
+
*/
|
|
270
|
+
mapToEntities(merged, wantedTypes) {
|
|
271
|
+
const entities = [];
|
|
272
|
+
for (const span of merged) {
|
|
273
|
+
// Map the NER label to our PiiEntityType.
|
|
274
|
+
const entityType = NER_LABEL_MAP[span.label];
|
|
275
|
+
if (!entityType)
|
|
276
|
+
continue; // Unknown NER label — skip.
|
|
277
|
+
// Apply entity-type filter.
|
|
278
|
+
if (!wantedTypes.has(entityType))
|
|
279
|
+
continue;
|
|
280
|
+
// Compute average score across all constituent tokens.
|
|
281
|
+
const avgScore = span.scores.reduce((sum, s) => sum + s, 0) / span.scores.length;
|
|
282
|
+
entities.push({
|
|
283
|
+
entityType,
|
|
284
|
+
text: span.text,
|
|
285
|
+
start: span.start,
|
|
286
|
+
end: span.end,
|
|
287
|
+
score: avgScore,
|
|
288
|
+
source: 'ner-model',
|
|
289
|
+
metadata: {
|
|
290
|
+
nerLabel: span.label,
|
|
291
|
+
nerModel: 'Xenova/bert-base-NER',
|
|
292
|
+
tokenCount: span.scores.length,
|
|
293
|
+
},
|
|
294
|
+
});
|
|
295
|
+
}
|
|
296
|
+
return entities;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
//# sourceMappingURL=NerModelRecognizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"NerModelRecognizer.js","sourceRoot":"","sources":["../../../../../src/extensions/packs/pii-redaction/recognizers/NerModelRecognizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAMH,8EAA8E;AAC9E,+CAA+C;AAC/C,8EAA8E;AAE9E;;;GAGG;AACH,MAAM,uBAAuB,GAAG,0BAA0B,CAAC;AAE3D,8EAA8E;AAC9E,oCAAoC;AACpC,8EAA8E;AAE9E;;;;;;;;;;GAUG;AACH,MAAM,aAAa,GAAkC;IACnD,GAAG,EAAE,QAAQ;IACb,GAAG,EAAE,UAAU;IACf,GAAG,EAAE,cAAc;IACnB,IAAI,EAAE,aAAa;CACpB,CAAC;AA6BF,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,MAAM,OAAO,kBAAkB;IAwB7B;;;;;OAKG;IACH,YAAY,QAAgC;QA7B5C,kBAAkB;QACF,SAAI,GAAG,oBAAoB,CAAC;QAE5C,kBAAkB;QACF,sBAAiB,GAAoB;YACnD,QAAQ;YACR,UAAU;YACV,cAAc;YACd,aAAa,EAAE,4BAA4B;SAC5C,CAAC;QAEF;;;WAGG;QACK,gBAAW,GAAG,KAAK,CAAC;QAe1B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IAED;;;;;;;;;OASG;IACI,KAAK,CAAC,SAAS,CAAC,KAAa,EAAE,OAA0B;QAC9D,gEAAgE;QAChE,IAAI,IAAI,CAAC,WAAW;YAAE,OAAO,EAAE,CAAC;QAEhC,iDAAiD;QACjD,MAAM,WAAW,GAAG,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QAClE,IAAI,WAAW,CAAC,IAAI,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEtC,gEAAgE;QAChE,IAAI,QAAuB,CAAC;QAC5B,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,WAAW,CACxC,uBAAuB,EACvB,KAAK,IAAI,EAAE;gBACT,2DAA2D;gBAC3D,MAAM,YAAY,GAAG,MAAM,MAAM,CAAC,2BAA2B,CAAC,CAAC;gBAC/D,iEAAiE;gBACjE,MAAM,IAAI,GAAG,MAAO,YAElB,CAAC,QAAQ,CACT,sBAAsB,EACtB,sBAAsB,CACvB,CAAC;gBACF,OAAO,IAAI,CAAC;YACd,CAAC,CACF,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,uDAAuD;YACvD,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;YACxB,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,0CAA0C;QAC1C,IAAI,MAAkB,CAAC;QACvB,IAAI,CAAC;YACH,MAAM,GAAG,MAAM,QAAQ,CAAC,KAAK,CAAC,CAAC;QACjC,CAAC;QAAC,MAAM,CAAC;YACP,8DAA8D;YAC9D,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,iDAAiD;QACjD,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;QAE3C,oEAAoE;QACpE,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IACjD,CAAC;IAED,kBAAkB;IACX,KAAK,CAAC,OAAO;QAClB,mEAAmE;QACnE,4CAA4C;IAC9C,CAAC;IAED,0EAA0E;IAC1E,kBAAkB;IAClB,0EAA0E;IAE1E;;;;;OAKG;IACK,kBAAkB,CAAC,WAA6B;QACtD,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,M
AAM,KAAK,CAAC,EAAE,CAAC;YAC7C,OAAO,IAAI,GAAG,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QACzC,CAAC;QACD,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAClD,OAAO,IAAI,GAAG,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9D,CAAC;IAED;;;;;;;;;;;;;;OAcG;IACK,cAAc,CAAC,MAAkB;QACvC,MAAM,MAAM,GAAmB,EAAE,CAAC;QAClC,IAAI,OAAO,GAAwB,IAAI,CAAC;QAExC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YAExD,6BAA6B;YAC7B,IAAI,GAAG,KAAK,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC1B,gCAAgC;gBAChC,IAAI,OAAO,EAAE,CAAC;oBACZ,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACrB,OAAO,GAAG,IAAI,CAAC;gBACjB,CAAC;gBACD,SAAS;YACX,CAAC;YAED,IAAI,GAAG,KAAK,GAAG,EAAE,CAAC;gBAChB,6DAA6D;gBAC7D,IAAI,OAAO;oBAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBAElC,OAAO,GAAG;oBACR,KAAK;oBACL,IAAI,EAAE,KAAK,CAAC,IAAI;oBAChB,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,GAAG,EAAE,KAAK,CAAC,GAAG;oBACd,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC;iBACtB,CAAC;YACJ,CAAC;iBAAM,IAAI,GAAG,KAAK,GAAG,IAAI,OAAO,IAAI,OAAO,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;gBAC7D,wDAAwD;gBACxD,OAAO,CAAC,IAAI,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;oBACzC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAE,yCAAyC;oBAChE,CAAC,CAAC,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC,CAAI,qCAAqC;gBAC9D,OAAO,CAAC,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC;gBACxB,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,8DAA8D;gBAC9D,yDAAyD;gBACzD,IAAI,OAAO;oBAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBAClC,OAAO,GAAG;oBACR,KAAK;oBACL,IAAI,EAAE,KAAK,CAAC,IAAI;oBAChB,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,GAAG,EAAE,KAAK,CAAC,GAAG;oBACd,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC;iBACtB,CAAC;YACJ,CAAC;QACH,CAAC;QAED,uCAAuC;QACvC,IAAI,OAAO;YAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAElC,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;;;OAMG;IACK,aAAa,CAAC,QAAgB;QACpC,IAAI,QAAQ,KAAK,GAAG;YAAE,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;QAEvD,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACtC,IAAI,OAA
O,KAAK,CAAC,CAAC,EAAE,CAAC;YACnB,gEAAgE;YAChE,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;QACvC,CAAC;QAED,OAAO;YACL,GAAG,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC;YAC/B,KAAK,EAAE,QAAQ,CAAC,KAAK,CAAC,OAAO,GAAG,CAAC,CAAC;SACnC,CAAC;IACJ,CAAC;IAED;;;;;;;;;;;OAWG;IACK,aAAa,CACnB,MAAsB,EACtB,WAA+B;QAE/B,MAAM,QAAQ,GAAgB,EAAE,CAAC;QAEjC,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;YAC1B,0CAA0C;YAC1C,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC7C,IAAI,CAAC,UAAU;gBAAE,SAAS,CAAC,4BAA4B;YAEvD,4BAA4B;YAC5B,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,UAAU,CAAC;gBAAE,SAAS;YAE3C,uDAAuD;YACvD,MAAM,QAAQ,GACZ,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC;YAElE,QAAQ,CAAC,IAAI,CAAC;gBACZ,UAAU;gBACV,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,KAAK,EAAE,QAAQ;gBACf,MAAM,EAAE,WAAW;gBACnB,QAAQ,EAAE;oBACR,QAAQ,EAAE,IAAI,CAAC,KAAK;oBACpB,QAAQ,EAAE,sBAAsB;oBAChC,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;iBAC/B;aACF,CAAC,CAAC;QACL,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;CACF"}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file NlpPrefilterRecognizer.ts
|
|
3
|
+
* @description Tier 2 NLP-based pre-filter recogniser that uses the
|
|
4
|
+
* `compromise` library for lightweight named-entity extraction.
|
|
5
|
+
*
|
|
6
|
+
* This recogniser is designed as a low-confidence pre-filter: it catches
|
|
7
|
+
* person names, places, and organisations that regex patterns typically miss,
|
|
8
|
+
* then flags them at a low score (0.3–0.6) so that higher tiers (NER model,
|
|
9
|
+
* LLM judge) can confirm or discard them.
|
|
10
|
+
*
|
|
11
|
+
* The `compromise` library is loaded lazily via the {@link ISharedServiceRegistry}
|
|
12
|
+
* so that the heavyweight module is only initialised once and shared across
|
|
13
|
+
* any extensions that need it.
|
|
14
|
+
*
|
|
15
|
+
* @module pii-redaction/recognizers
|
|
16
|
+
*/
|
|
17
|
+
import type { PiiEntity, PiiEntityType } from '../types';
|
|
18
|
+
import type { IEntityRecognizer, RecognizeOptions } from './IEntityRecognizer';
|
|
19
|
+
import type { ISharedServiceRegistry } from '../../../ISharedServiceRegistry';
|
|
20
|
+
/**
|
|
21
|
+
* Tier 2 NLP pre-filter recogniser that uses the `compromise` library to
|
|
22
|
+
* detect person names, places, and organisations.
|
|
23
|
+
*
|
|
24
|
+
* ### Design rationale
|
|
25
|
+
* Compromise is a rule-based NLP library (~200 KB) that is much lighter than
|
|
26
|
+
* a full transformer model. It provides reasonable recall for English-language
|
|
27
|
+
* named entities at the cost of lower precision. This recogniser intentionally
|
|
28
|
+
* assigns low confidence scores (0.3–0.6) so that its results serve as
|
|
29
|
+
* *candidates* for higher-tier confirmation rather than final detections.
|
|
30
|
+
*
|
|
31
|
+
* ### Graceful degradation
|
|
32
|
+
* If the compromise module fails to load (e.g. not installed, wrong platform),
|
|
33
|
+
* the recogniser sets an internal `unavailable` flag and returns empty results
|
|
34
|
+
* on every subsequent call without throwing. This ensures the pipeline
|
|
35
|
+
* continues to function with just the regex and/or LLM tiers.
|
|
36
|
+
*
|
|
37
|
+
* ### Shared service pattern
|
|
38
|
+
* The compromise module is loaded via the {@link ISharedServiceRegistry} so
|
|
39
|
+
* that multiple extensions in the same agent can share a single instance.
|
|
40
|
+
*
|
|
41
|
+
* @example
|
|
42
|
+
* ```ts
|
|
43
|
+
* const registry = new SharedServiceRegistry();
|
|
44
|
+
* const recognizer = new NlpPrefilterRecognizer(registry);
|
|
45
|
+
* const entities = await recognizer.recognize('John Smith lives in London');
|
|
46
|
+
* // entities might include PERSON "John Smith" and LOCATION "London"
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
export declare class NlpPrefilterRecognizer implements IEntityRecognizer {
    /** @inheritdoc */
    readonly name = "NlpPrefilterRecognizer";
    /** @inheritdoc */
    readonly supportedEntities: PiiEntityType[];
    /**
     * Latched to `true` once the compromise module has failed to load; from
     * then on every `recognize()` call silently resolves to an empty array.
     */
    private unavailable;
    /**
     * Shared service registry through which the compromise NLP module is
     * lazily loaded.
     */
    private readonly services;
    /**
     * Create a recogniser bound to the given registry.
     *
     * @param services - Shared service registry for lazy-loading compromise.
     */
    constructor(services: ISharedServiceRegistry);
    /**
     * Look for person names, places, and organisations in `input` with the
     * compromise NLP library.
     *
     * Hits carry deliberately low confidence scores (0.3–0.6): they are
     * pre-filter candidates that a higher tier is expected to confirm.
     *
     * @param input - Raw text to analyse.
     * @param options - Optional filtering and context hints.
     * @returns Array of low-confidence {@link PiiEntity} candidates.
     */
    recognize(input: string, options?: RecognizeOptions): Promise<PiiEntity[]>;
    /** @inheritdoc */
    dispose(): Promise<void>;
    /**
     * Works out which supported entity types the caller asked for.
     *
     * @param entityTypes - Optional entity-type filter from the caller.
     * @returns Set of wanted types intersected with our supported types.
     */
    private resolveWantedTypes;
    /**
     * Collapses the term-level offsets of a compromise match into one
     * contiguous [start, end) span covering the full match text.
     *
     * @param match - A compromise offset result with term-level positions.
     * @returns Object with `start` and `end` offsets, or `null` if no terms.
     */
    private computeSpan;
}
|
|
102
|
+
//# sourceMappingURL=NlpPrefilterRecognizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"NlpPrefilterRecognizer.d.ts","sourceRoot":"","sources":["../../../../../src/extensions/packs/pii-redaction/recognizers/NlpPrefilterRecognizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzD,OAAO,KAAK,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC/E,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,iCAAiC,CAAC;AAwF9E;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,sBAAuB,YAAW,iBAAiB;IAC9D,kBAAkB;IAClB,SAAgB,IAAI,4BAA4B;IAEhD,kBAAkB;IAClB,SAAgB,iBAAiB,EAAE,aAAa,EAAE,CAIhD;IAEF;;;OAGG;IACH,OAAO,CAAC,WAAW,CAAS;IAE5B;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAyB;IAElD;;;;OAIG;gBACS,QAAQ,EAAE,sBAAsB;IAI5C;;;;;;;;;;OAUG;IACU,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;IA4FvF,kBAAkB;IACL,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IASrC;;;;;OAKG;IACH,OAAO,CAAC,kBAAkB;IAQ1B;;;;;;;;OAQG;IACH,OAAO,CAAC,WAAW;CAepB"}
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file NlpPrefilterRecognizer.ts
|
|
3
|
+
* @description Tier 2 NLP-based pre-filter recogniser that uses the
|
|
4
|
+
* `compromise` library for lightweight named-entity extraction.
|
|
5
|
+
*
|
|
6
|
+
* This recogniser is designed as a low-confidence pre-filter: it catches
|
|
7
|
+
* person names, places, and organisations that regex patterns typically miss,
|
|
8
|
+
* then flags them at a low score (0.3–0.6) so that higher tiers (NER model,
|
|
9
|
+
* LLM judge) can confirm or discard them.
|
|
10
|
+
*
|
|
11
|
+
* The `compromise` library is loaded lazily via the {@link ISharedServiceRegistry}
|
|
12
|
+
* so that the heavyweight module is only initialised once and shared across
|
|
13
|
+
* any extensions that need it.
|
|
14
|
+
*
|
|
15
|
+
* @module pii-redaction/recognizers
|
|
16
|
+
*/
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Service identity constant for the shared compromise instance
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
|
|
21
|
+
* Stable service ID used to store the compromise module in the shared
|
|
22
|
+
* service registry. Other extensions that need compromise can reuse the
|
|
23
|
+
* same cached instance.
|
|
24
|
+
*/
|
|
25
|
+
// Registry key for the cached compromise module; keep it stable so every consumer resolves the same instance.
const COMPROMISE_SERVICE_ID = 'agentos:nlp:compromise';
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Score constants
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
|
|
30
|
+
* Score assigned to person name detections. Higher than places/orgs because
|
|
31
|
+
* compromise's person-name heuristics tend to be slightly more reliable.
|
|
32
|
+
*/
|
|
33
|
+
// Highest of the three pre-filter scores, still inside the 0.3–0.6 candidate band.
const PERSON_SCORE = 0.55;
|
|
34
|
+
/** Score assigned to location/place detections. */
|
|
35
|
+
// Sits between PERSON_SCORE (0.55) and ORG_SCORE (0.4).
const LOCATION_SCORE = 0.45;
|
|
36
|
+
/** Score assigned to organisation detections. */
|
|
37
|
+
// Lowest of the three pre-filter scores.
const ORG_SCORE = 0.4;
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// NlpPrefilterRecognizer
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
/**
|
|
42
|
+
* Tier 2 NLP pre-filter recogniser that uses the `compromise` library to
|
|
43
|
+
* detect person names, places, and organisations.
|
|
44
|
+
*
|
|
45
|
+
* ### Design rationale
|
|
46
|
+
* Compromise is a rule-based NLP library (~200 KB) that is much lighter than
|
|
47
|
+
* a full transformer model. It provides reasonable recall for English-language
|
|
48
|
+
* named entities at the cost of lower precision. This recogniser intentionally
|
|
49
|
+
* assigns low confidence scores (0.3–0.6) so that its results serve as
|
|
50
|
+
* *candidates* for higher-tier confirmation rather than final detections.
|
|
51
|
+
*
|
|
52
|
+
* ### Graceful degradation
|
|
53
|
+
* If the compromise module fails to load (e.g. not installed, wrong platform),
|
|
54
|
+
* the recogniser sets an internal `unavailable` flag and returns empty results
|
|
55
|
+
* on every subsequent call without throwing. This ensures the pipeline
|
|
56
|
+
* continues to function with just the regex and/or LLM tiers.
|
|
57
|
+
*
|
|
58
|
+
* ### Shared service pattern
|
|
59
|
+
* The compromise module is loaded via the {@link ISharedServiceRegistry} so
|
|
60
|
+
* that multiple extensions in the same agent can share a single instance.
|
|
61
|
+
*
|
|
62
|
+
* @example
|
|
63
|
+
* ```ts
|
|
64
|
+
* const registry = new SharedServiceRegistry();
|
|
65
|
+
* const recognizer = new NlpPrefilterRecognizer(registry);
|
|
66
|
+
* const entities = await recognizer.recognize('John Smith lives in London');
|
|
67
|
+
* // entities might include PERSON "John Smith" and LOCATION "London"
|
|
68
|
+
* ```
|
|
69
|
+
*/
|
|
70
|
+
export class NlpPrefilterRecognizer {
|
|
71
|
+
/**
|
|
72
|
+
* Construct a new NlpPrefilterRecognizer.
|
|
73
|
+
*
|
|
74
|
+
* @param services - Shared service registry for lazy-loading compromise.
|
|
75
|
+
*/
|
|
76
|
+
constructor(services) {
|
|
77
|
+
/** @inheritdoc */
|
|
78
|
+
this.name = 'NlpPrefilterRecognizer';
|
|
79
|
+
/** @inheritdoc */
|
|
80
|
+
this.supportedEntities = [
|
|
81
|
+
'PERSON',
|
|
82
|
+
'LOCATION',
|
|
83
|
+
'ORGANIZATION',
|
|
84
|
+
];
|
|
85
|
+
/**
|
|
86
|
+
* When `true`, the compromise module failed to load and all future
|
|
87
|
+
* `recognize()` calls will return empty arrays silently.
|
|
88
|
+
*/
|
|
89
|
+
this.unavailable = false;
|
|
90
|
+
this.services = services;
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Scan the input text for person names, places, and organisations using
|
|
94
|
+
* the compromise NLP library.
|
|
95
|
+
*
|
|
96
|
+
* Results are returned with low confidence scores (0.3–0.6) to indicate
|
|
97
|
+
* they are pre-filter candidates requiring higher-tier confirmation.
|
|
98
|
+
*
|
|
99
|
+
* @param input - Raw text to analyse.
|
|
100
|
+
* @param options - Optional filtering and context hints.
|
|
101
|
+
* @returns Array of low-confidence {@link PiiEntity} candidates.
|
|
102
|
+
*/
|
|
103
|
+
async recognize(input, options) {
|
|
104
|
+
// If compromise failed to load previously, bail out silently.
|
|
105
|
+
if (this.unavailable)
|
|
106
|
+
return [];
|
|
107
|
+
// Determine which entity types the caller wants from us.
|
|
108
|
+
const wantedTypes = this.resolveWantedTypes(options?.entityTypes);
|
|
109
|
+
if (wantedTypes.size === 0)
|
|
110
|
+
return [];
|
|
111
|
+
// Lazily load compromise via the shared service registry.
|
|
112
|
+
let nlp;
|
|
113
|
+
try {
|
|
114
|
+
nlp = await this.services.getOrCreate(COMPROMISE_SERVICE_ID, async () => {
|
|
115
|
+
// Dynamic import so compromise is optional at the module level.
|
|
116
|
+
const mod = await import('compromise');
|
|
117
|
+
// compromise's default export may be the function directly or
|
|
118
|
+
// wrapped in { default: ... } depending on the bundler.
|
|
119
|
+
return mod.default ?? mod;
|
|
120
|
+
});
|
|
121
|
+
}
|
|
122
|
+
catch {
|
|
123
|
+
// compromise is not installed or failed to load — degrade gracefully.
|
|
124
|
+
this.unavailable = true;
|
|
125
|
+
return [];
|
|
126
|
+
}
|
|
127
|
+
// Parse the input text with compromise.
|
|
128
|
+
const doc = nlp(input);
|
|
129
|
+
const entities = [];
|
|
130
|
+
// Extract people (PERSON entities).
|
|
131
|
+
if (wantedTypes.has('PERSON')) {
|
|
132
|
+
const people = doc.people().out('offset');
|
|
133
|
+
for (const match of people) {
|
|
134
|
+
const span = this.computeSpan(match);
|
|
135
|
+
if (span) {
|
|
136
|
+
entities.push({
|
|
137
|
+
entityType: 'PERSON',
|
|
138
|
+
text: match.text,
|
|
139
|
+
start: span.start,
|
|
140
|
+
end: span.end,
|
|
141
|
+
score: PERSON_SCORE,
|
|
142
|
+
source: 'nlp-prefilter',
|
|
143
|
+
metadata: { recognizer: 'compromise', method: 'people' },
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
// Extract places (LOCATION entities).
|
|
149
|
+
if (wantedTypes.has('LOCATION')) {
|
|
150
|
+
const places = doc.places().out('offset');
|
|
151
|
+
for (const match of places) {
|
|
152
|
+
const span = this.computeSpan(match);
|
|
153
|
+
if (span) {
|
|
154
|
+
entities.push({
|
|
155
|
+
entityType: 'LOCATION',
|
|
156
|
+
text: match.text,
|
|
157
|
+
start: span.start,
|
|
158
|
+
end: span.end,
|
|
159
|
+
score: LOCATION_SCORE,
|
|
160
|
+
source: 'nlp-prefilter',
|
|
161
|
+
metadata: { recognizer: 'compromise', method: 'places' },
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
// Extract organisations (ORGANIZATION entities).
|
|
167
|
+
if (wantedTypes.has('ORGANIZATION')) {
|
|
168
|
+
const orgs = doc.organizations().out('offset');
|
|
169
|
+
for (const match of orgs) {
|
|
170
|
+
const span = this.computeSpan(match);
|
|
171
|
+
if (span) {
|
|
172
|
+
entities.push({
|
|
173
|
+
entityType: 'ORGANIZATION',
|
|
174
|
+
text: match.text,
|
|
175
|
+
start: span.start,
|
|
176
|
+
end: span.end,
|
|
177
|
+
score: ORG_SCORE,
|
|
178
|
+
source: 'nlp-prefilter',
|
|
179
|
+
metadata: { recognizer: 'compromise', method: 'organizations' },
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
return entities;
|
|
185
|
+
}
|
|
186
|
+
/** @inheritdoc */
|
|
187
|
+
async dispose() {
|
|
188
|
+
// The compromise instance is owned by the shared service registry and
|
|
189
|
+
// will be cleaned up when the registry is released. Nothing to do here.
|
|
190
|
+
}
|
|
191
|
+
// -----------------------------------------------------------------------
|
|
192
|
+
// Private helpers
|
|
193
|
+
// -----------------------------------------------------------------------
|
|
194
|
+
/**
|
|
195
|
+
* Determines which of our supported entity types the caller wants.
|
|
196
|
+
*
|
|
197
|
+
* @param entityTypes - Optional entity-type filter from the caller.
|
|
198
|
+
* @returns Set of wanted types intersected with our supported types.
|
|
199
|
+
*/
|
|
200
|
+
resolveWantedTypes(entityTypes) {
|
|
201
|
+
if (!entityTypes || entityTypes.length === 0) {
|
|
202
|
+
return new Set(this.supportedEntities);
|
|
203
|
+
}
|
|
204
|
+
const supported = new Set(this.supportedEntities);
|
|
205
|
+
return new Set(entityTypes.filter((t) => supported.has(t)));
|
|
206
|
+
}
|
|
207
|
+
/**
|
|
208
|
+
* Computes a contiguous [start, end) span from a compromise offset result.
|
|
209
|
+
*
|
|
210
|
+
* Compromise returns individual term offsets; this method merges them into
|
|
211
|
+
* a single span covering the full match text.
|
|
212
|
+
*
|
|
213
|
+
* @param match - A compromise offset result with term-level positions.
|
|
214
|
+
* @returns Object with `start` and `end` offsets, or `null` if no terms.
|
|
215
|
+
*/
|
|
216
|
+
computeSpan(match) {
|
|
217
|
+
if (!match.terms || match.terms.length === 0)
|
|
218
|
+
return null;
|
|
219
|
+
// Start is the offset of the first term.
|
|
220
|
+
const firstTerm = match.terms[0];
|
|
221
|
+
const start = firstTerm.offset.start;
|
|
222
|
+
// End is the offset + length of the last term.
|
|
223
|
+
const lastTerm = match.terms[match.terms.length - 1];
|
|
224
|
+
const end = lastTerm.offset.start + lastTerm.offset.length;
|
|
225
|
+
return { start, end };
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
//# sourceMappingURL=NlpPrefilterRecognizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"NlpPrefilterRecognizer.js","sourceRoot":"","sources":["../../../../../src/extensions/packs/pii-redaction/recognizers/NlpPrefilterRecognizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAMH,8EAA8E;AAC9E,+DAA+D;AAC/D,8EAA8E;AAE9E;;;;GAIG;AACH,MAAM,qBAAqB,GAAG,wBAAwB,CAAC;AAyDvD,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E;;;GAGG;AACH,MAAM,YAAY,GAAG,IAAI,CAAC;AAE1B,mDAAmD;AACnD,MAAM,cAAc,GAAG,IAAI,CAAC;AAE5B,iDAAiD;AACjD,MAAM,SAAS,GAAG,GAAG,CAAC;AAEtB,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,MAAM,OAAO,sBAAsB;IAuBjC;;;;OAIG;IACH,YAAY,QAAgC;QA3B5C,kBAAkB;QACF,SAAI,GAAG,wBAAwB,CAAC;QAEhD,kBAAkB;QACF,sBAAiB,GAAoB;YACnD,QAAQ;YACR,UAAU;YACV,cAAc;SACf,CAAC;QAEF;;;WAGG;QACK,gBAAW,GAAG,KAAK,CAAC;QAc1B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IAED;;;;;;;;;;OAUG;IACI,KAAK,CAAC,SAAS,CAAC,KAAa,EAAE,OAA0B;QAC9D,8DAA8D;QAC9D,IAAI,IAAI,CAAC,WAAW;YAAE,OAAO,EAAE,CAAC;QAEhC,yDAAyD;QACzD,MAAM,WAAW,GAAG,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QAClE,IAAI,WAAW,CAAC,IAAI,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEtC,0DAA0D;QAC1D,IAAI,GAAqB,CAAC;QAC1B,IAAI,CAAC;YACH,GAAG,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,WAAW,CACnC,qBAAqB,EACrB,KAAK,IAAI,EAAE;gBACT,gEAAgE;gBAChE,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;gBACvC,8DAA8D;gBAC9D,wDAAwD;gBACxD,OAAQ,GAA+B,CAAC,OAA2B,IAAI,GAAkC,CAAC;YAC5G,CAAC,CACF,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,sEAAsE;YACtE,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;YACxB,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,wCAAwC;QACxC,MAAM,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC;QAEvB,MAAM,QAAQ,GAAgB,EAAE,CAAC;QAEjC,oCAAoC;QACpC,IAAI,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC9B,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAC1C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBACrC,IAAI,IAAI,EAAE,CAAC;oBACT,QAAQ,CAAC,IAAI,CAAC;wBACZ,UAAU,EAAE,QAAQ;wBACpB,IAAI,EAAE,KAAK,CAAC,IAAI;wBAChB,KAAK,EAAE,IAAI,CAAC,KAAK;wBACjB,GAAG,EAAE,IAAI,CAAC,GAAG;wBACb,KAAK,EAAE,YAAY;wBACnB,MAAM,EAAE,eAAe;wBACvB,QAAQ,EAAE
,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,EAAE,QAAQ,EAAE;qBACzD,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,IAAI,WAAW,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAC1C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBACrC,IAAI,IAAI,EAAE,CAAC;oBACT,QAAQ,CAAC,IAAI,CAAC;wBACZ,UAAU,EAAE,UAAU;wBACtB,IAAI,EAAE,KAAK,CAAC,IAAI;wBAChB,KAAK,EAAE,IAAI,CAAC,KAAK;wBACjB,GAAG,EAAE,IAAI,CAAC,GAAG;wBACb,KAAK,EAAE,cAAc;wBACrB,MAAM,EAAE,eAAe;wBACvB,QAAQ,EAAE,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,EAAE,QAAQ,EAAE;qBACzD,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;QAED,iDAAiD;QACjD,IAAI,WAAW,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC;YACpC,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAC/C,KAAK,MAAM,KAAK,IAAI,IAAI,EAAE,CAAC;gBACzB,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;gBACrC,IAAI,IAAI,EAAE,CAAC;oBACT,QAAQ,CAAC,IAAI,CAAC;wBACZ,UAAU,EAAE,cAAc;wBAC1B,IAAI,EAAE,KAAK,CAAC,IAAI;wBAChB,KAAK,EAAE,IAAI,CAAC,KAAK;wBACjB,GAAG,EAAE,IAAI,CAAC,GAAG;wBACb,KAAK,EAAE,SAAS;wBAChB,MAAM,EAAE,eAAe;wBACvB,QAAQ,EAAE,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,EAAE,eAAe,EAAE;qBAChE,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,kBAAkB;IACX,KAAK,CAAC,OAAO;QAClB,sEAAsE;QACtE,yEAAyE;IAC3E,CAAC;IAED,0EAA0E;IAC1E,kBAAkB;IAClB,0EAA0E;IAE1E;;;;;OAKG;IACK,kBAAkB,CAAC,WAA6B;QACtD,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7C,OAAO,IAAI,GAAG,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QACzC,CAAC;QACD,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAClD,OAAO,IAAI,GAAG,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9D,CAAC;IAED;;;;;;;;OAQG;IACK,WAAW,CACjB,KAA6B;QAE7B,IAAI,CAAC,KAAK,CAAC,KAAK,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC;QAE1D,yCAAyC;QACzC,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC;QAErC,+CAA+C;QAC/C,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,KA
AK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACrD,MAAM,GAAG,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QAE3D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC;IACxB,CAAC;CACF"}
|