rehydra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +615 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +114 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +228 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +180 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +384 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +111 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +325 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +253 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +46 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +130 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +118 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +332 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +12 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +12 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +239 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +114 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +374 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +197 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +80 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +10 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +10 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +66 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Enricher
|
|
3
|
+
* Enriches PII spans with semantic attributes (gender, location scope)
|
|
4
|
+
* for MT-friendly tags that preserve grammatical context.
|
|
5
|
+
*
|
|
6
|
+
* This module uses data from the GeoNames and gender-guesser projects.
|
|
7
|
+
* Data is automatically downloaded when using:
|
|
8
|
+
* createAnonymizer({ semantic: { enabled: true, autoDownload: true } })
|
|
9
|
+
*/
|
|
10
|
+
import { SpanMatch, PersonGender, LocationScope } from "../types/index.js";
|
|
11
|
+
/**
|
|
12
|
+
* Configuration for semantic enrichment, accepted as the optional second argument of enrichSemantics()
|
|
13
|
+
*/
|
|
14
|
+
export interface EnricherConfig {
|
|
15
|
+
/** Locale hint for name gender disambiguation (e.g., 'de', 'it', 'fr'); enrichSemantics() forwards it to the gender lookup for PERSON spans */
|
|
16
|
+
locale?: string;
|
|
17
|
+
/** Minimum confidence to apply semantic attributes (default: 0.0). NOTE(review): the compiled enrichSemantics() in dist/pipeline/semantic-enricher.js reads only `locale`; confirm whether this threshold is enforced anywhere */
|
|
18
|
+
minConfidence?: number;
|
|
19
|
+
/** Whether to mark low-confidence results as 'unknown'. NOTE(review): not read by the compiled enrichSemantics(); confirm whether this flag currently has any effect */
|
|
20
|
+
strictMode?: boolean;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Result of gender inference with confidence (the return type of inferGender())
|
|
24
|
+
*/
|
|
25
|
+
export interface GenderResult {
|
|
26
|
+
/** Inferred gender; inferGender() reports "unknown" when no first name can be extracted, the data is not loaded, or the name has no entry */
gender: PersonGender;
|
|
27
|
+
/** Confidence of the result; inferGender() returns 1.0 for a database hit and 0 for an "unknown" result */
confidence: number;
|
|
28
|
+
/** Origin of the result; inferGender() emits only "database" and "unknown" (NOTE(review): no producer of "inference" is visible in semantic-enricher.js) */
source: "database" | "inference" | "unknown";
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Result of location classification with confidence (the return type of classifyLocation())
|
|
32
|
+
*/
|
|
33
|
+
export interface LocationResult {
|
|
34
|
+
/** Geographic scope of the location; classifyLocation() reports "unknown" when nothing matches or the data is not loaded */
scope: LocationScope;
|
|
35
|
+
/** Confidence of the result; classifyLocation() returns 1.0 for a direct lookup hit, 0.9 for a hit on a generated name variation, and 0 for "unknown" */
confidence: number;
|
|
36
|
+
/** Country code stored with the matched entry; omitted for "unknown" results (NOTE(review): code format, e.g. ISO 3166 alpha-2, is not visible here - confirm against the data loader) */
countryCode?: string;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Initializes semantic data (async, must be called before sync functions)
|
|
40
|
+
* @throws Error if data files are not available
|
|
41
|
+
*/
|
|
42
|
+
export declare function initializeEnricher(): Promise<void>;
|
|
43
|
+
/**
|
|
44
|
+
* Checks if enricher is ready for synchronous operations
|
|
45
|
+
*/
|
|
46
|
+
export declare function isEnricherReady(): boolean;
|
|
47
|
+
/**
|
|
48
|
+
* Enriches PII spans with semantic attributes based on lookup tables
|
|
49
|
+
*
|
|
50
|
+
* NOTE: This function requires semantic data to be pre-loaded via initializeEnricher()
|
|
51
|
+
* or through createAnonymizer({ semantic: { enabled: true } }). If the data has not been loaded, the input spans are returned unchanged (no error is thrown).
|
|
52
|
+
*
|
|
53
|
+
* @param spans - Array of detected PII spans
|
|
54
|
+
* @param config - Optional configuration for enrichment
|
|
55
|
+
* @returns Array of spans with semantic attributes added
|
|
56
|
+
*
|
|
57
|
+
* @example
|
|
58
|
+
* ```typescript
|
|
59
|
+
* const enrichedSpans = enrichSemantics(spans, { locale: 'de' });
|
|
60
|
+
* // "Mary" -> { gender: 'female' }
|
|
61
|
+
* // "Berlin" -> { scope: 'city' }
|
|
62
|
+
* ```
|
|
63
|
+
*/
|
|
64
|
+
export declare function enrichSemantics(spans: SpanMatch[], config?: EnricherConfig): SpanMatch[];
|
|
65
|
+
/**
|
|
66
|
+
* Infers gender from a person's name using the lookup database
|
|
67
|
+
*
|
|
68
|
+
* @param name - Full name or first name
|
|
69
|
+
* @param locale - Optional locale for disambiguation (e.g., 'de', 'it')
|
|
70
|
+
* @returns Gender result with confidence
|
|
71
|
+
*
|
|
72
|
+
* @example
|
|
73
|
+
* ```typescript
|
|
74
|
+
* inferGender('Mary Smith'); // { gender: 'female', confidence: 1.0, source: 'database' }
|
|
75
|
+
* inferGender('Andrea', 'it'); // { gender: 'male', confidence: 1.0, source: 'database' }
|
|
76
|
+
* inferGender('Andrea', 'en'); // { gender: 'female', confidence: 1.0, source: 'database' }
|
|
77
|
+
* ```
|
|
78
|
+
*/
|
|
79
|
+
export declare function inferGender(name: string, locale?: string): GenderResult;
|
|
80
|
+
/**
|
|
81
|
+
* Classifies a location by its geographic scope
|
|
82
|
+
*
|
|
83
|
+
* @param location - Location name
|
|
84
|
+
* @returns Classification result with confidence
|
|
85
|
+
*
|
|
86
|
+
* @example
|
|
87
|
+
* ```typescript
|
|
88
|
+
* classifyLocation('Berlin'); // { scope: 'city', confidence: 1.0 }
|
|
89
|
+
* classifyLocation('Germany'); // { scope: 'country', confidence: 1.0 }
|
|
90
|
+
* classifyLocation('Bavaria'); // { scope: 'region', confidence: 1.0 }
|
|
91
|
+
* ```
|
|
92
|
+
*/
|
|
93
|
+
export declare function classifyLocation(location: string): LocationResult;
|
|
94
|
+
/**
|
|
95
|
+
* Gets statistics about the lookup databases
|
|
96
|
+
*/
|
|
97
|
+
export declare function getDatabaseStats(): {
|
|
98
|
+
names: number;
|
|
99
|
+
cities: number;
|
|
100
|
+
countries: number;
|
|
101
|
+
regions: number;
|
|
102
|
+
loaded: boolean;
|
|
103
|
+
};
|
|
104
|
+
/**
|
|
105
|
+
* Checks if a name exists in the database
|
|
106
|
+
*/
|
|
107
|
+
export declare function hasName(name: string): boolean;
|
|
108
|
+
/**
|
|
109
|
+
* Checks if a location exists in the database
|
|
110
|
+
*/
|
|
111
|
+
export declare function hasLocation(location: string): boolean;
|
|
112
|
+
//# sourceMappingURL=semantic-enricher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic-enricher.d.ts","sourceRoot":"","sources":["../../src/pipeline/semantic-enricher.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACL,SAAS,EAET,YAAY,EACZ,aAAa,EACd,MAAM,mBAAmB,CAAC;AAS3B;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,0EAA0E;IAC1E,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,qEAAqE;IACrE,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,0DAA0D;IAC1D,UAAU,CAAC,EAAE,OAAO,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,MAAM,EAAE,YAAY,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,UAAU,GAAG,WAAW,GAAG,SAAS,CAAC;CAC9C;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,aAAa,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAKD;;;GAGG;AACH,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAaxD;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,OAAO,CAEzC;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,SAAS,EAAE,EAClB,MAAM,CAAC,EAAE,cAAc,GACtB,SAAS,EAAE,CAkBb;AAoGD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,YAAY,CAwBvE;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,cAAc,CAgCjE;AA+ED;;GAEG;AACH,wBAAgB,gBAAgB,IAAI;IAClC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,OAAO,CAAC;CACjB,CAEA;AAED;;GAEG;AACH,wBAAgB,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAU7C;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAQrD"}
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic Enricher
|
|
3
|
+
* Enriches PII spans with semantic attributes (gender, location scope)
|
|
4
|
+
* for MT-friendly tags that preserve grammatical context.
|
|
5
|
+
*
|
|
6
|
+
* This module uses data from the GeoNames and gender-guesser projects.
|
|
7
|
+
* Data is automatically downloaded when using:
|
|
8
|
+
* createAnonymizer({ semantic: { enabled: true, autoDownload: true } })
|
|
9
|
+
*/
|
|
10
|
+
import { PIIType, } from "../types/index.js";
|
|
11
|
+
import { isSemanticDataDownloaded, loadSemanticData, getSemanticDataSync, getDataStats, } from "./semantic-data-loader.js";
|
|
12
|
+
// Module-level flag: set to true by initializeEnricher() after loadSemanticData() has resolved; read by isEnricherReady()
|
|
13
|
+
let dataInitialized = false;
|
|
14
|
+
/**
 * Loads the semantic lookup data so the synchronous helpers of this module
 * can be used afterwards.
 *
 * Once an initialization has succeeded, later calls return immediately. The
 * module-level flag is only set after loadSemanticData() has resolved, so a
 * failed attempt can simply be retried.
 *
 * @returns Promise that resolves when the data has been loaded
 * @throws Error if isSemanticDataDownloaded() reports that the data is missing
 */
export async function initializeEnricher() {
    if (!dataInitialized) {
        const downloaded = await isSemanticDataDownloaded();
        if (!downloaded) {
            throw new Error(`Semantic enrichment data not available. ` +
                `Use ensureSemanticData() or createAnonymizer({ semantic: { enabled: true } }) to download.`);
        }
        await loadSemanticData();
        dataInitialized = true;
    }
}
|
|
29
|
+
/**
 * Reports whether the synchronous enrichment functions can be used.
 *
 * @returns true only when initializeEnricher() has completed AND
 *          getSemanticDataSync() currently returns data
 */
export function isEnricherReady() {
    if (!dataInitialized) {
        // Same short-circuit as `flag && ...`: the data getter is not consulted.
        return false;
    }
    return getSemanticDataSync() !== null;
}
|
|
35
|
+
/**
 * Adds semantic attributes to detected PII spans using the lookup tables.
 *
 * PERSON spans receive a `semantic.gender`, LOCATION spans a `semantic.scope`;
 * spans of any other type are passed through untouched.
 *
 * NOTE: The semantic data must already be loaded, either via
 * initializeEnricher() or through createAnonymizer({ semantic: { enabled: true } }).
 * When it is not loaded, the very same `spans` array is returned and no
 * enrichment happens (no error is raised).
 *
 * Only `config.locale` is consulted by this function.
 *
 * @param spans - Array of detected PII spans
 * @param config - Optional configuration for enrichment
 * @returns A new array of spans with semantic attributes added, or the input
 *          array itself when no data is available
 *
 * @example
 * ```typescript
 * const enrichedSpans = enrichSemantics(spans, { locale: 'de' });
 * // "Mary" -> { gender: 'female' }
 * // "Berlin" -> { scope: 'city' }
 * ```
 */
export function enrichSemantics(spans, config) {
    // Without lookup data there is nothing to add: hand the input back as-is.
    if (getSemanticDataSync() === null) {
        return spans;
    }
    return spans.map((span) => {
        if (span.type === PIIType.PERSON) {
            return enrichPerson(span, config?.locale);
        }
        if (span.type === PIIType.LOCATION) {
            return enrichLocation(span);
        }
        return span;
    });
}
|
|
70
|
+
/**
 * Returns a copy of a PERSON span whose `semantic` object carries the gender
 * inferred from the span text. Existing semantic attributes are preserved;
 * an existing `gender` is overwritten.
 *
 * @param span - PERSON span to enrich (not mutated)
 * @param locale - Optional locale hint forwarded to inferGender()
 * @returns New span object with `semantic.gender` set
 */
function enrichPerson(span, locale) {
    const { gender } = inferGender(span.text, locale);
    const semantic = { ...span.semantic, gender };
    return { ...span, semantic };
}
|
|
83
|
+
/**
 * Returns a copy of a LOCATION span whose `semantic` object carries the scope
 * determined by classifyLocation(). Existing semantic attributes are
 * preserved; an existing `scope` is overwritten.
 *
 * @param span - LOCATION span to enrich (not mutated)
 * @returns New span object with `semantic.scope` set
 */
function enrichLocation(span) {
    const { scope } = classifyLocation(span.text);
    const semantic = { ...span.semantic, scope };
    return { ...span, semantic };
}
|
|
96
|
+
/**
 * Looks up the gender stored for a first name (synchronous).
 *
 * The name is lower-cased before the lookup. When a locale is given and the
 * entry has locale-specific overrides, the override wins over the default
 * gender. The override is searched first under the locale exactly as passed
 * and then under its primary language subtag, so "de-DE" or "de_CH" fall back
 * to an override stored under "de".
 *
 * Only own properties of `localeOverrides` count as overrides. A plain
 * `overrides[locale] !== undefined` test would also match members inherited
 * from Object.prototype (e.g. a locale of "constructor") and return a
 * function as the gender.
 *
 * @param name - First name to look up
 * @param locale - Optional locale hint (e.g. 'de', 'it', 'de-DE')
 * @returns The stored gender value, or undefined when the data is not loaded
 *          or the name has no entry
 */
function lookupGenderSync(name, locale) {
    const data = getSemanticDataSync();
    if (data === null)
        return undefined;
    const entry = data.names.get(name.toLowerCase());
    if (entry === undefined)
        return undefined;
    const overrides = entry.localeOverrides;
    if (typeof locale === "string" &&
        locale !== "" &&
        overrides !== undefined &&
        overrides !== null) {
        const hasOverride = (key) => Object.prototype.hasOwnProperty.call(overrides, key) &&
            overrides[key] !== undefined;
        if (hasOverride(locale)) {
            return overrides[locale];
        }
        // Fall back from a region-qualified tag to its language ("it-IT" -> "it").
        const language = locale.split(/[-_]/)[0].toLowerCase();
        if (language !== "" && language !== locale && hasOverride(language)) {
            return overrides[language];
        }
    }
    return entry.gender;
}
|
|
115
|
+
/**
|
|
116
|
+
* Population threshold for "major" cities: in lookupLocationTypeSync() a city whose population is at least this value is matched before a region of the same name
|
|
117
|
+
*/
|
|
118
|
+
const MAJOR_CITY_POPULATION = 500000;
|
|
119
|
+
/**
 * Resolves a location name to its type (synchronous).
 *
 * The name is lower-cased and trimmed, then matched in this order:
 *   1. countries (so that e.g. "USA" is never reported as a city),
 *   2. cities with a population of at least MAJOR_CITY_POPULATION,
 *   3. regions,
 *   4. any remaining (smaller) city.
 *
 * @param location - Location name to look up
 * @returns `{ type, countryCode }` for the first match, or undefined when the
 *          data is not loaded or nothing matches
 */
function lookupLocationTypeSync(location) {
    const data = getSemanticDataSync();
    if (data === null) {
        return undefined;
    }
    const key = location.toLowerCase().trim();
    const countryCode = data.countries.get(key);
    if (countryCode !== undefined) {
        return { type: "country", countryCode };
    }
    const city = data.cities.get(key);
    const cityMatch = city
        ? { type: "city", countryCode: city.country }
        : undefined;
    // A major city outranks a region that shares its name.
    if (cityMatch && city.population >= MAJOR_CITY_POPULATION) {
        return cityMatch;
    }
    const region = data.regions.get(key);
    if (region) {
        return { type: "region", countryCode: region.country };
    }
    // Smaller city, or undefined when the name is not a city either.
    return cityMatch;
}
|
|
148
|
+
/**
 * Infers gender from a person's name using the lookup database.
 *
 * The first name is extracted from `name` and looked up, taking an optional
 * locale into account. A database hit is reported with confidence 1.0 and
 * source "database". When no first name can be extracted, the data is not
 * loaded, or the name has no (non-empty) entry, the result is
 * `{ gender: "unknown", confidence: 0, source: "unknown" }`.
 *
 * @param name - Full name or first name
 * @param locale - Optional locale for disambiguation (e.g., 'de', 'it')
 * @returns Gender result with confidence and source
 *
 * @example
 * ```typescript
 * inferGender('Mary Smith'); // { gender: 'female', confidence: 1.0, source: 'database' }
 * inferGender('Andrea', 'it'); // { gender: 'male', confidence: 1.0, source: 'database' }
 * inferGender('Andrea', 'en'); // { gender: 'female', confidence: 1.0, source: 'database' }
 * ```
 */
export function inferGender(name, locale) {
    // Built per call so callers never share a result object.
    const unknownResult = { gender: "unknown", confidence: 0, source: "unknown" };
    const firstName = extractFirstName(name);
    if (firstName === null || firstName === "") {
        return unknownResult;
    }
    if (getSemanticDataSync() === null) {
        return unknownResult;
    }
    const gender = lookupGenderSync(firstName, locale);
    if (gender === undefined || gender === "") {
        return unknownResult;
    }
    return { gender, confidence: 1.0, source: "database" };
}
|
|
183
|
+
/**
 * Classifies a location by its geographic scope.
 *
 * Lookup order:
 *   1. the name exactly as given (lower-cased, trimmed, whitespace collapsed),
 *   2. the name with a generic suffix such as "city" or "county" removed
 *      (normalizeLocationName()),
 *   3. the variations produced by generateLocationVariations().
 *
 * Step 1 exists because many real names end in one of the stripped suffixes.
 * Without it "Cape Town" is only ever looked up as "cape" and "Mexico City"
 * as "mexico", which resolves to the country instead of the city.
 *
 * Hits from steps 1 and 2 have confidence 1.0, hits from step 3 have 0.9.
 * When nothing matches, or the data is not loaded, the result is
 * `{ scope: "unknown", confidence: 0 }`.
 *
 * @param location - Location name
 * @returns Classification result with confidence and, for matches, the
 *          country code stored with the matched entry
 *
 * @example
 * ```typescript
 * classifyLocation('Berlin'); // { scope: 'city', confidence: 1.0 }
 * classifyLocation('Germany'); // { scope: 'country', confidence: 1.0 }
 * classifyLocation('Bavaria'); // { scope: 'region', confidence: 1.0 }
 * ```
 */
export function classifyLocation(location) {
    if (getSemanticDataSync() === null) {
        return { scope: "unknown", confidence: 0 };
    }
    const exact = location.toLowerCase().trim().replace(/\s+/g, " ");
    const normalized = normalizeLocationName(location);
    // Skip the duplicate lookup when suffix stripping changed nothing.
    const directCandidates = exact === normalized ? [normalized] : [exact, normalized];
    for (const candidate of directCandidates) {
        const result = lookupLocationTypeSync(candidate);
        if (result) {
            return {
                scope: result.type,
                confidence: 1.0,
                countryCode: result.countryCode,
            };
        }
    }
    // Fuzzy fallbacks (articles removed, diacritics folded) are slightly less certain.
    for (const variant of generateLocationVariations(location)) {
        const variantResult = lookupLocationTypeSync(variant);
        if (variantResult) {
            return {
                scope: variantResult.type,
                confidence: 0.9,
                countryCode: variantResult.countryCode,
            };
        }
    }
    return { scope: "unknown", confidence: 0 };
}
|
|
225
|
+
/**
 * Extracts the first name from a full name.
 *
 * Leading titles (Dr., Mr., Mrs., Ms., Prof., Rev., Sir, Dame, Lord, Lady -
 * case-insensitive, dot optional) are removed first. Stacked titles such as
 * "Prof. Dr. Hans Müller" are removed completely. A title is only stripped
 * when it is followed by whitespace, so a lone "Lady" is returned as is.
 *
 * @param fullName - Full name or first name, optionally prefixed with titles
 * @returns The first whitespace-separated word after any titles, or null
 *          when the input is empty or whitespace only
 */
function extractFirstName(fullName) {
    const trimmed = fullName.trim();
    if (!trimmed)
        return null;
    // The outer group repeats so that several consecutive titles are all removed.
    const withoutTitles = trimmed.replace(/^(?:(?:dr\.?|mr\.?|mrs\.?|ms\.?|prof\.?|rev\.?|sir|dame|lord|lady)\s+)+/i, "");
    const [firstWord] = withoutTitles.split(/\s+/);
    return firstWord ?? null;
}
|
|
238
|
+
/**
 * Brings a location name into the canonical form used for lookups:
 * lower-cased, trimmed, without one trailing generic suffix
 * (city, town, village, state, province, region, county) and with every run
 * of whitespace collapsed to a single space.
 *
 * The suffix is only removed when it is preceded by whitespace, so a bare
 * "City" stays "city".
 *
 * @param location - Raw location name
 * @returns Normalized lookup key
 */
function normalizeLocationName(location) {
    const lowered = location.toLowerCase().trim();
    const withoutSuffix = lowered.replace(/\s+(city|town|village|state|province|region|county)$/i, "");
    return withoutSuffix.replace(/\s+/g, " ");
}
|
|
250
|
+
/**
 * Folds diacritics of Latin letters to their plain ASCII base form.
 *
 * Canonical decomposition (NFD) splits letters such as "é", "ñ", "č" or "å"
 * into a base letter plus combining marks, which are then dropped. Letters
 * without a canonical decomposition are mapped explicitly. The result is
 * re-composed (NFC) so that scripts which merely decompose under NFD
 * (e.g. Hangul) come back unchanged.
 *
 * @param text - Lower-cased text
 * @returns Text with diacritics removed
 */
function foldLocationDiacritics(text) {
    return text
        .normalize("NFD")
        .replace(/[\u0300-\u036f]/g, "")
        .replace(/ß/g, "ss")
        .replace(/æ/g, "ae")
        .replace(/œ/g, "oe")
        .replace(/ø/g, "o")
        .replace(/ł/g, "l")
        .replace(/đ/g, "d")
        .normalize("NFC");
}
/**
 * Generates variations of a location name for fuzzy matching.
 *
 * Starting from the normalized name, the candidates are, in this order:
 *   1. the name without a leading "the ",
 *   2. the name without a leading article of another language
 *      (la, le, les, el, los, las, il, lo, gli, i, die, der, das, de, het),
 *   3. the diacritic-folded form of the normalized name,
 *   4. the diacritic-folded forms of candidates 1 and 2.
 *
 * Candidates equal to the normalized name and duplicates are left out.
 *
 * @param location - Raw location name
 * @returns Candidate lookup keys, most specific first (possibly empty)
 */
function generateLocationVariations(location) {
    const normalized = normalizeLocationName(location);
    const variations = [];
    const addVariation = (candidate) => {
        if (candidate !== normalized && !variations.includes(candidate)) {
            variations.push(candidate);
        }
    };
    // Try without "the"
    if (normalized.startsWith("the ")) {
        addVariation(normalized.slice(4));
    }
    // Try without common articles in other languages
    addVariation(normalized.replace(/^(la|le|les|el|los|las|il|lo|gli|i|die|der|das|de|het)\s+/i, ""));
    // Try ASCII transliteration, for the full name first and then for the
    // article-less forms ("la réunion" -> "la reunion", "reunion").
    for (const base of [normalized, ...variations]) {
        addVariation(foldLocationDiacritics(base));
    }
    return variations;
}
|
|
288
|
+
/**
 * Gets statistics about the lookup databases.
 *
 * Thin wrapper: the value produced by getDataStats() of the semantic data
 * loader is returned unchanged.
 *
 * @returns The statistics object reported by the data loader
 */
export function getDatabaseStats() {
    const stats = getDataStats();
    return stats;
}
|
|
294
|
+
/**
 * Checks if a name exists in the database.
 *
 * The first name is extracted from `name` (titles removed) and looked up
 * without a locale.
 *
 * @param name - Full name or first name
 * @returns true when the first name has a database entry; false when it has
 *          none, no first name can be extracted, or the data is not loaded
 */
export function hasName(name) {
    if (getSemanticDataSync() === null) {
        return false;
    }
    const firstName = extractFirstName(name);
    const usable = firstName !== null && firstName !== "";
    return usable && lookupGenderSync(firstName) !== undefined;
}
|
|
307
|
+
/**
 * Checks if a location exists in the database.
 *
 * The name is looked up exactly as given first (lower-cased, trimmed,
 * whitespace collapsed). Only if that fails is it looked up with a generic
 * suffix such as "city" or "county" removed. Trying the exact name first
 * keeps names like "Carson City" or "Cape Town" findable; they would
 * otherwise only be searched as "carson" / "cape".
 *
 * @param location - Location name
 * @returns true when either form of the name is a known country, city or
 *          region; false otherwise or when the data is not loaded
 */
export function hasLocation(location) {
    if (getSemanticDataSync() === null) {
        return false;
    }
    const exact = location.toLowerCase().trim().replace(/\s+/g, " ");
    if (lookupLocationTypeSync(exact) !== undefined) {
        return true;
    }
    const normalized = normalizeLocationName(location);
    // Avoid repeating the same lookup when no suffix was stripped.
    return normalized !== exact && lookupLocationTypeSync(normalized) !== undefined;
}
|
|
317
|
+
}
|
|
318
|
+
//# sourceMappingURL=semantic-enricher.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic-enricher.js","sourceRoot":"","sources":["../../src/pipeline/semantic-enricher.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAEL,OAAO,GAGR,MAAM,mBAAmB,CAAC;AAE3B,OAAO,EACL,wBAAwB,EACxB,gBAAgB,EAChB,mBAAmB,EACnB,YAAY,GACb,MAAM,2BAA2B,CAAC;AAgCnC,qCAAqC;AACrC,IAAI,eAAe,GAAG,KAAK,CAAC;AAE5B;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB;IACtC,IAAI,eAAe;QAAE,OAAO;IAE5B,MAAM,SAAS,GAAG,MAAM,wBAAwB,EAAE,CAAC;IACnD,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CACb,0CAA0C;YACxC,4FAA4F,CAC/F,CAAC;IACJ,CAAC;IAED,MAAM,gBAAgB,EAAE,CAAC;IACzB,eAAe,GAAG,IAAI,CAAC;AACzB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,OAAO,eAAe,IAAI,mBAAmB,EAAE,KAAK,IAAI,CAAC;AAC3D,CAAC;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,UAAU,eAAe,CAC7B,KAAkB,EAClB,MAAuB;IAEvB,0BAA0B;IAC1B,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,iDAAiD;QACjD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACxB,QAAQ,IAAI,CAAC,IAAI,EAAE,CAAC;YAClB,KAAK,OAAO,CAAC,MAAM;gBACjB,OAAO,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;YAC5C,KAAK,OAAO,CAAC,QAAQ;gBACnB,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;YAC9B;gBACE,OAAO,IAAI,CAAC;QAChB,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,IAAe,EAAE,MAAe;IACpD,MAAM,MAAM,GAAG,WAAW,CAAC,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAE9C,OAAO;QACL,GAAG,IAAI;QACP,QAAQ,EAAE;YACR,GAAG,IAAI,CAAC,QAAQ;YAChB,MAAM,EAAE,MAAM,CAAC,MAAM;SACtB;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,IAAe;IACrC,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE3C,OAAO;QACL,GAAG,IAAI;QACP,QAAQ,EAAE;YACR,GAAG,IAAI,CAAC,QAAQ;YAChB,KAAK,EAAE,MAAM,CAAC,KAAK;SACpB;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CACvB,IAAY,EACZ,MAAe;IAEf,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI;QAAE,OAAO,SAAS,CAAC;IAEpC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;IACjD,IAAI,KAAK,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IAE1C,qCAAqC;IACrC,IACE,MAAM,KAAK,SAAS;QACpB,MAAM,KAAK,EAAE;QACb,KAAK,CAAC,eAAe,KAAK,SAAS;QACnC,KAAK,
CAAC,eAAe,CAAC,MAAM,CAAC,KAAK,SAAS,EAC3C,CAAC;QACD,OAAO,KAAK,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,KAAK,CAAC,MAAM,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,MAAM,qBAAqB,GAAG,MAAM,CAAC;AAErC;;GAEG;AACH,SAAS,sBAAsB,CAC7B,QAAgB;IAEhB,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI;QAAE,OAAO,SAAS,CAAC;IAEpC,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;IAEjD,iEAAiE;IACjE,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IACnD,IAAI,WAAW,KAAK,SAAS,EAAE,CAAC;QAC9B,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC;IAC1C,CAAC;IAED,+EAA+E;IAC/E,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IACzC,IAAI,IAAI,IAAI,IAAI,CAAC,UAAU,IAAI,qBAAqB,EAAE,CAAC;QACrD,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,IAAI,CAAC,OAAO,EAAE,CAAC;IACrD,CAAC;IAED,gBAAgB;IAChB,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC5C,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,CAAC,OAAO,EAAE,CAAC;IACzD,CAAC;IAED,0CAA0C;IAC1C,IAAI,IAAI,EAAE,CAAC;QACT,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,IAAI,CAAC,OAAO,EAAE,CAAC;IACrD,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY,EAAE,MAAe;IACvD,sDAAsD;IACtD,MAAM,SAAS,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACzC,IAAI,SAAS,KAAK,IAAI,IAAI,SAAS,KAAK,EAAE,EAAE,CAAC;QAC3C,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;IACjE,CAAC;IAED,6BAA6B;IAC7B,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;IACjE,CAAC;IAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAEnD,IAAI,MAAM,KAAK,SAAS,IAAI,MAAM,KAAK,EAAE,EAAE,CAAC;QAC1C,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,UAAU,EAAE,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;IACjE,CAAC;IAED,OAAO;QACL,MAAM,EAAE,MAAsB;QAC9B,UAAU,EAAE,GAAG;QACf,MAAM,EAAE,UAAU;KACnB,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,gBAAgB,CAAC,QAAgB;IAC/C,6BAA6B;IAC7B,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO
,EAAE,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IAC7C,CAAC;IAED,MAAM,UAAU,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IACnD,MAAM,MAAM,GAAG,sBAAsB,CAAC,UAAU,CAAC,CAAC;IAElD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,iBAAiB;QACjB,MAAM,UAAU,GAAG,0BAA0B,CAAC,QAAQ,CAAC,CAAC;QACxD,KAAK,MAAM,OAAO,IAAI,UAAU,EAAE,CAAC;YACjC,MAAM,aAAa,GAAG,sBAAsB,CAAC,OAAO,CAAC,CAAC;YACtD,IAAI,aAAa,EAAE,CAAC;gBAClB,OAAO;oBACL,KAAK,EAAE,aAAa,CAAC,IAAqB;oBAC1C,UAAU,EAAE,GAAG;oBACf,WAAW,EAAE,aAAa,CAAC,WAAW;iBACvC,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC;IAC7C,CAAC;IAED,OAAO;QACL,KAAK,EAAE,MAAM,CAAC,IAAqB;QACnC,UAAU,EAAE,GAAG;QACf,WAAW,EAAE,MAAM,CAAC,WAAW;KAChC,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,QAAgB;IACxC,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC;IAChC,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IAE1B,gDAAgD;IAChD,MAAM,aAAa,GAAG,OAAO,CAAC,OAAO,CACnC,mEAAmE,EACnE,EAAE,CACH,CAAC;IAEF,2BAA2B;IAC3B,MAAM,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACzC,OAAO,KAAK,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,OAAO,CACL,QAAQ;SACL,WAAW,EAAE;SACb,IAAI,EAAE;QACP,yBAAyB;SACxB,OAAO,CAAC,uDAAuD,EAAE,EAAE,CAAC;QACrE,uBAAuB;SACtB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CACxB,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,0BAA0B,CAAC,QAAgB;IAClD,MAAM,UAAU,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IACnD,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,oBAAoB;IACpB,IAAI,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;QAClC,UAAU,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,iDAAiD;IACjD,MAAM,eAAe,GAAG;QACtB,4DAA4D;KAC7D,CAAC;IACF,KAAK,MAAM,OAAO,IAAI,eAAe,EAAE,CAAC;QACtC,MAAM,cAAc,GAAG,UAAU,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACvD,IAAI,cAAc,KAAK,UAAU,EAAE,CAAC;YAClC,UAAU,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,kDAAkD;IAClD,MAAM,YAAY,GAAG,UAAU;SAC5B,OAAO,CAAC,WAAW,EAAE,GAAG,CAAC;SACzB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,
OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC;SACrB,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC;SACrB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAEzB,IAAI,YAAY,KAAK,UAAU,EAAE,CAAC;QAChC,UAAU,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAChC,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB;IAO9B,OAAO,YAAY,EAAE,CAAC;AACxB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,OAAO,CAAC,IAAY;IAClC,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,KAAK,CAAC;IACf,CAAC;IAED,MAAM,SAAS,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACzC,IAAI,SAAS,KAAK,IAAI,IAAI,SAAS,KAAK,EAAE;QAAE,OAAO,KAAK,CAAC;IAEzD,OAAO,gBAAgB,CAAC,SAAS,CAAC,KAAK,SAAS,CAAC;AACnD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB;IAC1C,MAAM,IAAI,GAAG,mBAAmB,EAAE,CAAC;IACnC,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,OAAO,KAAK,CAAC;IACf,CAAC;IAED,MAAM,UAAU,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IACnD,OAAO,sBAAsB,CAAC,UAAU,CAAC,KAAK,SAAS,CAAC;AAC1D,CAAC"}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Replacement Tagger
|
|
3
|
+
* Replaces PII spans with placeholder tags and builds the PII map
|
|
4
|
+
*/
|
|
5
|
+
import { PIIType, SpanMatch, DetectedEntity, AnonymizationPolicy, SemanticAttributes } from "../types/index.js";
|
|
6
|
+
/**
|
|
7
|
+
* PII Map entry (before encryption): the type, ID and original text of one entity
|
|
8
|
+
*/
|
|
9
|
+
export interface PIIMapEntry {
|
|
10
|
+
/** PII type of the entity */
|
|
11
|
+
type: PIIType;
|
|
12
|
+
/** Entity ID (NOTE(review): presumably the same id that appears in the placeholder tag — confirm in tagger.ts) */
|
|
13
|
+
id: number;
|
|
14
|
+
/** Original text of the entity */
|
|
15
|
+
original: string;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Raw PII Map (before encryption)
* Maps a PII map key to the original text it stands for.
* NOTE(review): keys are presumably the strings returned by createPIIMapKey — confirm.
|
|
19
|
+
*/
|
|
20
|
+
export type RawPIIMap = Map<string, string>;
|
|
21
|
+
/**
|
|
22
|
+
* Tagging result (the return type of tagEntities)
|
|
23
|
+
*/
|
|
24
|
+
export interface TaggingResult {
|
|
25
|
+
/** Anonymized text with placeholder tags */
|
|
26
|
+
anonymizedText: string;
|
|
27
|
+
/** List of detected entities with assigned IDs */
|
|
28
|
+
entities: DetectedEntity[];
|
|
29
|
+
/** Raw PII map (type_id -> original), not yet encrypted */
|
|
30
|
+
piiMap: RawPIIMap;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Generates a PII placeholder tag
|
|
34
|
+
* Format: <PII type="TYPE" id="N"/> or <PII type="TYPE" gender="X" id="N"/> etc.
|
|
35
|
+
*
|
|
36
|
+
* Semantic attributes (gender, scope) are included when provided and not 'unknown'
*
* @param type - PII type written to the tag's `type` attribute
* @param id - Entity ID written to the tag's `id` attribute
* @param semantic - Optional semantic attributes (gender, scope) to embed in the tag
* @returns The placeholder tag string
|
|
37
|
+
*/
|
|
38
|
+
export declare function generateTag(type: PIIType, id: number, semantic?: SemanticAttributes): string;
|
|
39
|
+
/**
|
|
40
|
+
* Result of parsing a PII tag (returned by parseTag)
|
|
41
|
+
*/
|
|
42
|
+
export interface ParsedTag {
|
|
43
|
+
/** PII type extracted from the tag */
type: PIIType;
|
|
44
|
+
/** Entity ID extracted from the tag */
id: number;
|
|
45
|
+
/** Semantic attributes extracted from the tag; optional */
semantic?: SemanticAttributes;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Parses a PII tag to extract type, id, and semantic attributes
|
|
49
|
+
* Returns null if not a valid tag
|
|
50
|
+
*
|
|
51
|
+
* Supports formats:
|
|
52
|
+
* - <PII type="TYPE" id="N"/>
|
|
53
|
+
* - <PII type="TYPE" gender="X" id="N"/>
|
|
54
|
+
* - <PII type="TYPE" scope="X" id="N"/>
|
|
55
|
+
* - <PII type="TYPE" gender="X" scope="Y" id="N"/>
*
* @param tag - Tag string to parse
* @returns The parsed type, id and (when present) semantic attributes, or null if `tag` is not a valid tag
|
|
56
|
+
*/
|
|
57
|
+
export declare function parseTag(tag: string): ParsedTag | null;
|
|
58
|
+
/**
|
|
59
|
+
* Creates a key for the PII map
*
* @param type - PII type of the entity
* @param id - Entity ID
* @returns The key string; the exact format is not visible in this declaration file
* (the TaggingResult.piiMap comment describes keys as `type_id` — confirm in tagger.ts)
|
|
60
|
+
*/
|
|
61
|
+
export declare function createPIIMapKey(type: PIIType, id: number): string;
|
|
62
|
+
/**
|
|
63
|
+
* Tags PII spans in text and builds the PII map
*
* @param text - Text containing the PII spans
* @param matches - Span matches to tag
* @param policy - Anonymization policy; how it influences tagging is not visible in this declaration file
* @returns The anonymized text, the detected entities with assigned IDs, and the raw PII map
|
|
64
|
+
*/
|
|
65
|
+
export declare function tagEntities(text: string, matches: SpanMatch[], policy: AnonymizationPolicy): TaggingResult;
|
|
66
|
+
/**
|
|
67
|
+
* Validates that a tag is well-formed
*
* @param tag - Tag string to check
* @returns true if the tag is well-formed, false otherwise
|
|
68
|
+
*/
|
|
69
|
+
export declare function isValidTag(tag: string): boolean;
|
|
70
|
+
/**
|
|
71
|
+
* Tag extraction result with the matched text for accurate replacement
|
|
72
|
+
*/
|
|
73
|
+
export interface ExtractedTag {
|
|
74
|
+
/** PII type extracted from the tag */
type: PIIType;
|
|
75
|
+
/** Entity ID extracted from the tag */
id: number;
|
|
76
|
+
/** NOTE(review): presumably the offset of the match within the anonymized text — confirm in tagger.ts */
position: number;
|
|
77
|
+
/** The actual matched text (needed for replacement when tag is mangled) */
|
|
78
|
+
matchedText: string;
|
|
79
|
+
/** Semantic attributes extracted from the tag */
|
|
80
|
+
semantic?: SemanticAttributes;
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Extracts all PII tags from anonymized text using fuzzy matching
|
|
84
|
+
* Handles mangled tags that may occur after translation
|
|
85
|
+
*
|
|
86
|
+
* Translation can mangle tags by:
|
|
87
|
+
* - Changing quote types (straight " replaced by other quote characters such as „ or «)
|
|
88
|
+
* - Adding/removing whitespace
|
|
89
|
+
* - Changing case (type → Type, PII → pii)
|
|
90
|
+
* - Reordering attributes (id before type)
|
|
91
|
+
* - Modifying self-closing syntax (/> → / > or >)
*
* @param anonymizedText - Text containing PII tags (possibly mangled)
* @returns One ExtractedTag per tag found, each carrying the text that was actually matched
|
|
92
|
+
*/
|
|
93
|
+
export declare function extractTags(anonymizedText: string): ExtractedTag[];
|
|
94
|
+
/**
|
|
95
|
+
* Extracts tags using strict matching (original behavior)
|
|
96
|
+
* Useful when you know tags haven't been mangled
|
|
97
|
+
* Supports optional gender and scope attributes
*
* @param anonymizedText - Text containing PII tags
* @returns One ExtractedTag per tag found by strict matching
|
|
98
|
+
*/
|
|
99
|
+
export declare function extractTagsStrict(anonymizedText: string): ExtractedTag[];
|
|
100
|
+
/**
|
|
101
|
+
* Counts entities by type
*
* @param entities - Detected entities to count
* @returns A record keyed by PII type whose values are the per-type counts
|
|
102
|
+
*/
|
|
103
|
+
export declare function countEntitiesByType(entities: DetectedEntity[]): Record<PIIType, number>;
|
|
104
|
+
/**
|
|
105
|
+
* Rehydrates anonymized text using the PII map
|
|
106
|
+
* Uses fuzzy matching to handle tags that may have been mangled by translation
|
|
107
|
+
*
|
|
108
|
+
* @param anonymizedText - Text containing PII tags (possibly mangled)
|
|
109
|
+
* @param piiMap - Map of PII keys to original values
|
|
110
|
+
* @param strict - If true, use strict matching (original behavior) instead of fuzzy matching. Default: false
|
|
111
|
+
* @returns Text with PII tags replaced by original values
*
* NOTE(review): what happens to a tag whose key is missing from piiMap is not visible
* in this declaration file — confirm in tagger.ts.
|
|
112
|
+
*/
|
|
113
|
+
export declare function rehydrate(anonymizedText: string, piiMap: RawPIIMap, strict?: boolean): string;
|
|
114
|
+
//# sourceMappingURL=tagger.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tagger.d.ts","sourceRoot":"","sources":["../../src/pipeline/tagger.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EACL,OAAO,EACP,SAAS,EACT,cAAc,EACd,mBAAmB,EACnB,kBAAkB,EACnB,MAAM,mBAAmB,CAAC;AAG3B;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,eAAe;IACf,IAAI,EAAE,OAAO,CAAC;IACd,gBAAgB;IAChB,EAAE,EAAE,MAAM,CAAC;IACX,oBAAoB;IACpB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;AAE5C;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,4CAA4C;IAC5C,cAAc,EAAE,MAAM,CAAC;IACvB,kDAAkD;IAClD,QAAQ,EAAE,cAAc,EAAE,CAAC;IAC3B,wCAAwC;IACxC,MAAM,EAAE,SAAS,CAAC;CACnB;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CACzB,IAAI,EAAE,OAAO,EACb,EAAE,EAAE,MAAM,EACV,QAAQ,CAAC,EAAE,kBAAkB,GAC5B,MAAM,CAcR;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,IAAI,EAAE,OAAO,CAAC;IACd,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,CAAC,EAAE,kBAAkB,CAAC;CAC/B;AAED;;;;;;;;;GASG;AACH,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,SAAS,GAAG,IAAI,CA+CtD;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,GAAG,MAAM,CAEjE;AAED;;GAEG;AACH,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,SAAS,EAAE,EACpB,MAAM,EAAE,mBAAmB,GAC1B,aAAa,CA8Ef;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAE/C;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,OAAO,CAAC;IACd,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,MAAM,CAAC;IACjB,2EAA2E;IAC3E,WAAW,EAAE,MAAM,CAAC;IACpB,iDAAiD;IACjD,QAAQ,CAAC,EAAE,kBAAkB,CAAC;CAC/B;AA4DD;;;;;;;;;;GAUG;AACH,wBAAgB,WAAW,CAAC,cAAc,EAAE,MAAM,GAAG,YAAY,EAAE,CAyFlE;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,cAAc,EAAE,MAAM,GAAG,YAAY,EAAE,CAqDxE;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,QAAQ,EAAE,cAAc,EAAE,GACzB,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,CAczB;AAED;;;;;;;;GAQG;AACH,wBAAgB,SAAS,CACvB,cAAc,EAAE,MAAM,EACtB,MAAM,EAAE,SAAS,EACjB,MAAM,GAAE,OAAe,GACtB,MAAM,CAyBR"}
|