netra-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of netra-sdk might be problematic. Click here for more details.
- netra/__init__.py +148 -0
- netra/anonymizer/__init__.py +7 -0
- netra/anonymizer/anonymizer.py +79 -0
- netra/anonymizer/base.py +159 -0
- netra/anonymizer/fp_anonymizer.py +182 -0
- netra/config.py +111 -0
- netra/decorators.py +167 -0
- netra/exceptions/__init__.py +6 -0
- netra/exceptions/injection.py +33 -0
- netra/exceptions/pii.py +46 -0
- netra/input_scanner.py +142 -0
- netra/instrumentation/__init__.py +257 -0
- netra/instrumentation/aiohttp/__init__.py +378 -0
- netra/instrumentation/aiohttp/version.py +1 -0
- netra/instrumentation/cohere/__init__.py +446 -0
- netra/instrumentation/cohere/version.py +1 -0
- netra/instrumentation/google_genai/__init__.py +506 -0
- netra/instrumentation/google_genai/config.py +5 -0
- netra/instrumentation/google_genai/utils.py +31 -0
- netra/instrumentation/google_genai/version.py +1 -0
- netra/instrumentation/httpx/__init__.py +545 -0
- netra/instrumentation/httpx/version.py +1 -0
- netra/instrumentation/instruments.py +78 -0
- netra/instrumentation/mistralai/__init__.py +545 -0
- netra/instrumentation/mistralai/config.py +5 -0
- netra/instrumentation/mistralai/utils.py +30 -0
- netra/instrumentation/mistralai/version.py +1 -0
- netra/instrumentation/weaviate/__init__.py +121 -0
- netra/instrumentation/weaviate/version.py +1 -0
- netra/pii.py +757 -0
- netra/processors/__init__.py +4 -0
- netra/processors/session_span_processor.py +55 -0
- netra/processors/span_aggregation_processor.py +365 -0
- netra/scanner.py +104 -0
- netra/session.py +185 -0
- netra/session_manager.py +96 -0
- netra/tracer.py +99 -0
- netra/version.py +1 -0
- netra_sdk-0.1.0.dist-info/LICENCE +201 -0
- netra_sdk-0.1.0.dist-info/METADATA +573 -0
- netra_sdk-0.1.0.dist-info/RECORD +42 -0
- netra_sdk-0.1.0.dist-info/WHEEL +4 -0
netra/pii.py
ADDED
|
@@ -0,0 +1,757 @@
|
|
|
1
|
+
# File: netra/pii.py
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Pattern, Tuple, Union, cast
|
|
9
|
+
|
|
10
|
+
from netra import Netra
|
|
11
|
+
from netra.anonymizer import Anonymizer
|
|
12
|
+
from netra.exceptions import PIIBlockedException
|
|
13
|
+
|
|
14
|
+
# Built-in regexes, compiled once at import time. Each one backs a single
# entry of DEFAULT_PII_PATTERNS below.

# local-part@domain.tld, where the TLD has at least two letters.
EMAIL_PATTERN: Pattern[str] = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Ten digits grouped 3-3-4; each group may be separated by "-", "." or whitespace.
PHONE_PATTERN: Pattern[str] = re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b")
# 13 to 16 digits, each optionally followed by spaces or hyphens.
CREDIT_CARD_PATTERN: Pattern[str] = re.compile(r"\b(?:\d[ -]*?){13,16}\b")
# Digits grouped 3-2-4 with mandatory hyphens.
SSN_PATTERN: Pattern[str] = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")

# Label -> pattern mapping used when a regex detector is given no patterns.
# The label is what ends up inside the "[LABEL]" mask token.
DEFAULT_PII_PATTERNS: Dict[str, Pattern[str]] = dict(
    EMAIL=EMAIL_PATTERN,
    PHONE=PHONE_PATTERN,
    CREDIT_CARD=CREDIT_CARD_PATTERN,
    SSN=SSN_PATTERN,
)
|
|
25
|
+
|
|
26
|
+
# Entity type names requested from the Presidio analyzer when the caller does
# not supply an explicit list. Order is preserved from the original definition.
DEFAULT_ENTITIES: List[str] = [
    # Generic / global
    "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE",
    "IP_ADDRESS", "NRP", "LOCATION", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL",
    # United States
    "US_BANK_NUMBER", "US_DRIVER_LICENSE", "US_ITIN", "US_PASSPORT", "US_SSN",
    # United Kingdom
    "UK_NHS", "UK_NINO",
    # Spain
    "ES_NIF", "ES_NIE",
    # Italy
    "IT_FISCAL_CODE", "IT_DRIVER_LICENSE", "IT_VAT_CODE", "IT_PASSPORT", "IT_IDENTITY_CARD",
    # Poland
    "PL_PESEL",
    # Singapore
    "SG_NRIC_FIN", "SG_UEN",
    # Australia
    "AU_ABN", "AU_ACN", "AU_TFN", "AU_MEDICARE",
    # India
    "IN_PAN", "IN_AADHAAR", "IN_VEHICLE_REGISTRATION", "IN_VOTER", "IN_PASSPORT",
    # Finland
    "FI_PERSONAL_IDENTITY_CODE",
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True)
class PIIDetectionResult:
    """
    Immutable outcome of a single detect() call.

    Field order is part of the constructor signature; see the per-field
    comments below for the meaning of each value.
    """

    # True when at least one PII match was found.
    has_pii: bool = False
    # PII label -> number of occurrences.
    pii_entities: Dict[str, int] = field(default_factory=dict)
    # Input with PII replaced by mask tokens: a string for a single input,
    # a list (of dicts or strings) for list inputs, or None when nothing was masked.
    masked_text: Optional[Union[str, List[Dict[str, str]], List[Any]]] = None
    # The value that was inspected, in whatever shape the caller passed it.
    original_text: Optional[Union[str, List[Dict[str, str]], List[str], List[Any]]] = None
    # Whether the input was blocked. Results built in this module always carry
    # False here; the BLOCK action raises PIIBlockedException instead.
    is_blocked: bool = False
    # True when masked_text holds a masked version of the input.
    is_masked: bool = False
    # Action type -> list of PII labels the action applies to.
    pii_actions: Dict[Any, List[str]] = field(default_factory=dict)
    # Hashed entity value -> original value, as reported by the detector.
    hashed_entities: Dict[str, str] = field(default_factory=dict)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class PIIDetector(ABC):
|
|
97
|
+
"""
|
|
98
|
+
Abstract base for all PII detectors. Provides common iteration/
|
|
99
|
+
aggregation logic, while requiring subclasses to implement _detect_pii().
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
def __init__(self, action_type: Literal["BLOCK", "FLAG", "MASK"] = "FLAG") -> None:
    """
    Remember which action to apply when PII is found.

    Args:
        action_type: One of:
            - "BLOCK": detect() re-raises PIIBlockedException
            - "FLAG": report the PII without masking (default)
            - "MASK": report the PII together with the masked text
    """
    # Stored as-is; no validation is performed on the value.
    self._action_type: Literal["BLOCK", "FLAG", "MASK"] = action_type
|
|
113
|
+
|
|
114
|
+
@abstractmethod
def _detect_pii(self, text: str) -> Tuple[bool, Counter[str], str, Dict[str, str]]:
    """
    Backend hook: analyse one piece of text for PII.

    Concrete detectors implement this; the base implementation has no body.

    Args:
        text: The text to inspect.

    Returns:
        A 4-tuple ``(has_pii, counts, masked_text, entities)`` where
        ``counts`` maps PII label to number of matches.
    """
|
|
125
|
+
|
|
126
|
+
def _preprocess(self, text: str) -> str:
    """
    Normalise input before it is handed to the detection backend.

    Args:
        text: The raw input. Expected to be a string, but other values
            are tolerated.

    Returns:
        The string with surrounding whitespace removed. ``None`` becomes
        ``""``; any other non-string value is returned as ``str(value)``
        without stripping.
    """
    if isinstance(text, str):
        return text.strip()

    # Non-string input: stringify as-is, mapping None to the empty string.
    return "" if text is None else str(text)
|
|
143
|
+
|
|
144
|
+
def _mask_spans(self, text: str, spans: Dict[str, List[Tuple[int, int]]]) -> str:
    """
    Replace every PII span in ``text`` with a ``[LABEL]`` token.

    Spans reported under different labels may overlap or nest (for example
    a URL found inside an e-mail address). They are resolved in one
    left-to-right pass so that indices always refer to the original text:
    a span lying entirely inside an already-masked region is dropped, and a
    span that only partially overlaps one is clipped to its uncovered tail.
    For non-overlapping spans the result is the plain substitution.

    Args:
        text: The original text containing PII.
        spans: Dictionary mapping PII label to list of (start, end) spans.

    Returns:
        Text with PII spans replaced by mask tokens.
    """
    # Earliest start first; among equal starts the longest span supplies the label.
    ordered = sorted(
        ((start, end, label) for label, span_list in spans.items() for start, end in span_list),
        key=lambda span: (span[0], -span[1]),
    )

    pieces: List[str] = []
    cursor = 0  # index in ``text`` up to which output has already been emitted
    for start, end, label in ordered:
        if start < cursor:
            if end <= cursor:
                # Fully covered by a mask that was already written.
                continue
            # Partial overlap: mask only the part that is still exposed.
            start = cursor
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        cursor = end

    pieces.append(text[cursor:])
    return "".join(pieces)
|
|
171
|
+
|
|
172
|
+
def detect(self, input_data: Union[str, List[Dict[str, str]], List[str], List[Any]]) -> PIIDetectionResult:
    """
    Public entry point for PII detection.

    Supported inputs:
        1. A single string
        2. A list of dictionaries with string values (e.g. chat messages)
        3. A list of strings
        4. A list of LangChain BaseMessage objects (detected by duck typing)

    Args:
        input_data: The input data to detect PII in

    Returns:
        PIIDetectionResult: The detection result containing PII information.
        When processing raises PIIBlockedException, the value (or exception)
        produced by _handle_pii_exception is used instead.
    """
    try:
        outcome = self._process_input_data(input_data)
    except PIIBlockedException as blocked:
        # PII was found; the handler decides between returning and re-raising.
        return self._handle_pii_exception(blocked)
    return outcome
|
|
190
|
+
|
|
191
|
+
def _process_input_data(
    self, input_data: Union[str, List[Dict[str, str]], List[str], List[Any]]
) -> PIIDetectionResult:
    """
    Dispatch on the top-level type of the input.

    Args:
        input_data: The input data to detect PII in

    Returns:
        PIIDetectionResult: The detection result containing PII information

    Raises:
        ValueError: If the input is neither a string nor a list
    """
    if isinstance(input_data, list):
        return self._process_list_input(input_data)

    if isinstance(input_data, str):
        return self._detect_single_message(input_data)

    kind = type(input_data).__name__
    raise ValueError(f"Unsupported input type: {kind}")
|
|
213
|
+
|
|
214
|
+
def _process_list_input(self, input_list: List[Any]) -> PIIDetectionResult:
    """
    Route a list to the handler that matches its element type.

    Only the first element is inspected to decide how the whole list is
    treated.

    Args:
        input_list: List of items to process

    Returns:
        PIIDetectionResult: The detection result containing PII information.
        An empty list yields a default (PII-free) result.

    Raises:
        ValueError: If the first item is not a dict, a string, or a
            LangChain-like message
    """
    if not input_list:
        return PIIDetectionResult(original_text=input_list)

    head = input_list[0]

    if isinstance(head, dict):
        chat = cast(List[Dict[str, str]], input_list)
        return self._detect_chat_messages(chat)
    elif isinstance(head, str):
        strings = cast(List[str], input_list)
        return self._detect_string_list(strings)
    elif self._is_langchain_message(head):
        return self._process_langchain_messages(input_list)

    raise ValueError(f"Unsupported input type in list: {type(head).__name__}")
|
|
242
|
+
|
|
243
|
+
def _is_langchain_message(self, item: Any) -> bool:
    """
    Duck-type check for a LangChain BaseMessage-like object.

    Args:
        item: The item to check

    Returns:
        True if the item exposes both a ``content`` and a ``type`` attribute
    """
    required_attrs = ("content", "type")
    return all(hasattr(item, attr) for attr in required_attrs)
|
|
254
|
+
|
|
255
|
+
def _process_langchain_messages(self, messages: List[Any]) -> PIIDetectionResult:
    """
    Run detection over the ``content`` of LangChain BaseMessage-like objects.

    Items without a ``content`` attribute are skipped; the remaining
    contents are processed as a list of strings.

    Args:
        messages: List of LangChain BaseMessage-like objects

    Returns:
        PIIDetectionResult: The detection result containing PII information
    """
    contents = []
    for msg in messages:
        if hasattr(msg, "content"):
            contents.append(msg.content)
    return self._detect_string_list(contents)
|
|
267
|
+
|
|
268
|
+
def _handle_pii_exception(self, exception: PIIBlockedException) -> PIIDetectionResult:
    """
    Turn a PIIBlockedException into the outcome for the configured action.

    A "pii_detected" custom event is recorded through Netra for every
    action type, before the action itself is applied.

    Args:
        exception: The PIIBlockedException that was raised

    Returns:
        PIIDetectionResult: Result for the FLAG or MASK action

    Raises:
        PIIBlockedException: Re-raised if action type is BLOCK
    """
    actions = self._create_pii_actions(exception)
    event_attributes = self._build_trace_attributes(exception, actions)

    # Record the event first so that blocked requests are traced as well.
    Netra.set_custom_event(event_name="pii_detected", attributes=event_attributes)

    if self._action_type != "BLOCK":
        return self._create_detection_result(exception, actions)

    raise exception
|
|
292
|
+
|
|
293
|
+
def _create_pii_actions(self, exception: PIIBlockedException) -> Dict[str, List[str]]:
    """
    Build the single-entry ``pii_actions`` mapping for this detector.

    Args:
        exception: The PIIBlockedException containing detected entities

    Returns:
        Dictionary with the configured action type as its only key, mapped
        to the list of detected PII labels
    """
    detected_labels = list(exception.pii_entities.keys())
    actions: Dict[str, List[str]] = {self._action_type: detected_labels}
    return actions
|
|
304
|
+
|
|
305
|
+
def _build_trace_attributes(
    self, exception: PIIBlockedException, pii_actions: Dict[str, List[str]]
) -> Dict[str, Any]:
    """
    Assemble the attribute payload for the PII detection trace event.

    Entity counts and actions are JSON-encoded strings. ``masked_text`` is
    included only when the action type is "MASK".

    Args:
        exception: The PIIBlockedException containing PII information
        pii_actions: Dictionary of PII actions to be taken

    Returns:
        Dictionary of attributes for the trace event
    """
    action = self._action_type
    attributes: Dict[str, Any] = dict(
        has_pii=exception.has_pii,
        pii_entities=json.dumps(exception.pii_entities),
        is_blocked=(action == "BLOCK"),
        is_masked=(action == "MASK"),
        pii_actions=json.dumps(pii_actions),
    )

    if attributes["is_masked"]:
        # Only the masked form of the text is ever attached to the event.
        attributes["masked_text"] = self._serialize_masked_text(exception.masked_text)

    return attributes
|
|
331
|
+
|
|
332
|
+
def _serialize_masked_text(self, masked_text: Any) -> str:
    """
    Convert masked text into a string usable as a trace attribute.

    Args:
        masked_text: The masked text in various possible formats

    Returns:
        JSON for dicts and lists, ``str(masked_text)`` for anything else
    """
    serializer = json.dumps if isinstance(masked_text, (dict, list)) else str
    return serializer(masked_text)
|
|
345
|
+
|
|
346
|
+
def _create_detection_result(
    self, exception: PIIBlockedException, pii_actions: Dict[str, List[str]]
) -> PIIDetectionResult:
    """
    Build the result returned to the caller for a non-blocking action.

    The two outcomes differ only in masking: with "MASK" the result carries
    the exception's masked text and ``is_masked=True``; otherwise
    ``masked_text`` is None and ``is_masked`` is False.

    Args:
        exception: The PIIBlockedException containing PII information
        pii_actions: Dictionary of PII actions taken

    Returns:
        PIIDetectionResult with appropriate fields set based on action type
    """
    masking = self._action_type == "MASK"
    return PIIDetectionResult(
        has_pii=exception.has_pii,
        pii_entities=exception.pii_entities,
        original_text=exception.original_text,
        pii_actions=pii_actions,
        masked_text=exception.masked_text if masking else None,
        is_blocked=False,
        is_masked=masking,
        hashed_entities=exception.hashed_entities,
    )
|
|
382
|
+
|
|
383
|
+
def _detect_single_message(self, text: str) -> PIIDetectionResult:
    """
    Run the backend on one string and report the outcome.

    Args:
        text: The text to detect PII in

    Returns:
        PIIDetectionResult: A PII-free result when nothing was detected

    Raises:
        PIIBlockedException: Whenever PII is detected, regardless of the
            action type; it carries the counts, masked text and original text
    """
    has_pii, counts, masked_text, entities = self._detect_pii(text)

    if not has_pii:
        # Nothing detected: no masked text and no actions to report.
        return PIIDetectionResult(
            has_pii=has_pii,
            pii_entities={},
            masked_text=None,
            original_text=text,
            is_blocked=False,
            is_masked=False,
            pii_actions={},
            hashed_entities=entities,
        )

    detected_labels = list(counts.keys())
    raise PIIBlockedException(
        message="PII detected; blocking enabled.",
        has_pii=has_pii,
        pii_entities=dict(counts),
        masked_text=masked_text,
        pii_actions={self._action_type: detected_labels},
        is_blocked=True,
        original_text=text,
        hashed_entities=entities,
    )
|
|
419
|
+
|
|
420
|
+
def _detect_chat_messages(self, chat_messages: List[Dict[str, str]]) -> PIIDetectionResult:
    """
    Detect PII in a list of chat messages.

    Each message's "content" value is scanned (missing -> "") and its
    "role" is carried over to the masked copy (missing -> "unknown").

    Args:
        chat_messages: List of chat message dictionaries with 'role' and 'content' keys

    Returns:
        PIIDetectionResult: A PII-free result when no message contains PII

    Raises:
        PIIBlockedException: If any message contains PII. It carries the
            aggregated counts, the per-message masked list and the original
            messages, so results derived from it keep ``original_text``.
    """
    overall_has_pii = False
    total_counts: Counter[str] = Counter()
    masked_list: List[Dict[str, str]] = []
    merged_hashed_entities: Dict[str, str] = {}

    for message in chat_messages:
        role = message.get("role", "unknown")
        text = message.get("content", "")

        try:
            self._detect_single_message(text)
        except PIIBlockedException as e:
            # PII was detected in this message: fold it into the aggregate.
            overall_has_pii = True
            total_counts.update(e.pii_entities)
            merged_hashed_entities.update(e.hashed_entities)
            # Convert masked_text to string if it's not already to prevent type errors
            masked_text_str = str(e.masked_text) if e.masked_text is not None else ""
            masked_list.append({"role": role, "content": masked_text_str})
        else:
            # No PII in this message: keep its content unchanged.
            masked_list.append({"role": role, "content": text})

    if overall_has_pii:
        pii_actions = {self._action_type: list(total_counts.keys())}
        raise PIIBlockedException(
            message="PII detected in one or more messages; blocking enabled.",
            has_pii=overall_has_pii,
            pii_entities=dict(total_counts),
            masked_text=masked_list,
            pii_actions=pii_actions,
            is_blocked=True,
            # Previously omitted, which left original_text unset on the result.
            original_text=chat_messages,
            hashed_entities=merged_hashed_entities,
        )

    return PIIDetectionResult(
        has_pii=False,
        pii_entities={},
        masked_text=None,
        original_text=chat_messages,
        is_blocked=False,
        is_masked=False,
        pii_actions={},  # No PII detected, so no actions needed
        hashed_entities={},
    )
|
|
476
|
+
|
|
477
|
+
def _detect_string_list(self, string_list: List[str]) -> PIIDetectionResult:
    """
    Detect PII in a list of strings.

    Args:
        string_list: List of strings to detect PII in

    Returns:
        PIIDetectionResult: A PII-free result when no string contains PII

    Raises:
        PIIBlockedException: If any string contains PII. It carries the
            aggregated counts, the per-item masked list and the original
            list, so results derived from it keep ``original_text``.
    """
    overall_has_pii = False
    total_counts: Counter[str] = Counter()
    masked_list: List[str] = []
    merged_hashed_entities: Dict[str, str] = {}

    for text in string_list:
        try:
            self._detect_single_message(text)
        except PIIBlockedException as e:
            # PII was detected in this string: fold it into the aggregate.
            overall_has_pii = True
            total_counts.update(e.pii_entities)
            merged_hashed_entities.update(e.hashed_entities)
            # Ensure we're appending a string to the string list
            masked_text_str = str(e.masked_text) if e.masked_text is not None else ""
            masked_list.append(masked_text_str)
        else:
            # No PII in this string: keep it unchanged.
            masked_list.append(text)

    if overall_has_pii:
        pii_actions = {self._action_type: list(total_counts.keys())}
        raise PIIBlockedException(
            message="PII detected in one or more messages; blocking enabled.",
            has_pii=overall_has_pii,
            pii_entities=dict(total_counts),
            masked_text=masked_list,
            pii_actions=pii_actions,
            is_blocked=True,
            # Previously omitted, which left original_text unset on the result.
            original_text=string_list,
            hashed_entities=merged_hashed_entities,
        )

    return PIIDetectionResult(
        has_pii=False,
        pii_entities={},
        masked_text=None,
        original_text=string_list,
        is_blocked=False,
        is_masked=False,
        pii_actions={},  # No PII detected, so no actions needed
        hashed_entities={},
    )
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
class RegexPIIDetector(PIIDetector):
    """
    Regex-based PII detector. Implements _detect_pii() by running every
    configured pattern over the (preprocessed) text.
    """

    def __init__(
        self,
        patterns: Optional[Dict[str, Pattern[str]]] = None,
        action_type: Optional[Literal["BLOCK", "FLAG", "MASK"]] = "MASK",
    ) -> None:
        """
        Initialize the regex detector.

        Args:
            patterns: Mapping of PII label -> compiled pattern. A missing or
                empty mapping selects DEFAULT_PII_PATTERNS.
            action_type: Action to take when PII is detected ("MASK" by
                default). Passing None explicitly reads the NETRA_ACTION_TYPE
                environment variable instead (default "MASK"); an
                unrecognized value there falls back to "FLAG".
        """
        if action_type is None:
            env_action = os.getenv("NETRA_ACTION_TYPE", "MASK")
            # Ensure action_type is one of the valid literal values
            if env_action not in ("BLOCK", "FLAG", "MASK"):
                env_action = "FLAG"
            action_type = cast(Literal["BLOCK", "FLAG", "MASK"], env_action)
        super().__init__(action_type=action_type)
        self.patterns: Dict[str, Pattern[str]] = patterns or DEFAULT_PII_PATTERNS

    def _detect_pii(self, text: str) -> Tuple[bool, Counter[str], str, Dict[str, str]]:
        """
        Detect PII in a single message.

        Args:
            text: The text to detect PII in

        Returns:
            Tuple of (has_pii, counts, masked_text, entities). ``entities``
            is always empty for this detector, and ``masked_text`` is based
            on the preprocessed text.
        """
        text = self._preprocess(text)  # trim & normalize
        if not text:
            return False, Counter(), "", {}

        spans: Dict[str, List[Tuple[int, int]]] = {}
        counts: Counter[str] = Counter()

        for label, pattern in self.patterns.items():
            label_spans = [match.span() for match in pattern.finditer(text)]
            if label_spans:
                counts[label] = len(label_spans)
                spans[label] = label_spans

        has_pii_local = bool(counts)
        masked = self._mask_spans(text, spans) if has_pii_local else text
        # The regex backend does not hash values, so there is nothing to map.
        entities: Dict[str, str] = {}

        return has_pii_local, counts, masked, entities
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
class PresidioPIIDetector(PIIDetector):
    """
    Presidio-based PII detector. Implements _detect_pii() by running
    Presidio's AnalyzerEngine and then the Anonymizer on a string.

    Examples:
        # Using default hash function
        detector = PresidioPIIDetector()
        result = detector.detect("My email is john@example.com")

        # Using custom hash function
        import hashlib
        def custom_hash(text: str) -> str:
            return hashlib.sha256(text.encode()).hexdigest()[:8]

        detector = PresidioPIIDetector(
            hash_function=custom_hash,
            anonymizer_cache_size=500,
            action_type="MASK",
            score_threshold=0.8
        )
    """

    def __init__(
        self,
        entities: Optional[List[str]] = None,
        language: str = "en",
        score_threshold: float = 0.6,
        action_type: Optional[Literal["BLOCK", "FLAG", "MASK"]] = None,
        anonymizer_cache_size: int = 1000,
        hash_function: Optional[Callable[[str], str]] = None,
    ) -> None:
        """
        Initialize the Presidio detector.

        Args:
            entities: Entity types to detect; a missing or empty list selects
                DEFAULT_ENTITIES.
            language: Language code passed to the analyzer.
            score_threshold: Score threshold passed to the analyzer.
            action_type: Action to take when PII is detected. When None, the
                NETRA_ACTION_TYPE environment variable is used if it holds a
                valid value, otherwise "FLAG".
            anonymizer_cache_size: Passed to the Anonymizer as ``cache_size``.
            hash_function: Passed to the Anonymizer as ``hash_function``.

        Raises:
            ImportError: If presidio-analyzer is not installed.
        """
        if action_type is None:
            env_action = os.getenv("NETRA_ACTION_TYPE", "FLAG")
            # An unrecognized environment value degrades to "FLAG".
            resolved = env_action if env_action in ("BLOCK", "FLAG", "MASK") else "FLAG"
            action_type = cast(Literal["BLOCK", "FLAG", "MASK"], resolved)
        super().__init__(action_type=action_type)

        try:
            from presidio_analyzer import AnalyzerEngine  # noqa: F401
        except ImportError as exc:
            raise ImportError("Presidio-based PII detection requires: presidio-analyzer. " "Install via pip.") from exc

        self.language: str = language
        self.entities: Optional[List[str]] = entities or DEFAULT_ENTITIES
        self.score_threshold: float = score_threshold

        self.analyzer = AnalyzerEngine()
        self.anonymizer = Anonymizer(hash_function=hash_function, cache_size=anonymizer_cache_size)

    def _detect_pii(self, text: str) -> Tuple[bool, Counter[str], str, Dict[str, str]]:
        """
        Detect PII in a single message.

        Args:
            text: The text to detect PII in

        Returns:
            Tuple of (has_pii, counts, masked_text, entities)
        """
        cleaned = self._preprocess(text)
        if not cleaned:
            return False, Counter(), "", {}

        findings = self.analyzer.analyze(
            text=cleaned,
            language=self.language,
            entities=self.entities,
            score_threshold=self.score_threshold,
        )

        counts = Counter(finding.entity_type for finding in findings)
        if not counts:
            # Nothing found: the text is returned unmasked.
            return False, counts, cleaned, {}

        hashed: Dict[str, str] = {}
        try:
            anonymized = self.anonymizer.anonymize(text=cleaned, analyzer_results=findings)
            masked = anonymized.masked_text
            hashed = anonymized.entities
        except Exception:
            # Best-effort fallback: if anonymization fails for any reason,
            # mask with plain "[LABEL]" tokens built from the analyzer spans.
            fallback_spans: Dict[str, List[Tuple[int, int]]] = {}
            for finding in findings:
                fallback_spans.setdefault(finding.entity_type, []).append((finding.start, finding.end))
            masked = self._mask_spans(cleaned, fallback_spans)

        return True, counts, masked, hashed
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def get_default_detector(
    action_type: Optional[Literal["BLOCK", "FLAG", "MASK"]] = None,
    entities: Optional[List[str]] = None,
    hash_function: Optional[Callable[[str], str]] = None,
) -> PIIDetector:
    """
    Build the default PII detector, which is Presidio-based.

    For a regex-based detector, construct RegexPIIDetector directly.

    Args:
        action_type: Action to take when PII is detected. Options are:
            - "BLOCK": Raise PIIBlockedException when PII is detected
            - "FLAG": Detect PII but don't block or mask
            - "MASK": Replace PII with mask tokens
            When None, PresidioPIIDetector resolves it from the
            NETRA_ACTION_TYPE environment variable, falling back to "FLAG".
        entities: Optional list of entity types to detect. If None, DEFAULT_ENTITIES is used.
        hash_function: Optional custom hash function for anonymization, forwarded to PresidioPIIDetector.

    Returns:
        A new PresidioPIIDetector instance.
    """
    detector = PresidioPIIDetector(
        entities=entities,
        action_type=action_type,
        hash_function=hash_function,
    )
    return detector
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
# ---------------------------------------------------------------------------- #
|
|
698
|
+
# EXAMPLE USAGE #
|
|
699
|
+
# ---------------------------------------------------------------------------- #
|
|
700
|
+
# from netra.pii import RegexPIIDetector, get_default_detector
|
|
701
|
+
# from netra.exceptions.pii import PIIBlockedException
|
|
702
|
+
#
|
|
703
|
+
# # Create a regex-based detector that blocks on any PII found:
|
|
704
|
+
# regex_detector = RegexPIIDetector(action_type="BLOCK")
|
|
705
|
+
# try:
|
|
706
|
+
# result = regex_detector.detect("My email is sooraj@example.com")
|
|
707
|
+
# # With action_type="BLOCK", detect() raises when PII is found, so code won't reach here
|
|
708
|
+
# print(f"PII detected: {result.has_pii}, Entities: {result.pii_entities}")
|
|
709
|
+
# except PIIBlockedException as e:
|
|
710
|
+
# # Access structured information about the PII detection
|
|
711
|
+
# print(f"Blocked: {e.is_blocked}, Entities found: {e.pii_entities}")
|
|
712
|
+
# print(f"Masked version: {e.masked_text}")
|
|
713
|
+
#
|
|
714
|
+
# # Or get the default (Presidio-based) detector:
|
|
715
|
+
# default_detector = get_default_detector(action_type="FLAG")
|
|
716
|
+
# pii_info = default_detector.detect("Call me at 123-456-7890")
|
|
717
|
+
# print(f"Found PII types: {list(pii_info.pii_entities.keys())}")
|
|
718
|
+
#
|
|
719
|
+
# # Using custom hash function with get_default_detector:
|
|
720
|
+
# import hashlib
|
|
721
|
+
# def custom_hash(text: str) -> str:
|
|
722
|
+
# return hashlib.sha256(text.encode()).hexdigest()[:8]
|
|
723
|
+
#
|
|
724
|
+
# detector_with_custom_hash = get_default_detector(
|
|
725
|
+
# action_type="MASK",
|
|
726
|
+
# hash_function=custom_hash
|
|
727
|
+
# )
|
|
728
|
+
# result = detector_with_custom_hash.detect("My email is john@example.com")
|
|
729
|
+
# print(f"Masked text: {result.masked_text}")
|
|
730
|
+
# print(f"Entity mappings: {result.hashed_entities}")
|
|
731
|
+
#
|
|
732
|
+
# # Using PresidioPIIDetector directly with custom hash function:
|
|
733
|
+
# from netra.pii import PresidioPIIDetector
|
|
734
|
+
#
|
|
735
|
+
# def md5_hash(text: str) -> str:
|
|
736
|
+
# return hashlib.md5(text.encode()).hexdigest()[:10]
|
|
737
|
+
#
|
|
738
|
+
# presidio_detector = PresidioPIIDetector(
|
|
739
|
+
# hash_function=md5_hash,
|
|
740
|
+
# anonymizer_cache_size=500,
|
|
741
|
+
# action_type="MASK",
|
|
742
|
+
# score_threshold=0.8
|
|
743
|
+
# )
|
|
744
|
+
# result = presidio_detector.detect("Contact me at john.doe@company.com or 555-123-4567")
|
|
745
|
+
# print(f"Masked: {result.masked_text}")
|
|
746
|
+
# print(f"Entities: {result.hashed_entities}")
|
|
747
|
+
#
|
|
748
|
+
# # You can also manually raise the exception if needed
|
|
749
|
+
# if pii_info.has_pii:
|
|
750
|
+
# raise PIIBlockedException(
|
|
751
|
+
# message="Custom blocking message",
|
|
752
|
+
# has_pii=pii_info.has_pii,
|
|
753
|
+
# pii_entities=pii_info.pii_entities,
|
|
754
|
+
# masked_text=pii_info.masked_text,
|
|
755
|
+
# is_blocked=True,
|
|
756
|
+
# original_text=pii_info.original_text,
|
|
757
|
+
# )
|