netra-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of netra-sdk might be problematic. Click here for more details.

Files changed (42) hide show
  1. netra/__init__.py +148 -0
  2. netra/anonymizer/__init__.py +7 -0
  3. netra/anonymizer/anonymizer.py +79 -0
  4. netra/anonymizer/base.py +159 -0
  5. netra/anonymizer/fp_anonymizer.py +182 -0
  6. netra/config.py +111 -0
  7. netra/decorators.py +167 -0
  8. netra/exceptions/__init__.py +6 -0
  9. netra/exceptions/injection.py +33 -0
  10. netra/exceptions/pii.py +46 -0
  11. netra/input_scanner.py +142 -0
  12. netra/instrumentation/__init__.py +257 -0
  13. netra/instrumentation/aiohttp/__init__.py +378 -0
  14. netra/instrumentation/aiohttp/version.py +1 -0
  15. netra/instrumentation/cohere/__init__.py +446 -0
  16. netra/instrumentation/cohere/version.py +1 -0
  17. netra/instrumentation/google_genai/__init__.py +506 -0
  18. netra/instrumentation/google_genai/config.py +5 -0
  19. netra/instrumentation/google_genai/utils.py +31 -0
  20. netra/instrumentation/google_genai/version.py +1 -0
  21. netra/instrumentation/httpx/__init__.py +545 -0
  22. netra/instrumentation/httpx/version.py +1 -0
  23. netra/instrumentation/instruments.py +78 -0
  24. netra/instrumentation/mistralai/__init__.py +545 -0
  25. netra/instrumentation/mistralai/config.py +5 -0
  26. netra/instrumentation/mistralai/utils.py +30 -0
  27. netra/instrumentation/mistralai/version.py +1 -0
  28. netra/instrumentation/weaviate/__init__.py +121 -0
  29. netra/instrumentation/weaviate/version.py +1 -0
  30. netra/pii.py +757 -0
  31. netra/processors/__init__.py +4 -0
  32. netra/processors/session_span_processor.py +55 -0
  33. netra/processors/span_aggregation_processor.py +365 -0
  34. netra/scanner.py +104 -0
  35. netra/session.py +185 -0
  36. netra/session_manager.py +96 -0
  37. netra/tracer.py +99 -0
  38. netra/version.py +1 -0
  39. netra_sdk-0.1.0.dist-info/LICENCE +201 -0
  40. netra_sdk-0.1.0.dist-info/METADATA +573 -0
  41. netra_sdk-0.1.0.dist-info/RECORD +42 -0
  42. netra_sdk-0.1.0.dist-info/WHEEL +4 -0
netra/pii.py ADDED
@@ -0,0 +1,757 @@
1
+ # File: netra/pii.py
2
+ import json
3
+ import os
4
+ import re
5
+ from abc import ABC, abstractmethod
6
+ from collections import Counter
7
+ from dataclasses import dataclass, field
8
+ from typing import Any, Callable, Dict, List, Literal, Optional, Pattern, Tuple, Union, cast
9
+
10
+ from netra import Netra
11
+ from netra.anonymizer import Anonymizer
12
+ from netra.exceptions import PIIBlockedException
13
+
14
# Compiled regular expressions backing the dependency-free regex detector.
EMAIL_PATTERN: Pattern[str] = re.compile(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
)
# Ten digits grouped 3-3-4, each group optionally separated by "-", "." or whitespace.
PHONE_PATTERN: Pattern[str] = re.compile(
    r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"
)
# 13 to 16 digits, each optionally followed by spaces or hyphens.
CREDIT_CARD_PATTERN: Pattern[str] = re.compile(
    r"\b(?:\d[ -]*?){13,16}\b"
)
# Digits grouped 3-2-4 and separated by hyphens.
SSN_PATTERN: Pattern[str] = re.compile(
    r"\b\d{3}-\d{2}-\d{4}\b"
)

# Label -> compiled pattern table; insertion order is EMAIL, PHONE, CREDIT_CARD, SSN.
DEFAULT_PII_PATTERNS: Dict[str, Pattern[str]] = dict(
    EMAIL=EMAIL_PATTERN,
    PHONE=PHONE_PATTERN,
    CREDIT_CARD=CREDIT_CARD_PATTERN,
    SSN=SSN_PATTERN,
)

# Entity names passed to the analyzer when the caller does not supply a list.
DEFAULT_ENTITIES: List[str] = [
    # Entities without a region prefix
    "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IBAN_CODE", "IP_ADDRESS",
    "NRP", "LOCATION", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL",
    # Region-prefixed entities, grouped by prefix
    "US_BANK_NUMBER", "US_DRIVER_LICENSE", "US_ITIN", "US_PASSPORT", "US_SSN",
    "UK_NHS", "UK_NINO",
    "ES_NIF", "ES_NIE",
    "IT_FISCAL_CODE", "IT_DRIVER_LICENSE", "IT_VAT_CODE", "IT_PASSPORT", "IT_IDENTITY_CARD",
    "PL_PESEL",
    "SG_NRIC_FIN", "SG_UEN",
    "AU_ABN", "AU_ACN", "AU_TFN", "AU_MEDICARE",
    "IN_PAN", "IN_AADHAAR", "IN_VEHICLE_REGISTRATION", "IN_VOTER", "IN_PASSPORT",
    "FI_PERSONAL_IDENTITY_CODE",
]
66
+
67
+
68
@dataclass(frozen=True)
class PIIDetectionResult:
    """
    Immutable outcome of running PII detection on an input.

    Attributes:
        has_pii: Whether at least one PII match was found.
        pii_entities: Number of occurrences found per PII label.
        masked_text: The input with PII spans replaced, when masking was applied.
            A string for string input, a list for list input, otherwise None.
        original_text: The input that was analysed, in the shape it was supplied
            (string, list of strings, list of dicts, or other).
        is_blocked: Whether the input was blocked because PII was found.
        is_masked: Whether PII in the input was replaced with mask tokens.
        pii_actions: Action type mapped to the list of PII labels it applies to.
        hashed_entities: Hashed entity values mapped to their original values.
    """

    # Detection summary
    has_pii: bool = False
    pii_entities: Dict[str, int] = field(default_factory=dict)

    # Text payloads; both stay None unless explicitly provided
    masked_text: Optional[Union[str, List[Dict[str, str]], List[Any]]] = None
    original_text: Optional[Union[str, List[Dict[str, str]], List[str], List[Any]]] = None

    # Outcome flags
    is_blocked: bool = False
    is_masked: bool = False

    # Bookkeeping; each instance gets its own fresh dict
    pii_actions: Dict[Any, List[str]] = field(default_factory=dict)
    hashed_entities: Dict[str, str] = field(default_factory=dict)
94
+
95
+
96
class PIIDetector(ABC):
    """
    Abstract base for all PII detectors.

    Provides input-type dispatch, aggregation over list inputs and trace-event
    reporting. Subclasses implement ``_detect_pii()``.

    Internally a positive detection is signalled by raising
    ``PIIBlockedException`` from the ``_detect_*`` helpers. ``detect()`` catches
    it and either re-raises it (action type ``"BLOCK"``) or converts it into a
    ``PIIDetectionResult`` (``"FLAG"`` / ``"MASK"``).
    """

    def __init__(self, action_type: Literal["BLOCK", "FLAG", "MASK"] = "FLAG") -> None:
        """
        Initialize the PII detector.

        Args:
            action_type: Action to take when PII is detected. Options are:
                - "BLOCK": Raise PIIBlockedException when PII is detected
                - "FLAG": Detect PII but don't block or mask (default)
                - "MASK": Replace PII with mask tokens
        """
        self._action_type: Literal["BLOCK", "FLAG", "MASK"] = action_type

    @abstractmethod
    def _detect_pii(self, text: str) -> Tuple[bool, Counter[str], str, Dict[str, str]]:
        """
        Detect PII in a single piece of text.

        Args:
            text: The text to detect PII in

        Returns:
            Tuple of (has_pii, counts, masked_text, entities)
        """

    def _preprocess(self, text: str) -> str:
        """
        Normalize a value before PII detection.

        Args:
            text: The input to preprocess.

        Returns:
            ``text`` stripped of surrounding whitespace. Non-string values are
            converted with ``str()`` (not stripped); ``None`` becomes ``""``.
        """
        if not isinstance(text, str):
            return str(text) if text is not None else ""

        return text.strip()

    def _mask_spans(self, text: str, spans: Dict[str, List[Tuple[int, int]]]) -> str:
        """
        Replace identified PII spans in ``text`` with ``[LABEL]`` tokens.

        Spans are applied left to right. When spans overlap, the earliest
        (and, for equal starts, the longest) span wins; a later span only masks
        the part of its range that is not already covered, and is dropped if
        nothing remains. Every character inside any span is therefore masked
        exactly once.

        Args:
            text: The original text containing PII.
            spans: Dictionary mapping PII label to list of (start, end) spans.

        Returns:
            Text with PII spans replaced by mask tokens.
        """
        ordered = sorted(
            (
                (start, end, label)
                for label, span_list in spans.items()
                for start, end in span_list
            ),
            # Earliest start first; for equal starts, the widest span first.
            key=lambda span: (span[0], -span[1]),
        )

        pieces: List[str] = []
        cursor = 0  # index of the first character not yet emitted or masked
        for start, end, label in ordered:
            # Clip away any prefix an earlier mask has already covered.
            start = max(start, cursor)
            if start >= end:
                continue
            pieces.append(text[cursor:start])
            pieces.append(f"[{label}]")
            cursor = end
        pieces.append(text[cursor:])

        return "".join(pieces)

    def detect(
        self, input_data: Union[str, List[Dict[str, str]], List[str], List[Any]]
    ) -> "PIIDetectionResult":
        """
        Public entry point. Accepts either:
        1. A single string
        2. A list of dictionaries with string values (e.g. chat messages)
        3. A list of strings
        4. A list of LangChain BaseMessage objects (detected by duck typing)

        Args:
            input_data: The input data to detect PII in

        Returns:
            PIIDetectionResult: The detection result containing PII information

        Raises:
            PIIBlockedException: If PII is found and the action type is "BLOCK"
            ValueError: If the input type is not supported
        """
        try:
            return self._process_input_data(input_data)
        except PIIBlockedException as e:
            return self._handle_pii_exception(e)

    def _process_input_data(
        self, input_data: Union[str, List[Dict[str, str]], List[str], List[Any]]
    ) -> "PIIDetectionResult":
        """
        Route the input to the detection method matching its type.

        Args:
            input_data: The input data to detect PII in

        Returns:
            PIIDetectionResult: The detection result containing PII information

        Raises:
            ValueError: If input type is not supported
        """
        if isinstance(input_data, str):
            return self._detect_single_message(input_data)

        if isinstance(input_data, list):
            return self._process_list_input(input_data)

        raise ValueError(f"Unsupported input type: {type(input_data).__name__}")

    def _process_list_input(self, input_list: List[Any]) -> "PIIDetectionResult":
        """
        Route a list input based on the type of its first element.

        Args:
            input_list: List of items to process

        Returns:
            PIIDetectionResult: The detection result containing PII information

        Raises:
            ValueError: If list item type is not supported
        """
        if not input_list:
            return PIIDetectionResult(original_text=input_list)

        # Only the first element is inspected; the list is assumed homogeneous.
        first_item = input_list[0]

        if isinstance(first_item, dict):
            return self._detect_chat_messages(cast(List[Dict[str, str]], input_list))

        if isinstance(first_item, str):
            return self._detect_string_list(cast(List[str], input_list))

        if self._is_langchain_message(first_item):
            return self._process_langchain_messages(input_list)

        raise ValueError(f"Unsupported input type in list: {type(first_item).__name__}")

    def _is_langchain_message(self, item: Any) -> bool:
        """
        Check if an item is a LangChain BaseMessage-like object using duck typing.

        Args:
            item: The item to check

        Returns:
            True if the item has both a ``content`` and a ``type`` attribute
        """
        return hasattr(item, "content") and hasattr(item, "type")

    def _process_langchain_messages(self, messages: List[Any]) -> "PIIDetectionResult":
        """
        Process LangChain BaseMessage-like objects by extracting their content.

        Items without a ``content`` attribute are left out.

        Args:
            messages: List of LangChain BaseMessage-like objects

        Returns:
            PIIDetectionResult: The detection result containing PII information
        """
        contents = [msg.content for msg in messages if hasattr(msg, "content")]
        return self._detect_string_list(contents)

    def _handle_pii_exception(self, exception: "PIIBlockedException") -> "PIIDetectionResult":
        """
        Handle PIIBlockedException based on the configured action type.

        A "pii_detected" custom event is recorded for every action type before
        the exception is re-raised or converted.

        Args:
            exception: The PIIBlockedException that was raised

        Returns:
            PIIDetectionResult: Appropriate result based on action type

        Raises:
            PIIBlockedException: Re-raised if action type is BLOCK
        """
        pii_actions = self._create_pii_actions(exception)
        attributes = self._build_trace_attributes(exception, pii_actions)

        # Log the PII detection event
        Netra.set_custom_event(event_name="pii_detected", attributes=attributes)

        if self._action_type == "BLOCK":
            raise exception

        return self._create_detection_result(exception, pii_actions)

    def _create_pii_actions(self, exception: "PIIBlockedException") -> Dict[str, List[str]]:
        """
        Create pii_actions dictionary based on action type and detected entities.

        Args:
            exception: The PIIBlockedException containing detected entities

        Returns:
            Dictionary mapping action type to list of PII entity types
        """
        return {self._action_type: list(exception.pii_entities.keys())}

    def _build_trace_attributes(
        self, exception: "PIIBlockedException", pii_actions: Dict[str, List[str]]
    ) -> Dict[str, Any]:
        """
        Build attributes dictionary for tracing/logging the PII detection event.

        Args:
            exception: The PIIBlockedException containing PII information
            pii_actions: Dictionary of PII actions to be taken

        Returns:
            Dictionary of attributes for the trace event
        """
        attributes: Dict[str, Any] = {
            "has_pii": exception.has_pii,
            "pii_entities": json.dumps(exception.pii_entities),
            "is_blocked": self._action_type == "BLOCK",
            "is_masked": self._action_type == "MASK",
            "pii_actions": json.dumps(pii_actions),
        }

        # Add masked_text to attributes only for MASK action type
        if self._action_type == "MASK":
            attributes["masked_text"] = self._serialize_masked_text(exception.masked_text)

        return attributes

    def _serialize_masked_text(self, masked_text: Any) -> str:
        """
        Serialize masked text to string format for tracing attributes.

        Args:
            masked_text: The masked text in various possible formats

        Returns:
            JSON for dict/list values, ``str(masked_text)`` otherwise
        """
        if isinstance(masked_text, (dict, list)):
            return json.dumps(masked_text)
        return str(masked_text)

    def _create_detection_result(
        self, exception: "PIIBlockedException", pii_actions: Dict[str, List[str]]
    ) -> "PIIDetectionResult":
        """
        Create PIIDetectionResult based on action type and exception data.

        Args:
            exception: The PIIBlockedException containing PII information
            pii_actions: Dictionary of PII actions taken

        Returns:
            PIIDetectionResult carrying the masked text and ``is_masked=True``
            for the MASK action type; otherwise no masked text is exposed.
        """
        masking = self._action_type == "MASK"
        return PIIDetectionResult(
            has_pii=exception.has_pii,
            pii_entities=exception.pii_entities,
            original_text=exception.original_text,
            pii_actions=pii_actions,
            masked_text=exception.masked_text if masking else None,
            is_blocked=False,
            is_masked=masking,
            hashed_entities=exception.hashed_entities,
        )

    def _detect_single_message(self, text: str) -> "PIIDetectionResult":
        """
        Detect PII in a single message.

        Args:
            text: The text to detect PII in

        Returns:
            PIIDetectionResult: A "no PII" result

        Raises:
            PIIBlockedException: Whenever PII is found, for every action type;
                ``detect()`` decides how to surface it
        """
        has_pii, counts, masked_text, entities = self._detect_pii(text)

        if has_pii:
            raise PIIBlockedException(
                message="PII detected; blocking enabled.",
                has_pii=has_pii,
                pii_entities=dict(counts),
                masked_text=masked_text,
                pii_actions={self._action_type: list(counts.keys())},
                is_blocked=True,
                original_text=text,
                hashed_entities=entities,
            )

        return PIIDetectionResult(
            has_pii=has_pii,
            pii_entities={},
            masked_text=None,  # No PII detected, so no masked text needed
            original_text=text,
            is_blocked=False,
            is_masked=False,
            pii_actions={},  # No PII detected, so no actions needed
            hashed_entities=entities,
        )

    def _detect_chat_messages(self, chat_messages: List[Dict[str, str]]) -> "PIIDetectionResult":
        """
        Detect PII in a list of chat messages.

        Args:
            chat_messages: List of chat message dictionaries; the 'role' and
                'content' keys are read, defaulting to "unknown" and ""

        Returns:
            PIIDetectionResult: A "no PII" result

        Raises:
            PIIBlockedException: If any message contains PII. It carries the
                aggregated counts and a per-message masked list.
        """
        overall_has_pii = False
        total_counts: Counter[str] = Counter()
        masked_list: List[Dict[str, str]] = []
        merged_hashed_entities: Dict[str, str] = {}

        for message in chat_messages:
            role = message.get("role", "unknown")
            text = message.get("content", "")

            try:
                self._detect_single_message(text)
            except PIIBlockedException as e:
                overall_has_pii = True
                total_counts.update(e.pii_entities)
                merged_hashed_entities.update(e.hashed_entities)
                # Coerce to str so the masked list stays homogeneous.
                masked_text_str = str(e.masked_text) if e.masked_text is not None else ""
                masked_list.append({"role": role, "content": masked_text_str})
            else:
                # No PII in this message: keep it unchanged.
                masked_list.append({"role": role, "content": text})

        if overall_has_pii:
            raise PIIBlockedException(
                message="PII detected in one or more messages; blocking enabled.",
                has_pii=overall_has_pii,
                pii_entities=dict(total_counts),
                masked_text=masked_list,
                pii_actions={self._action_type: list(total_counts.keys())},
                is_blocked=True,
                # Keep the input so the resulting PIIDetectionResult can expose it.
                original_text=chat_messages,
                hashed_entities=merged_hashed_entities,
            )

        return PIIDetectionResult(
            has_pii=False,
            pii_entities={},
            masked_text=None,
            original_text=chat_messages,
            is_blocked=False,
            is_masked=False,
            pii_actions={},  # No PII detected, so no actions needed
            hashed_entities={},
        )

    def _detect_string_list(self, string_list: List[str]) -> "PIIDetectionResult":
        """
        Detect PII in a list of strings.

        Args:
            string_list: List of strings to detect PII in

        Returns:
            PIIDetectionResult: A "no PII" result

        Raises:
            PIIBlockedException: If any string contains PII. It carries the
                aggregated counts and a per-item masked list.
        """
        overall_has_pii = False
        total_counts: Counter[str] = Counter()
        masked_list: List[str] = []
        merged_hashed_entities: Dict[str, str] = {}

        for text in string_list:
            try:
                self._detect_single_message(text)
            except PIIBlockedException as e:
                overall_has_pii = True
                total_counts.update(e.pii_entities)
                merged_hashed_entities.update(e.hashed_entities)
                # Coerce to str so the masked list stays a list of strings.
                masked_list.append(str(e.masked_text) if e.masked_text is not None else "")
            else:
                # No PII in this item: keep it unchanged.
                masked_list.append(text)

        if overall_has_pii:
            raise PIIBlockedException(
                message="PII detected in one or more messages; blocking enabled.",
                has_pii=overall_has_pii,
                pii_entities=dict(total_counts),
                masked_text=masked_list,
                pii_actions={self._action_type: list(total_counts.keys())},
                is_blocked=True,
                # Keep the input so the resulting PIIDetectionResult can expose it.
                original_text=string_list,
                hashed_entities=merged_hashed_entities,
            )

        return PIIDetectionResult(
            has_pii=False,
            pii_entities={},
            masked_text=None,
            original_text=string_list,
            is_blocked=False,
            is_masked=False,
            pii_actions={},  # No PII detected, so no actions needed
            hashed_entities={},
        )
530
+
531
+
532
class RegexPIIDetector(PIIDetector):
    """
    Regex-based PII detector. Implements ``_detect_pii`` by running every
    configured pattern over the input string.
    """

    def __init__(
        self,
        patterns: Optional[Dict[str, Pattern[str]]] = None,
        action_type: Optional[Literal["BLOCK", "FLAG", "MASK"]] = "MASK",
    ) -> None:
        """
        Initialize the regex detector.

        Args:
            patterns: Mapping of PII label to compiled pattern. When omitted or
                empty, a private copy of DEFAULT_PII_PATTERNS is used.
            action_type: "BLOCK", "FLAG" or "MASK" (default). When explicitly
                None, the NETRA_ACTION_TYPE environment variable is consulted:
                unset means "MASK", an unrecognized value means "FLAG".
        """
        if action_type is None:
            env_action = os.getenv("NETRA_ACTION_TYPE", "MASK")
            if env_action in ("BLOCK", "FLAG", "MASK"):
                action_type = cast(Literal["BLOCK", "FLAG", "MASK"], env_action)
            else:
                action_type = "FLAG"
        super().__init__(action_type=action_type)
        # Copy the defaults so mutating one detector's patterns cannot change
        # the module-level table shared by every other detector.
        self.patterns: Dict[str, Pattern[str]] = patterns or dict(DEFAULT_PII_PATTERNS)

    def _detect_pii(self, text: str) -> Tuple[bool, Counter[str], str, Dict[str, str]]:
        """
        Detect PII in a single message.

        Args:
            text: The text to detect PII in

        Returns:
            Tuple of (has_pii, counts, masked_text, entities). ``entities`` is
            always empty for this detector; ``masked_text`` is the preprocessed
            text with each match replaced by ``[LABEL]``.
        """
        text = self._preprocess(text)
        if not text:
            return False, Counter(), "", {}

        # Collect match spans per label; labels without matches are omitted.
        spans: Dict[str, List[Tuple[int, int]]] = {}
        for label, pattern in self.patterns.items():
            found = [match.span() for match in pattern.finditer(text)]
            if found:
                spans[label] = found

        counts: Counter[str] = Counter({label: len(found) for label, found in spans.items()})
        entities: Dict[str, str] = {}

        if not spans:
            return False, counts, text, entities

        return True, counts, self._mask_spans(text, spans), entities
584
+
585
+
586
class PresidioPIIDetector(PIIDetector):
    """
    Presidio-based PII detector. Implements ``_detect_pii`` by calling
    Presidio's AnalyzerEngine and the project's Anonymizer on a string.

    Examples:
        # Using default hash function
        detector = PresidioPIIDetector()
        result = detector.detect("My email is john@example.com")

        # Using custom hash function
        import hashlib
        def custom_hash(text: str) -> str:
            return hashlib.sha256(text.encode()).hexdigest()[:8]

        detector = PresidioPIIDetector(
            hash_function=custom_hash,
            anonymizer_cache_size=500,
            action_type="MASK",
            score_threshold=0.8
        )
    """

    def __init__(
        self,
        entities: Optional[List[str]] = None,
        language: str = "en",
        score_threshold: float = 0.6,
        action_type: Optional[Literal["BLOCK", "FLAG", "MASK"]] = None,
        anonymizer_cache_size: int = 1000,
        hash_function: Optional[Callable[[str], str]] = None,
    ) -> None:
        """
        Initialize the Presidio detector.

        Args:
            entities: Entity names to detect. When omitted or empty, a copy of
                DEFAULT_ENTITIES is used.
            language: Language code passed to the analyzer.
            score_threshold: Score threshold passed to the analyzer.
            action_type: "BLOCK", "FLAG" or "MASK". When None, the
                NETRA_ACTION_TYPE environment variable is used if it holds a
                valid value, otherwise "FLAG".
            anonymizer_cache_size: Cache size passed to the Anonymizer.
            hash_function: Optional hash function passed to the Anonymizer.

        Raises:
            ImportError: If presidio-analyzer is not installed.
        """
        if action_type is None:
            action_type = "FLAG"
            env_action = os.getenv("NETRA_ACTION_TYPE", "FLAG")
            # Only a recognized environment value overrides the FLAG default.
            if env_action in ("BLOCK", "FLAG", "MASK"):
                action_type = cast(Literal["BLOCK", "FLAG", "MASK"], env_action)
        super().__init__(action_type=action_type)

        # Imported lazily so the module can be imported without Presidio installed.
        try:
            from presidio_analyzer import AnalyzerEngine  # noqa: F401
        except ImportError as exc:
            raise ImportError("Presidio-based PII detection requires: presidio-analyzer. " "Install via pip.") from exc

        self.language: str = language
        # Copy so later mutation of this attribute cannot alter the caller's
        # list or the shared module-level defaults.
        self.entities: Optional[List[str]] = list(entities) if entities else list(DEFAULT_ENTITIES)
        self.score_threshold: float = score_threshold

        self.analyzer = AnalyzerEngine()
        self.anonymizer = Anonymizer(hash_function=hash_function, cache_size=anonymizer_cache_size)

    def _detect_pii(self, text: str) -> Tuple[bool, Counter[str], str, Dict[str, str]]:
        """
        Detect PII in a single message.

        Args:
            text: The text to detect PII in

        Returns:
            Tuple of (has_pii, counts, masked_text, entities). If the anonymizer
            fails, ``masked_text`` falls back to ``[ENTITY_TYPE]`` masking and
            ``entities`` stays empty.
        """
        text = self._preprocess(text)
        if not text:
            return False, Counter(), "", {}

        analyzer_results = self.analyzer.analyze(
            text=text,
            language=self.language,
            entities=self.entities,
            score_threshold=self.score_threshold,
        )

        counts: Counter[str] = Counter(res.entity_type for res in analyzer_results)
        has_pii = bool(counts)
        masked = text
        entities: Dict[str, str] = {}

        if has_pii:
            try:
                anonymized_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_results)
                masked = anonymized_result.masked_text
                entities = anonymized_result.entities
            except Exception as exc:
                # Best-effort fallback: never let an anonymizer failure leave
                # the text unmasked. Only the exception type is logged, since
                # the message could echo the PII itself.
                import logging

                logging.getLogger(__name__).warning(
                    "Anonymizer failed (%s); falling back to label masking.",
                    type(exc).__name__,
                )
                spans: Dict[str, List[Tuple[int, int]]] = {}
                for res in analyzer_results:
                    spans.setdefault(res.entity_type, []).append((res.start, res.end))
                masked = self._mask_spans(text, spans)

        return has_pii, counts, masked, entities
675
+
676
+
677
def get_default_detector(
    action_type: Optional[Literal["BLOCK", "FLAG", "MASK"]] = None,
    entities: Optional[List[str]] = None,
    hash_function: Optional[Callable[[str], str]] = None,
) -> PIIDetector:
    """
    Build the default PII detector, which is Presidio-based.

    For regex-based detection, construct a ``RegexPIIDetector`` directly.

    Args:
        action_type: Action to take when PII is detected. Options are:
            - "BLOCK": Raise PIIBlockedException when PII is detected
            - "FLAG": Detect PII but don't block or mask
            - "MASK": Replace PII with mask tokens
            None is forwarded unchanged, leaving the choice to PresidioPIIDetector.
        entities: Optional list of entity types to detect; forwarded unchanged.
        hash_function: Optional custom hash function for anonymization; forwarded unchanged.

    Returns:
        A new PresidioPIIDetector instance.
    """
    detector = PresidioPIIDetector(
        entities=entities,
        action_type=action_type,
        hash_function=hash_function,
    )
    return detector
695
+
696
+
697
+ # ---------------------------------------------------------------------------- #
698
+ # EXAMPLE USAGE #
699
+ # ---------------------------------------------------------------------------- #
700
+ # from netra.pii import RegexPIIDetector, get_default_detector
701
+ # from netra.exceptions.pii import PIIBlockedException
702
+ #
703
+ # # Create a regex-based detector that blocks on any PII found:
704
+ # regex_detector = RegexPIIDetector(action_type="BLOCK")
705
+ # try:
706
+ # result = regex_detector.detect("My email is sooraj@example.com")
707
+ # # With action_type="BLOCK", detect() raises when PII is found, so code won't reach here
708
+ # print(f"PII detected: {result.has_pii}, Entities: {result.pii_entities}")
709
+ # except PIIBlockedException as e:
710
+ # # Access structured information about the PII detection
711
+ # print(f"Blocked: {e.is_blocked}, Entities found: {e.pii_entities}")
712
+ # print(f"Masked version: {e.masked_text}")
713
+ #
714
+ # # Or get the default (Presidio-based) detector:
715
+ # default_detector = get_default_detector(action_type="FLAG")
716
+ # pii_info = default_detector.detect("Call me at 123-456-7890")
717
+ # print(f"Found PII types: {list(pii_info.pii_entities.keys())}")
718
+ #
719
+ # # Using custom hash function with get_default_detector:
720
+ # import hashlib
721
+ # def custom_hash(text: str) -> str:
722
+ # return hashlib.sha256(text.encode()).hexdigest()[:8]
723
+ #
724
+ # detector_with_custom_hash = get_default_detector(
725
+ # action_type="MASK",
726
+ # hash_function=custom_hash
727
+ # )
728
+ # result = detector_with_custom_hash.detect("My email is john@example.com")
729
+ # print(f"Masked text: {result.masked_text}")
730
+ # print(f"Entity mappings: {result.hashed_entities}")
731
+ #
732
+ # # Using PresidioPIIDetector directly with custom hash function:
733
+ # from netra.pii import PresidioPIIDetector
734
+ #
735
+ # def md5_hash(text: str) -> str:
736
+ # return hashlib.md5(text.encode()).hexdigest()[:10]
737
+ #
738
+ # presidio_detector = PresidioPIIDetector(
739
+ # hash_function=md5_hash,
740
+ # anonymizer_cache_size=500,
741
+ # action_type="MASK",
742
+ # score_threshold=0.8
743
+ # )
744
+ # result = presidio_detector.detect("Contact me at john.doe@company.com or 555-123-4567")
745
+ # print(f"Masked: {result.masked_text}")
746
+ # print(f"Entities: {result.hashed_entities}")
747
+ #
748
+ # # You can also manually raise the exception if needed
749
+ # if pii_info.has_pii:
750
+ # raise PIIBlockedException(
751
+ # message="Custom blocking message",
752
+ # has_pii=pii_info.has_pii,
753
+ # pii_entities=pii_info.pii_entities,
754
+ # masked_text=pii_info.masked_text,
755
+ # is_blocked=True,
756
+ # original_text=pii_info.original_text,
757
+ # )