netra-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of netra-sdk might be problematic. Click here for more details.
- netra/__init__.py +148 -0
- netra/anonymizer/__init__.py +7 -0
- netra/anonymizer/anonymizer.py +79 -0
- netra/anonymizer/base.py +159 -0
- netra/anonymizer/fp_anonymizer.py +182 -0
- netra/config.py +111 -0
- netra/decorators.py +167 -0
- netra/exceptions/__init__.py +6 -0
- netra/exceptions/injection.py +33 -0
- netra/exceptions/pii.py +46 -0
- netra/input_scanner.py +142 -0
- netra/instrumentation/__init__.py +257 -0
- netra/instrumentation/aiohttp/__init__.py +378 -0
- netra/instrumentation/aiohttp/version.py +1 -0
- netra/instrumentation/cohere/__init__.py +446 -0
- netra/instrumentation/cohere/version.py +1 -0
- netra/instrumentation/google_genai/__init__.py +506 -0
- netra/instrumentation/google_genai/config.py +5 -0
- netra/instrumentation/google_genai/utils.py +31 -0
- netra/instrumentation/google_genai/version.py +1 -0
- netra/instrumentation/httpx/__init__.py +545 -0
- netra/instrumentation/httpx/version.py +1 -0
- netra/instrumentation/instruments.py +78 -0
- netra/instrumentation/mistralai/__init__.py +545 -0
- netra/instrumentation/mistralai/config.py +5 -0
- netra/instrumentation/mistralai/utils.py +30 -0
- netra/instrumentation/mistralai/version.py +1 -0
- netra/instrumentation/weaviate/__init__.py +121 -0
- netra/instrumentation/weaviate/version.py +1 -0
- netra/pii.py +757 -0
- netra/processors/__init__.py +4 -0
- netra/processors/session_span_processor.py +55 -0
- netra/processors/span_aggregation_processor.py +365 -0
- netra/scanner.py +104 -0
- netra/session.py +185 -0
- netra/session_manager.py +96 -0
- netra/tracer.py +99 -0
- netra/version.py +1 -0
- netra_sdk-0.1.0.dist-info/LICENCE +201 -0
- netra_sdk-0.1.0.dist-info/METADATA +573 -0
- netra_sdk-0.1.0.dist-info/RECORD +42 -0
- netra_sdk-0.1.0.dist-info/WHEEL +4 -0
netra/__init__.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import threading
|
|
3
|
+
from typing import Any, Dict, Optional, Set
|
|
4
|
+
|
|
5
|
+
from netra.instrumentation.instruments import NetraInstruments
|
|
6
|
+
|
|
7
|
+
from .config import Config
|
|
8
|
+
|
|
9
|
+
# Instrumentor functions
|
|
10
|
+
from .instrumentation import init_instrumentations
|
|
11
|
+
from .session import Session
|
|
12
|
+
from .session_manager import SessionManager
|
|
13
|
+
from .tracer import Tracer
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Netra:
    """
    Entry point of the Netra SDK.

    Call ``Netra.init(...)`` once at application start-up. It builds a ``Config``,
    constructs the ``Tracer`` from it and enables the built-in instrumentations.
    The remaining class methods forward session, user, tenant and custom data to
    ``SessionManager`` or create a ``Session``.
    """

    # Becomes True after the first successful ``init`` call.
    _initialized = False
    # Re-entrant lock: the thread that already holds it may acquire it again safely.
    _init_lock = threading.RLock()

    @classmethod
    def is_initialized(cls) -> bool:
        """Report whether ``init`` has completed, reading the flag under the init lock.

        Returns:
            bool: True once ``init`` has finished successfully, False otherwise.
        """
        with cls._init_lock:
            state = cls._initialized
        return state

    @classmethod
    def init(
        cls,
        app_name: Optional[str] = None,
        headers: Optional[str] = None,
        disable_batch: Optional[bool] = None,
        trace_content: Optional[bool] = None,
        resource_attributes: Optional[Dict[str, Any]] = None,
        environment: Optional[str] = None,
        instruments: Optional[Set[NetraInstruments]] = None,
        block_instruments: Optional[Set[NetraInstruments]] = None,
    ) -> None:
        """
        Initialize the SDK once; later calls only log a warning.

        Args:
            app_name: Forwarded to ``Config``.
            headers: Forwarded to ``Config``.
            disable_batch: Forwarded to ``Config``.
            trace_content: Forwarded to ``Config``.
            resource_attributes: Forwarded to ``Config``.
            environment: Forwarded to ``Config``.
            instruments: Forwarded to ``init_instrumentations``.
            block_instruments: Forwarded to ``init_instrumentations``.
        """
        # The lock is held for the whole method so two threads cannot both
        # pass the "already initialized" check.
        with cls._init_lock:
            if cls._initialized:
                logger.warning("Netra.init() called more than once; ignoring subsequent calls.")
            else:
                config_options = {
                    "app_name": app_name,
                    "headers": headers,
                    "disable_batch": disable_batch,
                    "trace_content": trace_content,
                    "resource_attributes": resource_attributes,
                    "environment": environment,
                }

                # Constructing the tracer is what applies the configuration.
                Tracer(Config(**config_options))

                # Enable the instrumentations, honouring the include/block sets.
                init_instrumentations(
                    should_enrich_metrics=True,
                    base64_image_uploader=None,
                    instruments=instruments,
                    block_instruments=block_instruments,
                )

                # Only mark as initialized after every step above succeeded.
                cls._initialized = True
                logger.info("Netra successfully initialized.")

    @classmethod
    def set_session_id(cls, session_id: str) -> None:
        """
        Store the session identifier in the session context.

        Args:
            session_id: Session identifier
        """
        SessionManager.set_session_context("session_id", session_id)

    @classmethod
    def set_user_id(cls, user_id: str) -> None:
        """
        Store the user identifier in the session context.

        Args:
            user_id: User identifier
        """
        SessionManager.set_session_context("user_id", user_id)

    @classmethod
    def set_tenant_id(cls, tenant_id: str) -> None:
        """
        Store the tenant identifier in the session context.

        Args:
            tenant_id: Tenant identifier
        """
        SessionManager.set_session_context("tenant_id", tenant_id)

    @classmethod
    def set_custom_attributes(cls, key: str, value: Any) -> None:
        """
        Store a single custom attribute in the session context.

        Args:
            key: Custom attribute key
            value: Custom attribute value
        """
        attribute = {key: value}
        SessionManager.set_session_context("custom_attributes", attribute)

    @classmethod
    def set_custom_event(cls, event_name: str, attributes: Any) -> None:
        """
        Forward a custom event to ``SessionManager``.

        Args:
            event_name: Name of the custom event
            attributes: Attributes of the custom event
        """
        SessionManager.set_custom_event(event_name, attributes)

    @classmethod
    def start_session(
        cls,
        name: str,
        attributes: Optional[Dict[str, str]] = None,
        module_name: str = "combat_sdk",
    ) -> Session:
        """
        Create and return a new ``Session``.

        Args:
            name: Passed to ``Session`` as its first argument.
            attributes: Passed to ``Session`` as its second argument.
            module_name: Passed to ``Session`` as its third argument.

        Returns:
            The newly constructed ``Session``.
        """
        session = Session(name, attributes, module_name)
        return session
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
__all__ = ["Netra"]
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Custom anonymizer for PII data that provides consistent hashing of entities.
|
|
3
|
+
|
|
4
|
+
This module provides a custom anonymizer that can be used to replace PII entities
|
|
5
|
+
with consistent hash values, allowing for tracking the same entities across multiple
|
|
6
|
+
texts while maintaining privacy.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Callable, List, Optional
|
|
10
|
+
|
|
11
|
+
from presidio_analyzer.recognizer_result import RecognizerResult
|
|
12
|
+
|
|
13
|
+
from .base import AnonymizationResult, BaseAnonymizer
|
|
14
|
+
from .fp_anonymizer import FormatPreservingEmailAnonymizer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Anonymizer:
    """
    Front-end anonymizer that picks a strategy per entity type.

    E-mail entities go through the format-preserving e-mail anonymizer; every
    other entity type is replaced by a ``<hash>`` placeholder produced by the
    base anonymizer.
    """

    def __init__(self, hash_function: Optional[Callable[[str], str]] = None, cache_size: int = 1000):
        """
        Create the two delegate anonymizers.

        Args:
            hash_function: Optional custom hash function (string in, hash string out),
                handed to the base anonymizer.
            cache_size: Cache size handed to the base anonymizer. Default is 1000.
        """
        self.base_anonymizer = BaseAnonymizer(hash_function=hash_function, cache_size=cache_size)
        self.email_anonymizer = FormatPreservingEmailAnonymizer()

    def anonymize(self, text: str, analyzer_results: List[RecognizerResult]) -> AnonymizationResult:
        """
        Replace every detected entity in ``text`` with its anonymized form.

        Args:
            text: The original text containing PII.
            analyzer_results: Detection results; each one supplies ``entity_type``,
                ``start`` and ``end``.

        Returns:
            AnonymizationResult with the masked text and a mapping from each
            replacement key (anonymized e-mail or entity hash) to the original value.
        """
        redacted = text
        replacements = {}

        # Walk from the last entity to the first so that earlier offsets stay
        # valid while the text is being rewritten.
        for hit in sorted(analyzer_results, key=lambda item: item.start, reverse=True):
            kind = hit.entity_type
            original_value = text[hit.start : hit.end]

            if kind.upper() in ["EMAIL", "EMAIL_ADDRESS"]:
                # The anonymized address is used verbatim, both as the
                # replacement text and as the mapping key.
                map_key = self.email_anonymizer._anonymize_email(original_value)
                token = map_key
            else:
                # Everything else becomes an angle-bracketed hash placeholder.
                map_key = self.base_anonymizer._get_entity_hash(kind, original_value)
                token = f"<{map_key}>"

            replacements[map_key] = original_value
            redacted = "".join((redacted[: hit.start], token, redacted[hit.end :]))

        return AnonymizationResult(masked_text=redacted, entities=replacements)
|
netra/anonymizer/base.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base anonymizer class for PII data anonymization.
|
|
3
|
+
|
|
4
|
+
This module provides the base anonymizer class that contains the core anonymization
|
|
5
|
+
logic that can be extended by specific anonymizer implementations.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
from collections import OrderedDict
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Callable, Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
from presidio_analyzer.recognizer_result import RecognizerResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class AnonymizationResult:
    """
    Outcome of one anonymization pass.

    Attributes:
        masked_text: The input text after the detected entities were replaced.
        entities: Maps each replacement key (entity hash) to the original value
            it stands for.
    """

    masked_text: str
    entities: Dict[str, str]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class BaseAnonymizer:
    """
    Anonymizer that swaps entities for stable ``<TYPE_hash>`` placeholders.

    The same (entity type, value) pair always yields the same placeholder, so an
    entity can be followed across texts without exposing its value.
    """

    def __init__(self, hash_function: Optional[Callable[[str], str]] = None, cache_size: int = 1000):
        """
        Set up the hash function and the optional hash cache.

        Args:
            hash_function: Optional custom hash function (string in, hash string out).
                Falls back to the truncated SHA-256 default when omitted.
            cache_size: Maximum number of cached entity hashes (least recently used
                entries are evicted first). Default is 1000; 0 disables caching.
        """
        self.hash_function = hash_function or self._default_hash_function
        self.cache_size = cache_size

        # No cache object at all when caching is switched off.
        self._entity_hash_cache: Optional[OrderedDict[str, str]] = OrderedDict() if cache_size > 0 else None

    def _default_hash_function(self, value: str) -> str:
        """
        Hash ``value`` with SHA-256 and keep the first 8 hex characters.

        Args:
            value: The string to hash.

        Returns:
            An 8-character hexadecimal string.
        """
        digest = hashlib.sha256(value.encode())
        return digest.hexdigest()[:8]

    def _get_entity_hash(self, entity_type: str, entity_value: str) -> str:
        """
        Return the ``TYPE_hash`` identifier for an entity, using the LRU cache when enabled.

        Args:
            entity_type: The type of entity (e.g., 'EMAIL', 'PHONE', etc.)
            entity_value: The original value of the entity.

        Returns:
            The entity type and the hash of the value joined by an underscore.
        """
        if self.cache_size == 0:
            # Caching disabled: always compute.
            return f"{entity_type}_{self.hash_function(entity_value)}"

        cache = self._entity_hash_cache
        lookup_key = f"{entity_type}:{entity_value}"

        if cache is not None and lookup_key in cache:
            # A hit refreshes the entry's position so it is evicted last.
            cache.move_to_end(lookup_key)
            return cache[lookup_key]

        fresh_hash = f"{entity_type}_{self.hash_function(entity_value)}"

        if cache is not None:
            cache[lookup_key] = fresh_hash
            if len(cache) > self.cache_size:
                # Drop the least recently used entry, which sits at the front.
                cache.popitem(last=False)

        return fresh_hash

    def anonymize_entity(self, entity_type: str, entity_value: str) -> str:
        """
        Build the placeholder for a single entity.

        Args:
            entity_type: The type of entity (e.g., 'EMAIL', 'PHONE', etc.)
            entity_value: The original value of the entity.

        Returns:
            The entity hash wrapped in angle brackets.
        """
        return "<{}>".format(self._get_entity_hash(entity_type, entity_value))

    def anonymize(self, text: str, analyzer_results: List[RecognizerResult]) -> AnonymizationResult:
        """
        Replace every detected entity in ``text`` with its hash placeholder.

        Args:
            text: The original text containing PII.
            analyzer_results: Detection results; each one supplies ``entity_type``,
                ``start`` and ``end``.

        Returns:
            AnonymizationResult with the masked text and a mapping from entity hash
            to original value.
        """
        masked = text
        hash_to_value: Dict[str, str] = {}

        # Process entities back to front so replacements never shift the
        # offsets of entities still to be handled.
        for hit in sorted(analyzer_results, key=lambda item: item.start, reverse=True):
            original_value = text[hit.start : hit.end]
            digest = self._get_entity_hash(hit.entity_type, original_value)

            masked = f"{masked[: hit.start]}<{digest}>{masked[hit.end :]}"
            hash_to_value[digest] = original_value

        return AnonymizationResult(masked_text=masked, entities=hash_to_value)
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import random
|
|
3
|
+
import re
|
|
4
|
+
from typing import Dict, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FormatPreservingEmailAnonymizer:
    """
    Anonymize e-mail addresses while keeping them looking like e-mail addresses.

    In structure-preserving mode every letter and digit is replaced by a
    deterministic pseudo-random one (case, length and punctuation are kept).
    Otherwise each part is replaced by characters taken from its MD5 digest.
    Results are cached per instance, so repeated inputs map to the same output.
    """

    # E-mail addresses embedded in free text. The top-level domain is letters
    # only; a "|" inside the character class would be matched literally.
    _EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

    def __init__(self, preserve_length: bool = True, preserve_structure: bool = True):
        """
        Initialize the email anonymizer.

        Args:
            preserve_length: Whether to preserve the length of original parts
                (only consulted when ``preserve_structure`` is False).
            preserve_structure: Whether to preserve dots, hyphens in the structure
        """
        self.preserve_length = preserve_length
        self.preserve_structure = preserve_structure
        # Original e-mail -> anonymized e-mail.
        self.email_cache: Dict[str, str] = {}
        # (seed, text) -> anonymized part. The seed is part of the key because
        # the same text is anonymized differently as a local part and as a domain.
        self.part_cache: Dict[tuple, str] = {}

        # Character sets for replacement
        self.alphanumeric = "abcdefghijklmnopqrstuvwxyz0123456789"
        self.letters = "abcdefghijklmnopqrstuvwxyz"

    def _get_deterministic_random(self, seed: str) -> random.Random:
        """Create a random generator whose state depends only on ``seed``."""
        # The first 32 bits of the MD5 digest are used as the integer seed.
        hash_int = int(hashlib.md5(seed.encode()).hexdigest()[:8], 16)
        return random.Random(hash_int)

    def _preserve_structure_replace(self, text: str, seed: str) -> str:
        """
        Replace letters and digits in ``text`` while keeping its structure.

        Letters become pseudo-random letters of the same case, digits become
        pseudo-random digits, and every other character is kept as is.

        Args:
            text: The part to anonymize.
            seed: Seed string that determines the replacement characters.

        Returns:
            The anonymized part, same length as ``text``.
        """
        cache_key = (seed, text)
        cached = self.part_cache.get(cache_key)
        if cached is not None:
            return cached

        rng = self._get_deterministic_random(seed)
        pieces = []

        for char in text:
            if char.isalpha():
                # Preserve case pattern
                new_char = rng.choice(self.letters)
                pieces.append(new_char.upper() if char.isupper() else new_char)
            elif char.isdigit():
                pieces.append(str(rng.randint(0, 9)))
            else:
                # Keep special characters (dots, hyphens, etc.)
                pieces.append(char)

        anonymized = "".join(pieces)
        self.part_cache[cache_key] = anonymized
        return anonymized

    def _simple_hash_replace(self, text: str, target_length: Optional[int] = None) -> str:
        """
        Replace ``text`` with the leading characters of its MD5 hex digest.

        Args:
            text: The part to anonymize.
            target_length: Length of the result; defaults to ``len(text)``.

        Returns:
            The first ``target_length`` digest characters, padded with "x" when
            more than the 32 available characters are requested.
        """
        if target_length is None:
            target_length = len(text)

        # A negative length yields an empty string rather than a negative slice.
        length = max(target_length, 0)
        hash_val = hashlib.md5(text.encode()).hexdigest()
        return hash_val[:length].ljust(length, "x")

    def _anonymize_email(self, email: str) -> str:
        """
        Anonymize a single email while preserving format and structure.

        Args:
            email: The address to anonymize. A value without "@" is anonymized
                as a single part instead of raising.

        Returns:
            The anonymized address (cached per instance).
        """
        cached = self.email_cache.get(email)
        if cached is not None:
            return cached

        # partition never raises; ``separator`` is empty when there is no "@".
        local_part, separator, domain = email.partition("@")

        if self.preserve_structure:
            # Preserve the structure (dots, hyphens, length, case pattern)
            local_anonymized = self._preserve_structure_replace(local_part, f"local_{local_part}")
            domain_anonymized = (
                self._preserve_structure_replace(domain, f"domain_{domain}") if separator else ""
            )
        else:
            # Simple length-preserving hash
            local_length = len(local_part) if self.preserve_length else 8
            domain_length = len(domain) if self.preserve_length else 8

            local_anonymized = self._simple_hash_replace(local_part, local_length)
            domain_anonymized = self._simple_hash_replace(domain, domain_length) if separator else ""

        anonymized_email = f"{local_anonymized}{separator}{domain_anonymized}"
        self.email_cache[email] = anonymized_email

        return anonymized_email

    def anonymize_text(self, text: str) -> str:
        """
        Anonymize all emails in the given text while preserving format.

        Args:
            text: Free text that may contain e-mail addresses.

        Returns:
            ``text`` with every matched address replaced by its anonymized form.
        """
        return self._EMAIL_PATTERN.sub(lambda match: self._anonymize_email(match.group(0)), text)

    def get_mapping(self) -> Dict[str, str]:
        """Return a copy of the mapping of original emails to anonymized versions."""
        return self.email_cache.copy()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Example usage and comparison
|
|
123
|
+
if __name__ == "__main__":
    # Demo: prints how sample addresses and sentences are anonymized in both
    # modes, then checks that repeated calls give the same result.
    print("Format-Preserving Email Anonymization - Structure Preserving:\n")

    # One anonymizer per mode.
    structure_mode = FormatPreservingEmailAnonymizer(preserve_structure=True)
    length_mode = FormatPreservingEmailAnonymizer(preserve_length=True, preserve_structure=False)

    sample_emails = [
        "john@gmail.com",
        "john@gmail.com",
        "john@outlook.com",
        "joe@outlook.com",
        "user.name@company.co.uk",
        "test-email@sub.example.org",
        "Admin123@BigCorp.net",
        "a@b.co",
    ]

    narrow_rule = "=" * 50
    wide_rule = "=" * 70

    def show_emails(engine):
        # One "original -> anonymized" line per sample address.
        for address in sample_emails:
            print(f"{address:25} -> {engine._anonymize_email(address)}")

    print("Structure-Preserving Mode:")
    print(narrow_rule)
    show_emails(structure_mode)

    print("\n" + narrow_rule)
    print("Length-Preserving Mode:")
    print(narrow_rule)
    show_emails(length_mode)

    # Whole sentences, anonymized with the structure-preserving instance.
    print("\n" + wide_rule)
    print("Full Text Anonymization Examples:")
    print(wide_rule)

    sample_texts = [
        "Hi, my name is John and my email is john@gmail.com",
        "Contact: support@company.com or admin@BigCorp.net",
        "Emails: user.name@test.co.uk, simple@domain.org",
    ]

    for sentence in sample_texts:
        print(f"Original: {sentence}")
        print(f"Anonymized: {structure_mode.anonymize_text(sentence)}")
        print()

    # The same address must map to the same result on every call.
    print("Consistency Test:")
    print("-" * 30)
    probe = "john@gmail.com"
    first = structure_mode._anonymize_email(probe)
    second = structure_mode._anonymize_email(probe)
    print(f"First call: {probe} -> {first}")
    print(f"Second call: {probe} -> {second}")
    print(f"Consistent: {'✓' if first == second else '✗'}")
|
netra/config.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
from opentelemetry.util.re import parse_env_headers
|
|
6
|
+
|
|
7
|
+
from netra.version import __version__
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Config:
    """
    Tracer configuration resolved from constructor arguments and environment variables.

    Attributes:
        app_name: ``app_name`` argument, else ``OTEL_SERVICE_NAME``, else
            ``NETRA_APP_NAME``, else ``"llm_tracing_service"``.
        otlp_endpoint: ``NETRA_OTLP_ENDPOINT``, else ``OTEL_EXPORTER_OTLP_ENDPOINT``.
        api_key: ``NETRA_API_KEY``.
        headers: Headers parsed from the ``headers`` argument or ``NETRA_HEADERS``,
            plus an API-key header when a key and an endpoint are both set.
        disable_batch: ``disable_batch`` argument, else ``NETRA_DISABLE_BATCH``
            ("1"/"true"), else False.
        trace_content: ``trace_content`` argument, else False only when
            ``NETRA_TRACE_CONTENT`` is "0"/"false", else True.
        environment: ``environment`` argument, else ``NETRA_ENV``, else ``"local"``.
        resource_attributes: ``resource_attributes`` argument, else JSON parsed from
            ``NETRA_RESOURCE_ATTRS``, else an empty dict.
    """

    # SDK Constants
    SDK_NAME = "netra"
    LIBRARY_NAME = "netra"
    LIBRARY_VERSION = __version__

    def __init__(
        self,
        app_name: Optional[str] = None,
        headers: Optional[str] = None,
        disable_batch: Optional[bool] = None,
        trace_content: Optional[bool] = None,
        resource_attributes: Optional[Dict[str, Any]] = None,
        environment: Optional[str] = None,
    ):
        """
        Resolve every setting; explicit arguments win over environment variables.

        Args:
            app_name: Logical name for this service.
            headers: Extra headers as a string understood by ``parse_env_headers``.
            disable_batch: Whether to disable the batch span processor.
            trace_content: Whether to capture prompt/completion content.
            resource_attributes: Custom resource attributes dict
                (e.g., {'env': 'prod', 'version': '1.0.0'}).
            environment: Deployment environment name.
        """
        # Application name: from param, else env, else a fixed default
        self.app_name = (
            app_name or os.getenv("OTEL_SERVICE_NAME") or os.getenv("NETRA_APP_NAME") or "llm_tracing_service"
        )

        # OTLP endpoint and API key are taken from the environment only
        self.otlp_endpoint = os.getenv("NETRA_OTLP_ENDPOINT") or os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
        self.api_key = os.getenv("NETRA_API_KEY")
        self.headers: Dict[str, str] = {}

        # Custom headers: from param, else env
        raw_headers = headers or os.getenv("NETRA_HEADERS")
        if isinstance(raw_headers, str):
            self.headers = parse_env_headers(raw_headers)

        if self.otlp_endpoint == "https://api.dev.getcombat.ai" and not self.api_key:
            # Report the problem but keep initializing: returning here would
            # leave the attributes assigned below missing from the instance.
            print("Error: Missing Netra API key, go to https://app.dev.getcombat.ai/api-key to create one")
            print("Set the NETRA_API_KEY environment variable to the key")

        # Handle API key authentication based on OTLP endpoint
        if self.api_key and self.otlp_endpoint:
            if "getcombat" in self.otlp_endpoint.lower():
                # For Netra endpoints, use x-api-key header
                auth_name, auth_value = "x-api-key", self.api_key
            else:
                # For other endpoints, send the key as a Bearer token
                auth_name, auth_value = "Authorization", f"Bearer {self.api_key}"

            # Header names are case-insensitive, so a supplied header counts
            # regardless of how it is capitalized; it is never overwritten.
            supplied_names = {name.lower() for name in self.headers}
            if auth_name.lower() not in supplied_names:
                self.headers = {**self.headers, auth_name: auth_value}

        # Disable batch span processor? Default false unless env says "1"/"true"
        if disable_batch is not None:
            self.disable_batch = disable_batch
        else:
            env_db = os.getenv("NETRA_DISABLE_BATCH")
            self.disable_batch = env_db is not None and env_db.lower() in ("1", "true")

        # Trace content (prompts/completions)? Default true unless env says "0"/"false"
        if trace_content is not None:
            self.trace_content = trace_content
        else:
            env_tc = os.getenv("NETRA_TRACE_CONTENT")
            self.trace_content = not (env_tc is not None and env_tc.lower() in ("0", "false"))

        # Environment: param override, else env, else "local"
        if environment is not None:
            self.environment = environment
        else:
            self.environment = os.getenv("NETRA_ENV", "local")

        # Resource attributes: param override, else parse JSON from env, else empty dict
        if resource_attributes is not None:
            self.resource_attributes = resource_attributes
        else:
            # Expecting something like: {"env":"prod","version":"1.0.0"}
            env_ra = os.getenv("NETRA_RESOURCE_ATTRS")
            self.resource_attributes = {}
            if env_ra:
                try:
                    self.resource_attributes = json.loads(env_ra)
                except (json.JSONDecodeError, ValueError) as e:
                    import logging

                    logger = logging.getLogger(__name__)
                    logger.warning(f"Failed to parse NETRA_RESOURCE_ATTRS: {e}")
|