raindrop-ai 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
raindrop/analytics.py CHANGED
@@ -13,6 +13,7 @@ from threading import Timer
13
13
  from raindrop.version import VERSION
14
14
  from raindrop.models import TrackAIEvent, Attachment, SignalEvent, DefaultSignal, FeedbackSignal, EditSignal, PartialTrackAIEvent, PartialAIData
15
15
  from raindrop.interaction import Interaction
16
+ from raindrop.redact import perform_pii_redaction
16
17
 
17
18
 
18
19
 
@@ -28,6 +29,7 @@ upload_interval = 1.0
28
29
  buffer = []
29
30
  flush_lock = threading.Lock()
30
31
  debug_logs = False
32
+ redact_pii = False
31
33
  flush_thread = None
32
34
  shutdown_event = threading.Event()
33
35
  max_ingest_size_bytes = 1 * 1024 * 1024 # 1 MB
@@ -44,6 +46,14 @@ def set_debug_logs(value: bool):
44
46
  else:
45
47
  logger.setLevel(logging.INFO)
46
48
 
49
def set_redact_pii(value: bool):
    """Switch PII redaction of outgoing events on or off.

    Stores ``value`` in the module-level ``redact_pii`` flag and logs the
    resulting state at INFO level.
    """
    global redact_pii
    redact_pii = value
    # Guard-clause form: report the "off" state first, fall through to "on".
    if not redact_pii:
        logger.info("PII redaction disabled")
        return
    logger.info("PII redaction enabled")
56
+
47
57
  def start_flush_thread():
48
58
  logger.debug("Opening flush thread")
49
59
  global flush_thread
@@ -177,6 +187,10 @@ def track_ai(
177
187
  payload.properties["$context"] = _get_context()
178
188
 
179
189
  data = payload.model_dump(mode="json")
190
+
191
+ # Apply PII redaction if enabled
192
+ if redact_pii:
193
+ data = perform_pii_redaction(data)
180
194
 
181
195
  size = _get_size(data)
182
196
  if size > max_ingest_size_bytes:
@@ -399,6 +413,11 @@ def _flush_partial_event(event_id: str) -> None:
399
413
 
400
414
  # convert to ordinary TrackAIEvent-ish dict before send
401
415
  data = evt.model_dump(mode="json", exclude_none=True)
416
+
417
+ # Apply PII redaction if enabled
418
+ if redact_pii:
419
+ data = perform_pii_redaction(data)
420
+
402
421
  size = _get_size(data)
403
422
  if size > max_ingest_size_bytes:
404
423
  logger.warning(f"[raindrop] partial event {event_id} > 1 MB; skipping")
raindrop/redact.py ADDED
@@ -0,0 +1,200 @@
1
+ import re
2
+ import json
3
+ import os
4
+ from typing import Dict, Any
5
+
6
+
7
class PIIRedactor:
    """PII redactor that uses regex patterns to identify and replace PII.

    Each category of PII is replaced by a fixed placeholder such as
    ``<REDACTED_EMAIL>`` or ``<REDACTED_NAME>``. Detection is heuristic:
    the patterns are intentionally simple and can produce both false
    positives and false negatives.
    """

    def __init__(self, well_known_names=None):
        """Create a redactor.

        Args:
            well_known_names: Optional list of names to redact wherever they
                appear (case-insensitively, on word boundaries). When omitted,
                the list is loaded from ``well-known-names.json`` located next
                to this module.
        """
        if well_known_names is None:
            well_known_names_path = os.path.join(
                os.path.dirname(__file__), 'well-known-names.json'
            )
            # Explicit encoding so the result does not depend on the locale.
            with open(well_known_names_path, 'r', encoding='utf-8') as f:
                well_known_names = json.load(f)
        self.well_known_names = well_known_names

        # Build regex patterns
        self._build_patterns()

    def _build_patterns(self):
        """Compile the regex patterns used for PII detection."""
        # Email pattern. The TLD class is plain letters; a '|' inside a
        # character class is a literal pipe, not alternation.
        self.email_pattern = re.compile(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            re.IGNORECASE
        )

        # Credit card pattern (basic - matches 13-19 digits with optional spaces/dashes)
        self.credit_card_pattern = re.compile(
            r'\b(?:\d[ -]*?){13,19}\b'
        )

        # Phone number pattern (US-style, but flexible)
        self.phone_pattern = re.compile(
            r'(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
        )

        # SSN pattern
        self.ssn_pattern = re.compile(
            r'\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b'
        )

        # Password/secret pattern (matches patterns like "password: xxx" or "secret: xxx")
        self.password_pattern = re.compile(
            r'\b(pass(word|phrase)?|secret|pwd|passwd)\s*[:=]\s*\S+',
            re.IGNORECASE
        )

        # Street address pattern (simplified)
        self.address_pattern = re.compile(
            r'\b\d+\s+[A-Za-z\s]+\s+(street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|plaza|pl|terrace|ter|place|way|parkway|pkwy)\b',
            re.IGNORECASE
        )

        # Greeting/closing patterns for name detection
        self.greeting_pattern = re.compile(
            r'(^|\.\s+)(dear|hi|hello|greetings|hey|hey there)[\s-]*',
            re.IGNORECASE
        )

        self.closing_pattern = re.compile(
            r'(thx|thanks|thank you|regards|best|[a-z]+ly|[a-z]+ regards|all the best|happy [a-z]+ing|take care|have a [a-z]+ (weekend|night|day))\s*[,.!]*',
            re.IGNORECASE
        )

        # Generic name pattern (capitalized words)
        self.generic_name_pattern = re.compile(
            r'( ?(([A-Z][a-z]+)|([A-Z]\.))+)([,.]|[,.]?$)',
            re.MULTILINE
        )

        # Well-known names pattern. Empty entries are dropped and an empty
        # list compiles to a never-matching pattern: an empty alternative
        # would match at every word boundary and insert the placeholder
        # throughout the text.
        names = [name for name in self.well_known_names if name]
        if names:
            names_pattern_str = r'\b(' + '|'.join(re.escape(name) for name in names) + r')\b'
        else:
            names_pattern_str = r'(?!)'
        self.well_known_names_pattern = re.compile(names_pattern_str, re.IGNORECASE)

        # Credentials pattern (API keys, tokens, etc.)
        self.credentials_pattern = re.compile(
            r'\b(api[_-]?key|token|bearer|authorization|auth[_-]?token|access[_-]?token|secret[_-]?key)\s*[:=]\s*["\']?[\w-]+["\']?',
            re.IGNORECASE
        )

    def redact_names(self, text: str) -> str:
        """Redact names using greeting/closing context and well-known names.

        Non-string input is returned unchanged. Four heuristics run in order:
        well-known names anywhere, capitalized words directly after a
        greeting, capitalized words directly before a closing phrase on the
        same line, and short lines consisting only of capitalized words.
        """
        if not isinstance(text, str):
            return text

        # First, redact well-known names
        text = self.well_known_names_pattern.sub('<REDACTED_NAME>', text)

        # Find names after greetings. Matches are processed right-to-left so
        # that a replacement never shifts the offsets of matches still to do.
        greeting_matches = list(self.greeting_pattern.finditer(text))
        for match in reversed(greeting_matches):
            start_pos = match.end()
            # Capitalized word(s) immediately following the greeting
            name_match = re.match(r'\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text[start_pos:])
            if name_match:
                name_start = start_pos + name_match.start(1)
                name_end = start_pos + name_match.end(1)
                text = text[:name_start] + '<REDACTED_NAME>' + text[name_end:]

        lines = text.split('\n')

        # Find names before closings (first closing phrase on each line)
        for i, line in enumerate(lines):
            closing_match = self.closing_pattern.search(line)
            if not closing_match:
                continue
            before_closing = line[:closing_match.start()]
            # Capitalized word(s) at the very end of the text before the closing
            name_before_closing = re.search(
                r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*$', before_closing
            )
            if name_before_closing:
                lines[i] = (before_closing[:name_before_closing.start(1)] +
                            '<REDACTED_NAME>' +
                            before_closing[name_before_closing.end(1):] +
                            line[closing_match.start():])

        # Standalone names (like signatures): short lines made up solely of
        # capitalized words, optionally ending in ',' or '.'. The replacements
        # above never introduce a newline, so the same line list is reused.
        for i, line in enumerate(lines):
            stripped = line.strip()
            if (len(stripped) < 50 and
                    re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*[,.]?$', stripped) and
                    '<REDACTED_NAME>' not in line):
                lines[i] = line.replace(stripped, '<REDACTED_NAME>')

        return '\n'.join(lines)

    def redact(self, text: str) -> str:
        """Redact PII from the given text using regex patterns.

        Non-string input is returned unchanged. Categories are applied in a
        fixed order; earlier replacements are visible to later patterns.
        """
        if not isinstance(text, str):
            return text

        # Apply redactions in order
        # Credentials
        text = self.credentials_pattern.sub('<REDACTED_CREDENTIALS>', text)

        # Credit card numbers
        text = self.credit_card_pattern.sub('<REDACTED_CREDIT_CARD>', text)

        # Email addresses
        text = self.email_pattern.sub('<REDACTED_EMAIL>', text)

        # Phone numbers
        text = self.phone_pattern.sub('<REDACTED_PHONE>', text)

        # SSN
        text = self.ssn_pattern.sub('<REDACTED_SSN>', text)

        # Passwords/secrets
        text = self.password_pattern.sub('<REDACTED_SECRET>', text)

        # Street addresses
        text = self.address_pattern.sub('<REDACTED_ADDRESS>', text)

        # Names
        text = self.redact_names(text)

        # Note: IPs, URLs, usernames, and zipcodes are disabled by default
        # to match JS SDK behavior

        return text
171
+
172
+
173
def _get_redactor():
    """Return the shared PIIRedactor, building it on first use."""
    # Building a redactor reads the names file and compiles every pattern,
    # so one instance is cached on the function and reused for all events.
    redactor = getattr(_get_redactor, "_instance", None)
    if redactor is None:
        redactor = PIIRedactor()
        _get_redactor._instance = redactor
    return redactor


def perform_pii_redaction(event_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Redact PII from event data, specifically targeting ai_data input and output fields.

    Only ``event_data['ai_data']['input']`` and
    ``event_data['ai_data']['output']`` are examined, and only when they are
    non-empty strings. Every other field is passed through untouched.

    Args:
        event_data: The event data dictionary to redact PII from

    Returns:
        A shallow copy of the event data with PII redacted; when ``ai_data``
        is present and non-empty it is copied too, so the caller's
        dictionaries are not modified.
    """
    # Create a copy to avoid modifying the original
    event_copy = event_data.copy()

    ai_data = event_copy.get('ai_data')
    if not ai_data:
        return event_copy

    ai_data = ai_data.copy()
    for field in ('input', 'output'):
        value = ai_data.get(field)
        # The redactor leaves non-strings unchanged, so only non-empty
        # strings need it; this also defers building it until required.
        if value and isinstance(value, str):
            ai_data[field] = _get_redactor().redact(value)

    event_copy['ai_data'] = ai_data
    return event_copy
raindrop/version.py CHANGED
@@ -1 +1 @@
1
- VERSION = "0.0.19"
1
+ VERSION = "0.0.26"