raindrop-ai 0.0.24__tar.gz → 0.0.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/PKG-INFO +1 -1
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/pyproject.toml +1 -1
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/raindrop/analytics.py +19 -0
- raindrop_ai-0.0.26/raindrop/redact.py +200 -0
- raindrop_ai-0.0.26/raindrop/version.py +1 -0
- raindrop_ai-0.0.26/raindrop/well-known-names.json +11548 -0
- raindrop_ai-0.0.24/raindrop/version.py +0 -1
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/README.md +0 -0
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/raindrop/__init__.py +0 -0
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/raindrop/interaction.py +0 -0
- {raindrop_ai-0.0.24 → raindrop_ai-0.0.26}/raindrop/models.py +0 -0
|
@@ -13,6 +13,7 @@ from threading import Timer
|
|
|
13
13
|
from raindrop.version import VERSION
|
|
14
14
|
from raindrop.models import TrackAIEvent, Attachment, SignalEvent, DefaultSignal, FeedbackSignal, EditSignal, PartialTrackAIEvent, PartialAIData
|
|
15
15
|
from raindrop.interaction import Interaction
|
|
16
|
+
from raindrop.redact import perform_pii_redaction
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
|
|
@@ -28,6 +29,7 @@ upload_interval = 1.0
|
|
|
28
29
|
buffer = []
|
|
29
30
|
flush_lock = threading.Lock()
|
|
30
31
|
debug_logs = False
|
|
32
|
+
redact_pii = False
|
|
31
33
|
flush_thread = None
|
|
32
34
|
shutdown_event = threading.Event()
|
|
33
35
|
max_ingest_size_bytes = 1 * 1024 * 1024 # 1 MB
|
|
@@ -44,6 +46,14 @@ def set_debug_logs(value: bool):
|
|
|
44
46
|
else:
|
|
45
47
|
logger.setLevel(logging.INFO)
|
|
46
48
|
|
|
49
|
+
def set_redact_pii(value: bool):
    """Switch PII redaction on or off and log the new state.

    Stores ``value`` in the module-level ``redact_pii`` flag, which the
    event-sending code checks before calling ``perform_pii_redaction``.

    Args:
        value: Truthy to enable redaction, falsy to disable it.
    """
    global redact_pii
    redact_pii = value
    # Pick the message first so there is a single logging call.
    message = "PII redaction enabled" if redact_pii else "PII redaction disabled"
    logger.info(message)
|
|
56
|
+
|
|
47
57
|
def start_flush_thread():
|
|
48
58
|
logger.debug("Opening flush thread")
|
|
49
59
|
global flush_thread
|
|
@@ -177,6 +187,10 @@ def track_ai(
|
|
|
177
187
|
payload.properties["$context"] = _get_context()
|
|
178
188
|
|
|
179
189
|
data = payload.model_dump(mode="json")
|
|
190
|
+
|
|
191
|
+
# Apply PII redaction if enabled
|
|
192
|
+
if redact_pii:
|
|
193
|
+
data = perform_pii_redaction(data)
|
|
180
194
|
|
|
181
195
|
size = _get_size(data)
|
|
182
196
|
if size > max_ingest_size_bytes:
|
|
@@ -399,6 +413,11 @@ def _flush_partial_event(event_id: str) -> None:
|
|
|
399
413
|
|
|
400
414
|
# convert to ordinary TrackAIEvent-ish dict before send
|
|
401
415
|
data = evt.model_dump(mode="json", exclude_none=True)
|
|
416
|
+
|
|
417
|
+
# Apply PII redaction if enabled
|
|
418
|
+
if redact_pii:
|
|
419
|
+
data = perform_pii_redaction(data)
|
|
420
|
+
|
|
402
421
|
size = _get_size(data)
|
|
403
422
|
if size > max_ingest_size_bytes:
|
|
404
423
|
logger.warning(f"[raindrop] partial event {event_id} > 1 MB; skipping")
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from typing import Dict, Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PIIRedactor:
    """Regex-based redactor that replaces PII in text with ``<REDACTED_*>`` tags.

    Constructing an instance loads ``well-known-names.json`` from this
    module's directory and compiles every pattern once. ``redact`` is the
    main entry point; ``redact_names`` handles only the name heuristics.
    """

    def __init__(self):
        # Load well-known names
        well_known_names_path = os.path.join(os.path.dirname(__file__), 'well-known-names.json')
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(well_known_names_path, 'r', encoding='utf-8') as f:
            self.well_known_names = json.load(f)

        # Build regex patterns
        self._build_patterns()

    def _build_patterns(self):
        """Compile the regex patterns used for PII detection.

        Reads ``self.well_known_names`` and stores one compiled pattern per
        PII category as an instance attribute.
        """
        # Email pattern. The TLD class is plain letters; the previous
        # ``[A-Z|a-z]`` also accepted a literal "|" and over-matched.
        self.email_pattern = re.compile(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            re.IGNORECASE
        )

        # Credit card pattern (basic - matches 13-19 digits with optional spaces/dashes)
        self.credit_card_pattern = re.compile(
            r'\b(?:\d[ -]*?){13,19}\b'
        )

        # Phone number pattern (US-style, but flexible)
        self.phone_pattern = re.compile(
            r'(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'
        )

        # SSN pattern
        self.ssn_pattern = re.compile(
            r'\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b'
        )

        # Password/secret pattern (matches patterns like "password: xxx" or "secret: xxx")
        self.password_pattern = re.compile(
            r'\b(pass(word|phrase)?|secret|pwd|passwd)\s*[:=]\s*\S+',
            re.IGNORECASE
        )

        # Street address pattern (simplified)
        self.address_pattern = re.compile(
            r'\b\d+\s+[A-Za-z\s]+\s+(street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|plaza|pl|terrace|ter|place|pl|way|parkway|pkwy)\b',
            re.IGNORECASE
        )

        # Greeting/closing patterns for name detection.
        # "hey there" must precede "hey": alternation takes the first
        # alternative that matches, so the shorter one would always win.
        self.greeting_pattern = re.compile(
            r'(^|\.\s+)(dear|hi|hello|greetings|hey there|hey)[\s-]*',
            re.IGNORECASE
        )

        self.closing_pattern = re.compile(
            r'(thx|thanks|thank you|regards|best|[a-z]+ly|[a-z]+ regards|all the best|happy [a-z]+ing|take care|have a [a-z]+ (weekend|night|day))\s*[,.!]*',
            re.IGNORECASE
        )

        # Generic name pattern (capitalized words).
        # Not referenced by the methods of this class; kept as a public attribute.
        self.generic_name_pattern = re.compile(
            r'( ?(([A-Z][a-z]+)|([A-Z]\.))+)([,.]|[,.]?$)',
            re.MULTILINE
        )

        # Well-known names pattern. Empty entries are dropped, and an empty
        # list compiles to a never-matching pattern: an empty alternation
        # would match the empty string at every word boundary.
        escaped_names = [re.escape(name) for name in self.well_known_names if name]
        if escaped_names:
            names_pattern_str = r'\b(' + '|'.join(escaped_names) + r')\b'
        else:
            names_pattern_str = r'(?!)'
        self.well_known_names_pattern = re.compile(names_pattern_str, re.IGNORECASE)

        # Credentials pattern (API keys, tokens, etc.).
        # An optional auth scheme word after the separator is consumed so that
        # "Authorization: Bearer <token>" redacts the token itself, and the
        # value class includes the punctuation found in JWT/base64 tokens.
        self.credentials_pattern = re.compile(
            r'\b(api[_-]?key|token|bearer|authorization|auth[_-]?token|access[_-]?token|secret[_-]?key)'
            r'\s*[:=]\s*(?:(?:bearer|basic|token)\s+)?["\']?[\w\-.~+/=]+["\']?',
            re.IGNORECASE
        )

    def redact_names(self, text: str) -> str:
        """Redact names using greeting/closing context and well-known names.

        Args:
            text: The text to process. Non-string values are returned unchanged.

        Returns:
            The text with detected names replaced by ``<REDACTED_NAME>``.
        """
        if not isinstance(text, str):
            return text

        # First, redact well-known names
        text = self.well_known_names_pattern.sub('<REDACTED_NAME>', text)

        # Find names after greetings
        greeting_matches = list(self.greeting_pattern.finditer(text))
        for match in reversed(greeting_matches):  # Process in reverse to maintain positions
            # Look for capitalized words after the greeting
            start_pos = match.end()
            # Find the next word(s) that could be a name
            name_match = re.match(r'\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text[start_pos:])
            if name_match:
                # Replace the name
                name_start = start_pos + name_match.start(1)
                name_end = start_pos + name_match.end(1)
                text = text[:name_start] + '<REDACTED_NAME>' + text[name_end:]

        # Find names before closings.
        # NOTE(review): only text *before* the closing on the same line is
        # inspected; a name following the closing on that line (e.g.
        # "Thanks, John") is not handled by this step - confirm this is intended.
        lines = text.split('\n')
        for i, line in enumerate(lines):
            closing_match = self.closing_pattern.search(line)
            if closing_match:
                # Look for names before the closing
                before_closing = line[:closing_match.start()]
                # Check if there's a name at the end of the text before closing
                name_before_closing = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*$', before_closing)
                if name_before_closing:
                    lines[i] = (before_closing[:name_before_closing.start(1)] +
                                '<REDACTED_NAME>' +
                                before_closing[name_before_closing.end(1):] +
                                line[closing_match.start():])

        text = '\n'.join(lines)

        # Treat standalone name-like lines (such as signatures) as names.
        # Only applies to short lines made up solely of capitalized words.
        lines = text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip()
            # Check if line looks like a signature (short, starts with capital, no other context)
            if (len(stripped) < 50 and
                    re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*[,.]?$', stripped) and
                    '<REDACTED_NAME>' not in line):
                lines[i] = line.replace(stripped, '<REDACTED_NAME>')

        text = '\n'.join(lines)

        return text

    def redact(self, text: str) -> str:
        """Redact PII from the given text using regex patterns.

        Args:
            text: The text to redact. Non-string values are returned unchanged.

        Returns:
            The text with each detected item replaced by its category tag.
        """
        if not isinstance(text, str):
            return text

        # Apply redactions in order
        # Credentials
        text = self.credentials_pattern.sub('<REDACTED_CREDENTIALS>', text)

        # Credit card numbers
        text = self.credit_card_pattern.sub('<REDACTED_CREDIT_CARD>', text)

        # Email addresses
        text = self.email_pattern.sub('<REDACTED_EMAIL>', text)

        # Phone numbers
        text = self.phone_pattern.sub('<REDACTED_PHONE>', text)

        # SSN
        text = self.ssn_pattern.sub('<REDACTED_SSN>', text)

        # Passwords/secrets
        text = self.password_pattern.sub('<REDACTED_SECRET>', text)

        # Street addresses
        text = self.address_pattern.sub('<REDACTED_ADDRESS>', text)

        # Names
        text = self.redact_names(text)

        # Note: IPs, URLs, usernames, and zipcodes are disabled by default
        # to match JS SDK behavior

        return text
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# Shared redactor, created on first use by _get_redactor().
_shared_redactor = None


def _get_redactor():
    """Return the shared PIIRedactor, constructing it on first use."""
    global _shared_redactor
    if _shared_redactor is None:
        # Construction reads the names file and compiles every pattern,
        # so it is done once rather than once per event.
        _shared_redactor = PIIRedactor()
    return _shared_redactor


def perform_pii_redaction(event_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Redact PII from event data, specifically targeting ai_data input and output fields.

    The top-level dict and its ``ai_data`` dict are shallow-copied, so the
    caller's objects are not modified. Events without a truthy ``ai_data``
    are returned as a copy without building a redactor.

    Args:
        event_data: The event data dictionary to redact PII from

    Returns:
        The event data with PII redacted
    """
    # Create a copy to avoid modifying the original
    event_copy = event_data.copy()

    ai_data = event_copy.get('ai_data')
    if not ai_data:
        # Nothing to redact
        return event_copy

    redactor = _get_redactor()
    ai_data = ai_data.copy()

    # Only non-empty input/output values are passed through the redactor
    for field in ('input', 'output'):
        if ai_data.get(field):
            ai_data[field] = redactor.redact(ai_data[field])

    event_copy['ai_data'] = ai_data
    return event_copy
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; imported as VERSION by raindrop.analytics.
VERSION = "0.0.26"
|