raindrop-ai 0.0.25__tar.gz → 0.0.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/PKG-INFO +1 -1
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/pyproject.toml +1 -1
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/raindrop/redact.py +66 -0
- raindrop_ai-0.0.26/raindrop/version.py +1 -0
- raindrop_ai-0.0.26/raindrop/well-known-names.json +11548 -0
- raindrop_ai-0.0.25/raindrop/version.py +0 -1
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/README.md +0 -0
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/raindrop/__init__.py +0 -0
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/raindrop/analytics.py +0 -0
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/raindrop/interaction.py +0 -0
- {raindrop_ai-0.0.25 → raindrop_ai-0.0.26}/raindrop/models.py +0 -0
|
@@ -8,6 +8,10 @@ class PIIRedactor:
|
|
|
8
8
|
"""PII redactor that uses regex patterns to identify and replace PII."""
|
|
9
9
|
|
|
10
10
|
def __init__(self):
|
|
11
|
+
# Load well-known names
|
|
12
|
+
well_known_names_path = os.path.join(os.path.dirname(__file__), 'well-known-names.json')
|
|
13
|
+
with open(well_known_names_path, 'r') as f:
|
|
14
|
+
self.well_known_names = json.load(f)
|
|
11
15
|
|
|
12
16
|
# Build regex patterns
|
|
13
17
|
self._build_patterns()
|
|
@@ -64,6 +68,9 @@ class PIIRedactor:
|
|
|
64
68
|
re.MULTILINE
|
|
65
69
|
)
|
|
66
70
|
|
|
71
|
+
# Well-known names pattern
|
|
72
|
+
names_pattern_str = r'\b(' + '|'.join(re.escape(name) for name in self.well_known_names) + r')\b'
|
|
73
|
+
self.well_known_names_pattern = re.compile(names_pattern_str, re.IGNORECASE)
|
|
67
74
|
|
|
68
75
|
# Credentials pattern (API keys, tokens, etc.)
|
|
69
76
|
self.credentials_pattern = re.compile(
|
|
@@ -71,6 +78,61 @@ class PIIRedactor:
|
|
|
71
78
|
re.IGNORECASE
|
|
72
79
|
)
|
|
73
80
|
|
|
81
|
+
|
|
82
|
+
def redact_names(self, text: str) -> str:
    """Redact personal names using well-known names and greeting/closing context.

    Passes are applied in this order:

    1. Every match of ``self.well_known_names_pattern`` is replaced.
    2. A run of capitalized words that directly follows a match of
       ``self.greeting_pattern`` is replaced.
    3. Per line, a run of capitalized words that directly precedes the
       first match of ``self.closing_pattern`` is replaced.
    4. Per line, a short line (under 50 characters once stripped) made up
       solely of capitalized words, optionally ending in ``,`` or ``.``,
       is treated as a signature and replaced.

    NOTE(review): ``greeting_pattern`` and ``closing_pattern`` are compiled
    outside this method; what they match is assumed from their names.

    Args:
        text: Text to scan. Any non-``str`` value is returned unchanged.

    Returns:
        The text with each detected name replaced by ``<REDACTED_NAME>``.
        Surrounding whitespace and punctuation are preserved.
    """
    if not isinstance(text, str):
        return text

    placeholder = '<REDACTED_NAME>'

    # One or more capitalized words. The parts of a name must sit on the
    # same line: allowing a line break between them would swallow the
    # newline and the first capitalized word of the following line.
    name_run = r'[A-Z][a-z]+(?:[^\S\r\n]+[A-Z][a-z]+)*'
    # Compiled once per call instead of once per loop iteration.
    name_after_greeting = re.compile(r'\s*(' + name_run + r')')
    name_before_closing = re.compile(r'(' + name_run + r')\s*$')
    signature_line = re.compile(r'^' + name_run + r'([,.]?)$')

    # Pass 1: well-known names.
    text = self.well_known_names_pattern.sub(placeholder, text)

    # Pass 2: names after greetings. Matches are collected up front and
    # processed right-to-left so earlier match offsets stay valid while
    # the text is being rewritten.
    greetings = list(self.greeting_pattern.finditer(text))
    for greeting in reversed(greetings):
        # Matching with an explicit start position avoids copying the
        # tail of the text for every greeting.
        found = name_after_greeting.match(text, greeting.end())
        if found:
            text = text[:found.start(1)] + placeholder + text[found.end(1):]

    # Passes 3 and 4 are both line-local, so they share a single split/join.
    lines = text.split('\n')
    for index, line in enumerate(lines):
        # Pass 3: name immediately before a closing phrase.
        closing = self.closing_pattern.search(line)
        if closing:
            head = line[:closing.start()]
            found = name_before_closing.search(head)
            if found:
                line = (head[:found.start(1)] + placeholder +
                        head[found.end(1):] + line[closing.start():])

        # Pass 4: standalone signature line. A line that already holds the
        # placeholder can never match here because the pattern admits only
        # letters, whitespace and one trailing ',' or '.'.
        stripped = line.strip()
        if len(stripped) < 50:
            signature = signature_line.match(stripped)
            if signature:
                # Keep the trailing punctuation; only the name is redacted.
                line = line.replace(stripped, placeholder + signature.group(1))

        lines[index] = line

    return '\n'.join(lines)
|
|
134
|
+
|
|
135
|
+
|
|
74
136
|
def redact(self, text: str) -> str:
|
|
75
137
|
"""Redact PII from the given text using regex patterns."""
|
|
76
138
|
if not isinstance(text, str):
|
|
@@ -98,6 +160,10 @@ class PIIRedactor:
|
|
|
98
160
|
# Street addresses
|
|
99
161
|
text = self.address_pattern.sub('<REDACTED_ADDRESS>', text)
|
|
100
162
|
|
|
163
|
+
# Names
|
|
164
|
+
text = self.redact_names(text)
|
|
165
|
+
|
|
166
|
+
|
|
101
167
|
# Note: IPs, URLs, usernames, and zipcodes are disabled by default
|
|
102
168
|
# to match JS SDK behavior
|
|
103
169
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the raindrop-ai package for this release.
VERSION = "0.0.26"
|