langchain 1.0.0rc2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ """Shared redaction utilities for middleware components."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import ipaddress
7
+ import re
8
+ from collections.abc import Callable, Sequence
9
+ from dataclasses import dataclass
10
+ from typing import Literal
11
+ from urllib.parse import urlparse
12
+
13
+ from typing_extensions import TypedDict
14
+
15
RedactionStrategy = Literal["block", "redact", "mask", "hash"]
"""Names of the ways a detected sensitive value can be handled.

``"block"`` raises ``PIIDetectionError``; ``"redact"``, ``"mask"`` and ``"hash"``
each substitute replacement text for every match (see ``apply_strategy``).
"""
17
+
18
+
19
class PIIMatch(TypedDict):
    """One occurrence of sensitive data located inside a piece of text."""

    # Name of the PII category assigned by the detector (e.g. "email").
    type: str
    # The exact substring that was matched.
    value: str
    # Index of the first matched character in the scanned text.
    start: int
    # Index one past the last matched character (slice-style end).
    end: int
26
+
27
+
28
class PIIDetectionError(Exception):
    """Raised when configured to block on detected sensitive values.

    Attributes:
        pii_type: Name of the sensitive type that triggered the error.
        matches: A list copy of the matches passed to the constructor.
    """

    def __init__(self, pii_type: str, matches: Sequence[PIIMatch]) -> None:
        """Initialize the exception with match context.

        Args:
            pii_type: Name of the detected sensitive type.
            matches: All matches that were detected for that type.
        """
        self.pii_type = pii_type
        # Copy so later mutation of the caller's sequence cannot alter the error.
        self.matches = list(matches)
        count = len(self.matches)
        msg = f"Detected {count} instance(s) of {pii_type} in text content"
        super().__init__(msg)

    def __reduce__(self) -> tuple[type[PIIDetectionError], tuple[str, list[PIIMatch]]]:
        """Describe how to rebuild this error for ``copy`` and ``pickle``.

        The inherited implementation rebuilds from ``self.args``, which holds only
        the formatted message, and therefore fails because ``__init__`` requires
        two arguments. Supplying the real constructor arguments fixes that.
        """
        return (type(self), (self.pii_type, self.matches))
43
+
44
+
45
Detector = Callable[[str], list[PIIMatch]]
"""Signature of a detector: takes the text to scan, returns the matches found in it."""
47
+
48
+
49
def detect_email(content: str) -> list[PIIMatch]:
    """Detect email addresses in content.

    Args:
        content: Text to scan.

    Returns:
        One match per address found, in order of appearance.
    """
    # The top-level-domain class is ``[A-Za-z]``. A ``|`` inside a character class
    # is a literal pipe rather than alternation, so it must not appear there.
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    # PIIMatch is a TypedDict, so a dict literal is the same runtime value as
    # ``PIIMatch(...)``.
    return [
        {
            "type": "email",
            "value": match.group(),
            "start": match.start(),
            "end": match.end(),
        }
        for match in re.finditer(pattern, content)
    ]
61
+
62
+
63
def detect_credit_card(content: str) -> list[PIIMatch]:
    """Find card numbers in content, keeping only those that pass the Luhn check.

    Candidates are sixteen digits written as four groups of four, where each
    group boundary may carry a single whitespace character or hyphen.

    Args:
        content: Text to scan.

    Returns:
        Matches for every candidate accepted by ``_passes_luhn``, in order of
        appearance.
    """
    card_regex = r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"
    return [
        {
            "type": "credit_card",
            "value": candidate.group(),
            "start": candidate.start(),
            "end": candidate.end(),
        }
        for candidate in re.finditer(card_regex, content)
        if _passes_luhn(candidate.group())
    ]
81
+
82
+
83
def detect_ip(content: str) -> list[PIIMatch]:
    """Detect IPv4 addresses in content.

    Only dotted-quad candidates are searched for; this function contains no
    IPv6 pattern. Each candidate is passed to ``ipaddress.ip_address`` and is
    dropped when that call raises ``ValueError`` (for example an octet above 255).

    Args:
        content: Text to scan.

    Returns:
        Matches for every accepted candidate, in order of appearance.
    """
    dotted_quad = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b"

    def is_address(text: str) -> bool:
        # Let the stdlib decide whether the candidate is a real address.
        try:
            ipaddress.ip_address(text)
        except ValueError:
            return False
        return True

    return [
        {
            "type": "ip",
            "value": candidate.group(),
            "start": candidate.start(),
            "end": candidate.end(),
        }
        for candidate in re.finditer(dotted_quad, content)
        if is_address(candidate.group())
    ]
104
+
105
+
106
def detect_mac_address(content: str) -> list[PIIMatch]:
    """Locate MAC addresses in content.

    A candidate is six pairs of hexadecimal digits, with ``:`` or ``-`` after
    each of the first five pairs.

    Args:
        content: Text to scan.

    Returns:
        One match per address found, in order of appearance.
    """
    mac_regex = r"\b([0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b"
    found: list[PIIMatch] = []
    for hit in re.finditer(mac_regex, content):
        found.append(
            {
                "type": "mac_address",
                "value": hit.group(),
                "start": hit.start(),
                "end": hit.end(),
            }
        )
    return found
118
+
119
+
120
def detect_url(content: str) -> list[PIIMatch]:
    """Detect URLs in content using regex and stdlib validation.

    Two passes are made. The first collects ``http://`` / ``https://`` URLs.
    The second looks for scheme-less host names and keeps one only when it
    starts with ``www.`` or contains a ``/``, and only when it does not
    intersect a match that has already been recorded.

    Args:
        content: Text to scan.

    Returns:
        Scheme URLs in order of appearance, followed by the accepted
        scheme-less URLs in order of appearance. Returned matches never
        overlap one another.
    """
    matches: list[PIIMatch] = []

    # Pattern 1: URLs with scheme (http:// or https://)
    scheme_pattern = r"https?://[^\s<>\"{}|\\^`\[\]]+"

    for match in re.finditer(scheme_pattern, content):
        url = match.group()
        result = urlparse(url)
        if result.scheme in ("http", "https") and result.netloc:
            # PIIMatch is a TypedDict, so a dict literal is the same runtime
            # value as ``PIIMatch(...)``.
            matches.append(
                {
                    "type": "url",
                    "value": url,
                    "start": match.start(),
                    "end": match.end(),
                }
            )

    # Pattern 2: URLs without scheme (www.example.com or example.com/path)
    # More conservative to avoid false positives
    bare_pattern = (
        r"\b(?:www\.)?[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
        r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+(?:/[^\s]*)?"
    )

    for match in re.finditer(bare_pattern, content):
        start, end = match.start(), match.end()
        # Skip anything that intersects an existing match. Two half-open spans
        # intersect exactly when each one starts before the other ends; this
        # also covers a bare match that strictly contains a scheme match
        # (e.g. "example.com/a?u=http://b.com|tail"), which would otherwise
        # yield overlapping matches and corrupt offset-based replacement.
        if any(start < m["end"] and m["start"] < end for m in matches):
            continue

        url = match.group()
        # Only accept if it has a path or starts with www
        # This reduces false positives like "example.com" in prose
        if "/" in url or url.startswith("www."):
            # Add scheme for validation (required for urlparse to work correctly)
            test_url = f"http://{url}"
            result = urlparse(test_url)
            if result.netloc and "." in result.netloc:
                matches.append(
                    {
                        "type": "url",
                        "value": url,
                        "start": start,
                        "end": end,
                    }
                )

    return matches
171
+
172
+
173
# Maps each built-in PII type name to the function that detects it.
BUILTIN_DETECTORS: dict[str, Detector] = {
    "email": detect_email,
    "credit_card": detect_credit_card,
    "ip": detect_ip,
    "mac_address": detect_mac_address,
    "url": detect_url,
}
"""Built-in detectors, looked up by type name when no custom detector is given."""
181
+
182
+
183
def _passes_luhn(card_number: str) -> bool:
    """Report whether the digits in ``card_number`` satisfy the Luhn checksum.

    Non-digit characters (separators) are ignored. Inputs with fewer than 13 or
    more than 19 digits are rejected outright.

    Args:
        card_number: Candidate number, possibly containing separators.

    Returns:
        ``True`` when the digit count is in range and the checksum is a
        multiple of ten, otherwise ``False``.
    """
    digits = [int(ch) for ch in card_number if ch.isdigit()]
    if len(digits) < 13 or len(digits) > 19:
        return False

    total = 0
    # Walk from the rightmost digit; every second digit is doubled, and a
    # two-digit product is reduced by 9 (equivalent to summing its digits).
    for position, digit in enumerate(digits[::-1]):
        if position % 2 == 0:
            total += digit
            continue
        doubled = digit * 2
        total += doubled - 9 if doubled > 9 else doubled
    return total % 10 == 0
198
+
199
+
200
def _apply_redact_strategy(content: str, matches: list[PIIMatch]) -> str:
    """Replace every match with a ``[REDACTED_<TYPE>]`` placeholder.

    Args:
        content: Text the matches refer to.
        matches: Matches whose ``start``/``end`` index into ``content``.

    Returns:
        The text with each matched span replaced.
    """
    # Work from the end of the text backwards so that earlier offsets stay
    # valid while later spans change length.
    ordered = sorted(matches, key=lambda item: item["start"], reverse=True)
    for found in ordered:
        head = content[: found["start"]]
        tail = content[found["end"] :]
        content = f"{head}[REDACTED_{found['type'].upper()}]{tail}"
    return content
206
+
207
+
208
def _masked_form(pii_type: str, value: str) -> str:
    """Return the partially hidden rendering of ``value`` for ``pii_type``."""
    if pii_type == "email":
        parts = value.split("@")
        if len(parts) != 2:
            return "****"
        local, domain = parts
        labels = domain.split(".")
        # The local part and the final domain label stay visible.
        if len(labels) >= 2:
            return f"{local}@****.{labels[-1]}"
        return f"{local}@****"

    if pii_type == "credit_card":
        last_four = "".join(ch for ch in value if ch.isdigit())[-4:]
        # Reuse the separator found in the value; a hyphen wins over a space.
        if "-" in value:
            sep = "-"
        elif " " in value:
            sep = " "
        else:
            return f"************{last_four}"
        return f"****{sep}****{sep}****{sep}{last_four}"

    if pii_type == "ip":
        octets = value.split(".")
        return f"*.*.*.{octets[-1]}" if len(octets) == 4 else "****"

    if pii_type == "mac_address":
        sep = ":" if ":" in value else "-"
        # Five hidden pairs followed by the last two characters of the value.
        return sep.join(["**"] * 5 + [value[-2:]])

    if pii_type == "url":
        return "[MASKED_URL]"

    # Unknown types keep at most their last four characters.
    return f"****{value[-4:]}" if len(value) > 4 else "****"


def _apply_mask_strategy(content: str, matches: list[PIIMatch]) -> str:
    """Replace every match with a partially hidden form of its value.

    The form depends on the match ``type``: emails hide the domain name, card
    numbers and unknown types keep their last four characters, IPv4 addresses
    keep the last octet, MAC addresses keep the last two characters, and URLs
    become ``[MASKED_URL]``.

    Args:
        content: Text the matches refer to.
        matches: Matches whose ``start``/``end`` index into ``content``.

    Returns:
        The text with each matched span replaced by its masked form.
    """
    # Replace from the end backwards so earlier offsets remain valid.
    ordered = sorted(matches, key=lambda item: item["start"], reverse=True)
    for found in ordered:
        replacement = _masked_form(found["type"], found["value"])
        content = content[: found["start"]] + replacement + content[found["end"] :]
    return content
245
+
246
+
247
def _apply_hash_strategy(content: str, matches: list[PIIMatch]) -> str:
    """Replace every match with ``<type_hash:xxxxxxxx>``.

    The eight characters are the start of the hexadecimal SHA-256 digest of
    the matched value, so equal values always produce the same placeholder.

    Args:
        content: Text the matches refer to.
        matches: Matches whose ``start``/``end`` index into ``content``.

    Returns:
        The text with each matched span replaced.
    """
    # Replace from the end backwards so earlier offsets remain valid.
    ordered = sorted(matches, key=lambda item: item["start"], reverse=True)
    for found in ordered:
        raw = found["value"].encode()
        short_digest = hashlib.sha256(raw).hexdigest()[:8]
        placeholder = f"<{found['type']}_hash:{short_digest}>"
        content = "".join((content[: found["start"]], placeholder, content[found["end"] :]))
    return content
254
+
255
+
256
def apply_strategy(
    content: str,
    matches: list[PIIMatch],
    strategy: RedactionStrategy,
) -> str:
    """Apply the configured strategy to matches within content.

    Args:
        content: Text the matches refer to.
        matches: Matches to act on. When empty, ``content`` is returned
            unchanged and ``strategy`` is not inspected.
        strategy: One of ``"redact"``, ``"mask"``, ``"hash"`` or ``"block"``.

    Returns:
        The rewritten text for the ``"redact"``, ``"mask"`` and ``"hash"``
        strategies.

    Raises:
        PIIDetectionError: If ``strategy`` is ``"block"``; it carries the type
            of the first match together with all matches.
        ValueError: If ``strategy`` is not a recognised name.
    """
    if not matches:
        return content

    if strategy == "block":
        raise PIIDetectionError(matches[0]["type"], matches)

    if strategy == "redact":
        transform = _apply_redact_strategy
    elif strategy == "mask":
        transform = _apply_mask_strategy
    elif strategy == "hash":
        transform = _apply_hash_strategy
    else:
        msg = f"Unknown redaction strategy: {strategy}"
        raise ValueError(msg)
    return transform(content, matches)
274
+
275
+
276
def resolve_detector(pii_type: str, detector: Detector | str | None) -> Detector:
    """Turn a detector configuration into a callable detector.

    Args:
        pii_type: Type name; selects the built-in detector when ``detector`` is
            ``None`` and labels the matches of a regex-based detector.
        detector: ``None`` to use a built-in detector, a regular-expression
            string to build a detector from, or a ready-made callable.

    Returns:
        The built-in detector, a new regex-backed detector, or ``detector``
        itself when it is neither ``None`` nor a string.

    Raises:
        ValueError: If ``detector`` is ``None`` and ``pii_type`` has no
            built-in detector.
    """
    if detector is None:
        builtin = BUILTIN_DETECTORS.get(pii_type)
        if builtin is not None:
            return builtin
        msg = (
            f"Unknown PII type: {pii_type}. "
            f"Must be one of {list(BUILTIN_DETECTORS.keys())} or provide a custom detector."
        )
        raise ValueError(msg)

    if not isinstance(detector, str):
        # Already a callable supplied by the caller; hand it back untouched.
        return detector

    # Compile once here so the returned detector does not recompile per call.
    compiled = re.compile(detector)

    def regex_detector(content: str) -> list[PIIMatch]:
        return [
            {
                "type": pii_type,
                "value": hit.group(),
                "start": hit.start(),
                "end": hit.end(),
            }
            for hit in compiled.finditer(content)
        ]

    return regex_detector
302
+
303
+
304
@dataclass(frozen=True)
class RedactionRule:
    """User-facing configuration describing how one PII type is handled."""

    # Type name; also used to pick a built-in detector when none is given.
    pii_type: str
    # What to do with detected values; defaults to placeholder replacement.
    strategy: RedactionStrategy = "redact"
    # Custom callable, regex string, or None to fall back to a built-in detector.
    detector: Detector | str | None = None

    def resolve(self) -> ResolvedRedactionRule:
        """Build the executable form of this rule.

        Returns:
            A ``ResolvedRedactionRule`` with the same type and strategy, whose
            detector is whatever ``resolve_detector`` produces for this rule.
        """
        return ResolvedRedactionRule(
            detector=resolve_detector(self.pii_type, self.detector),
            pii_type=self.pii_type,
            strategy=self.strategy,
        )
320
+
321
+
322
@dataclass(frozen=True)
class ResolvedRedactionRule:
    """A redaction rule whose detector is already a callable."""

    # Type name this rule is responsible for.
    pii_type: str
    # Strategy handed to ``apply_strategy`` when matches are found.
    strategy: RedactionStrategy
    # Callable that scans text and returns its matches.
    detector: Detector

    def apply(self, content: str) -> tuple[str, list[PIIMatch]]:
        """Run the detector over ``content`` and act on what it finds.

        Args:
            content: Text to scan.

        Returns:
            A pair of the resulting text and the matches found. With no
            matches the text is returned unchanged with an empty list.
        """
        found = self.detector(content)
        if found:
            return apply_strategy(content, found, self.strategy), found
        return content, []
337
+
338
+
339
# Names exported by ``from <module> import *``.
__all__ = [
    "PIIDetectionError",
    "PIIMatch",
    "RedactionRule",
    "ResolvedRedactionRule",
    "apply_strategy",
    "detect_credit_card",
    "detect_email",
    "detect_ip",
    "detect_mac_address",
    "detect_url",
]