posthog 7.5.1__py3-none-any.whl → 7.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,522 @@
1
+ import os
2
+ import unittest
3
+
4
+ from posthog.ai.sanitization import (
5
+ redact_base64_data_url,
6
+ sanitize_openai,
7
+ sanitize_openai_response,
8
+ sanitize_anthropic,
9
+ sanitize_gemini,
10
+ sanitize_langchain,
11
+ is_base64_data_url,
12
+ is_raw_base64,
13
+ REDACTED_IMAGE_PLACEHOLDER,
14
+ )
15
+
16
+
17
class TestSanitization(unittest.TestCase):
    """Tests for the provider-specific base64 sanitization helpers.

    All assertions in this class expect *redaction* to happen, which is only
    true while the ``_INTERNAL_LLMA_MULTIMODAL`` escape hatch is disabled.
    ``setUp`` therefore clears the variable (saving any pre-existing value)
    and ``tearDown`` restores it, so the tests pass regardless of the
    caller's environment and leak no state out of the suite.
    """

    def setUp(self):
        # Clear the multimodal flag so redaction is active; keep the caller's
        # value so tearDown can put it back exactly as it was.
        self._saved_multimodal = os.environ.pop("_INTERNAL_LLMA_MULTIMODAL", None)
        # Representative fixtures: two base64 data URLs, one plain URL, and
        # one raw (un-prefixed) base64 payload.
        self.sample_base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
        self.sample_base64_png = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA..."
        self.regular_url = "https://example.com/image.jpg"
        self.raw_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUl=="

    def tearDown(self):
        # Restore whatever value (or absence) the environment had before.
        if self._saved_multimodal is not None:
            os.environ["_INTERNAL_LLMA_MULTIMODAL"] = self._saved_multimodal
        else:
            os.environ.pop("_INTERNAL_LLMA_MULTIMODAL", None)

    def test_is_base64_data_url(self):
        """Data URLs are detected; plain URLs and text are not."""
        self.assertTrue(is_base64_data_url(self.sample_base64_image))
        self.assertTrue(is_base64_data_url(self.sample_base64_png))
        self.assertFalse(is_base64_data_url(self.regular_url))
        self.assertFalse(is_base64_data_url("regular text"))

    def test_is_raw_base64(self):
        """Long raw base64 is detected; short strings, URLs and paths are not."""
        self.assertTrue(is_raw_base64(self.raw_base64))
        self.assertFalse(is_raw_base64("short"))
        self.assertFalse(is_raw_base64(self.regular_url))
        self.assertFalse(is_raw_base64("/path/to/file"))

    def test_redact_base64_data_url(self):
        """Data URLs are replaced; non-matching and non-string values pass through."""
        self.assertEqual(
            redact_base64_data_url(self.sample_base64_image), REDACTED_IMAGE_PLACEHOLDER
        )
        self.assertEqual(
            redact_base64_data_url(self.sample_base64_png), REDACTED_IMAGE_PLACEHOLDER
        )
        self.assertEqual(redact_base64_data_url(self.regular_url), self.regular_url)
        self.assertEqual(redact_base64_data_url(None), None)
        self.assertEqual(redact_base64_data_url(123), 123)

    def test_sanitize_openai(self):
        """OpenAI chat format: the image URL is redacted, siblings are preserved."""
        input_data = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": self.sample_base64_image,
                            "detail": "high",
                        },
                    },
                ],
            }
        ]

        result = sanitize_openai(input_data)

        self.assertEqual(result[0]["content"][0]["text"], "What is in this image?")
        self.assertEqual(
            result[0]["content"][1]["image_url"]["url"], REDACTED_IMAGE_PLACEHOLDER
        )
        # Non-payload keys such as "detail" must survive sanitization.
        self.assertEqual(result[0]["content"][1]["image_url"]["detail"], "high")

    def test_sanitize_openai_preserves_regular_urls(self):
        """Plain https image URLs are not base64 and must be left untouched."""
        input_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": self.regular_url},
                    }
                ],
            }
        ]

        result = sanitize_openai(input_data)
        self.assertEqual(result[0]["content"][0]["image_url"]["url"], self.regular_url)

    def test_sanitize_openai_response(self):
        """Responses-API format: "input_image" carries the URL directly."""
        input_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_image",
                        "image_url": self.sample_base64_image,
                    }
                ],
            }
        ]

        result = sanitize_openai_response(input_data)
        self.assertEqual(
            result[0]["content"][0]["image_url"], REDACTED_IMAGE_PLACEHOLDER
        )

    def test_sanitize_anthropic(self):
        """Anthropic format: only source["data"] is redacted; metadata survives."""
        input_data = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": "base64data",
                        },
                    },
                ],
            }
        ]

        result = sanitize_anthropic(input_data)

        self.assertEqual(result[0]["content"][0]["text"], "What is in this image?")
        self.assertEqual(
            result[0]["content"][1]["source"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )
        self.assertEqual(result[0]["content"][1]["source"]["type"], "base64")
        self.assertEqual(result[0]["content"][1]["source"]["media_type"], "image/jpeg")

    def test_sanitize_gemini(self):
        """Gemini format: inline_data["data"] is redacted; mime_type survives."""
        input_data = [
            {
                "parts": [
                    {"text": "What is in this image?"},
                    {
                        "inline_data": {
                            "mime_type": "image/jpeg",
                            "data": "base64data",
                        }
                    },
                ]
            }
        ]

        result = sanitize_gemini(input_data)

        self.assertEqual(result[0]["parts"][0]["text"], "What is in this image?")
        self.assertEqual(
            result[0]["parts"][1]["inline_data"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )
        self.assertEqual(
            result[0]["parts"][1]["inline_data"]["mime_type"], "image/jpeg"
        )

    def test_sanitize_langchain_openai_style(self):
        """LangChain passes OpenAI-shaped image_url parts through the same redaction."""
        input_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": self.sample_base64_image},
                    }
                ],
            }
        ]

        result = sanitize_langchain(input_data)
        self.assertEqual(
            result[0]["content"][0]["image_url"]["url"], REDACTED_IMAGE_PLACEHOLDER
        )

    def test_sanitize_langchain_anthropic_style(self):
        """LangChain also handles Anthropic-shaped source["data"] parts."""
        input_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {"data": "base64data"},
                    }
                ],
            }
        ]

        result = sanitize_langchain(input_data)
        self.assertEqual(
            result[0]["content"][0]["source"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )

    def test_sanitize_with_data_url_format(self):
        # Test that data URLs are properly detected and redacted across providers
        data_url = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD"

        # OpenAI format
        openai_data = [
            {
                "role": "user",
                "content": [{"type": "image_url", "image_url": {"url": data_url}}],
            }
        ]
        result = sanitize_openai(openai_data)
        self.assertEqual(
            result[0]["content"][0]["image_url"]["url"], REDACTED_IMAGE_PLACEHOLDER
        )

        # Anthropic format
        anthropic_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": data_url,
                        },
                    }
                ],
            }
        ]
        result = sanitize_anthropic(anthropic_data)
        self.assertEqual(
            result[0]["content"][0]["source"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )

        # LangChain format
        langchain_data = [
            {"role": "user", "content": [{"type": "image", "data": data_url}]}
        ]
        result = sanitize_langchain(langchain_data)
        self.assertEqual(result[0]["content"][0]["data"], REDACTED_IMAGE_PLACEHOLDER)

    def test_sanitize_with_raw_base64(self):
        # Test that raw base64 strings (without data URL prefix) are detected
        raw_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUl=="

        # Test with Anthropic format
        anthropic_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": raw_base64,
                        },
                    }
                ],
            }
        ]
        result = sanitize_anthropic(anthropic_data)
        self.assertEqual(
            result[0]["content"][0]["source"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )

        # Test with Gemini format
        gemini_data = [
            {"parts": [{"inline_data": {"mime_type": "image/png", "data": raw_base64}}]}
        ]
        result = sanitize_gemini(gemini_data)
        self.assertEqual(
            result[0]["parts"][0]["inline_data"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )

    def test_sanitize_preserves_regular_content(self):
        # Ensure non-base64 content is preserved across all providers
        regular_url = "https://example.com/image.jpg"
        text_content = "What do you see?"

        # OpenAI
        openai_data = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_content},
                    {"type": "image_url", "image_url": {"url": regular_url}},
                ],
            }
        ]
        result = sanitize_openai(openai_data)
        self.assertEqual(result[0]["content"][0]["text"], text_content)
        self.assertEqual(result[0]["content"][1]["image_url"]["url"], regular_url)

        # Anthropic
        anthropic_data = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_content},
                    {"type": "image", "source": {"type": "url", "url": regular_url}},
                ],
            }
        ]
        result = sanitize_anthropic(anthropic_data)
        self.assertEqual(result[0]["content"][0]["text"], text_content)
        # URL-based images should remain unchanged
        self.assertEqual(result[0]["content"][1]["source"]["url"], regular_url)

    def test_sanitize_handles_non_dict_content(self):
        """Plain-string content has nothing to redact and is returned as-is."""
        input_data = [{"role": "user", "content": "Just text"}]

        result = sanitize_openai(input_data)
        self.assertEqual(result, input_data)

    def test_sanitize_handles_none_input(self):
        """None input passes through every provider sanitizer unchanged."""
        self.assertIsNone(sanitize_openai(None))
        self.assertIsNone(sanitize_anthropic(None))
        self.assertIsNone(sanitize_gemini(None))
        self.assertIsNone(sanitize_langchain(None))

    def test_sanitize_handles_single_message(self):
        """A bare message dict (not wrapped in a list) is also sanitized."""
        input_data = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": self.sample_base64_image},
                }
            ],
        }

        result = sanitize_openai(input_data)
        self.assertEqual(
            result["content"][0]["image_url"]["url"], REDACTED_IMAGE_PLACEHOLDER
        )
333
+
334
+
335
class TestAIMultipartRequest(unittest.TestCase):
    """Test that _INTERNAL_LLMA_MULTIMODAL environment variable controls sanitization.

    ``setUp`` clears the flag (saving any pre-existing value) so each test
    starts from the documented default (redaction enabled); ``tearDown``
    restores the original environment instead of unconditionally deleting
    the variable, so running this suite never clobbers the caller's shell
    configuration.
    """

    def setUp(self):
        # Start every test with the flag unset; remember the caller's value.
        self._saved_multimodal = os.environ.pop("_INTERNAL_LLMA_MULTIMODAL", None)

    def tearDown(self):
        # Restore the pre-test environment exactly (value or absence).
        if self._saved_multimodal is not None:
            os.environ["_INTERNAL_LLMA_MULTIMODAL"] = self._saved_multimodal
        else:
            os.environ.pop("_INTERNAL_LLMA_MULTIMODAL", None)

    def test_multimodal_disabled_redacts_images(self):
        """When _INTERNAL_LLMA_MULTIMODAL is not set, images should be redacted."""
        # setUp already cleared the flag; nothing to do here.
        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
        result = redact_base64_data_url(base64_image)
        self.assertEqual(result, REDACTED_IMAGE_PLACEHOLDER)

    def test_multimodal_enabled_preserves_images(self):
        """When _INTERNAL_LLMA_MULTIMODAL is true, images should be preserved."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"

        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
        result = redact_base64_data_url(base64_image)
        self.assertEqual(result, base64_image)

    def test_multimodal_enabled_with_1(self):
        """_INTERNAL_LLMA_MULTIMODAL=1 should enable multimodal."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "1"

        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
        result = redact_base64_data_url(base64_image)
        self.assertEqual(result, base64_image)

    def test_multimodal_enabled_with_yes(self):
        """_INTERNAL_LLMA_MULTIMODAL=yes should enable multimodal."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "yes"

        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
        result = redact_base64_data_url(base64_image)
        self.assertEqual(result, base64_image)

    def test_multimodal_false_redacts_images(self):
        """_INTERNAL_LLMA_MULTIMODAL=false should still redact."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "false"

        base64_image = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQ..."
        result = redact_base64_data_url(base64_image)
        self.assertEqual(result, REDACTED_IMAGE_PLACEHOLDER)

    def test_anthropic_multimodal_enabled(self):
        """Anthropic images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"

        input_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": "base64data",
                        },
                    }
                ],
            }
        ]

        result = sanitize_anthropic(input_data)
        self.assertEqual(result[0]["content"][0]["source"]["data"], "base64data")

    def test_gemini_multimodal_enabled(self):
        """Gemini images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"

        input_data = [
            {
                "parts": [
                    {"inline_data": {"mime_type": "image/jpeg", "data": "base64data"}}
                ]
            }
        ]

        result = sanitize_gemini(input_data)
        self.assertEqual(result[0]["parts"][0]["inline_data"]["data"], "base64data")

    def test_langchain_anthropic_style_multimodal_enabled(self):
        """LangChain Anthropic-style images should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"

        input_data = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {"data": "base64data"},
                    }
                ],
            }
        ]

        result = sanitize_langchain(input_data)
        self.assertEqual(result[0]["content"][0]["source"]["data"], "base64data")

    def test_openai_audio_redacted_by_default(self):
        """OpenAI audio should be redacted when _INTERNAL_LLMA_MULTIMODAL is not set."""
        # setUp already cleared the flag; nothing to do here.
        input_data = [
            {
                "role": "assistant",
                "content": [
                    {"type": "audio", "data": "base64audiodata", "id": "audio_123"}
                ],
            }
        ]

        result = sanitize_openai(input_data)
        self.assertEqual(result[0]["content"][0]["data"], REDACTED_IMAGE_PLACEHOLDER)
        # The audio id is metadata, not payload, and must survive.
        self.assertEqual(result[0]["content"][0]["id"], "audio_123")

    def test_openai_audio_preserved_with_flag(self):
        """OpenAI audio should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"

        input_data = [
            {
                "role": "assistant",
                "content": [
                    {"type": "audio", "data": "base64audiodata", "id": "audio_123"}
                ],
            }
        ]

        result = sanitize_openai(input_data)
        self.assertEqual(result[0]["content"][0]["data"], "base64audiodata")

    def test_gemini_audio_redacted_by_default(self):
        """Gemini audio should be redacted when _INTERNAL_LLMA_MULTIMODAL is not set."""
        # setUp already cleared the flag; nothing to do here.
        input_data = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": "audio/L16;codec=pcm;rate=24000",
                            "data": "base64audiodata",
                        }
                    }
                ]
            }
        ]

        result = sanitize_gemini(input_data)
        self.assertEqual(
            result[0]["parts"][0]["inline_data"]["data"], REDACTED_IMAGE_PLACEHOLDER
        )

    def test_gemini_audio_preserved_with_flag(self):
        """Gemini audio should be preserved when _INTERNAL_LLMA_MULTIMODAL is enabled."""
        os.environ["_INTERNAL_LLMA_MULTIMODAL"] = "true"

        input_data = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": "audio/L16;codec=pcm;rate=24000",
                            "data": "base64audiodata",
                        }
                    }
                ]
            }
        ]

        result = sanitize_gemini(input_data)
        self.assertEqual(
            result[0]["parts"][0]["inline_data"]["data"], "base64audiodata"
        )
519
+
520
+
521
# Allow running this module directly (python test_sanitization.py) in
# addition to discovery via pytest/unittest.
if __name__ == "__main__":
    unittest.main()