posthog 6.7.0__py3-none-any.whl → 6.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,226 @@
+ import re
+ from typing import Any
+ from urllib.parse import urlparse
+
+ REDACTED_IMAGE_PLACEHOLDER = "[base64 image redacted]"
+
+
+ def is_base64_data_url(text: str) -> bool:
+     return re.match(r"^data:([^;]+);base64,", text) is not None
+
+
+ def is_valid_url(text: str) -> bool:
+     try:
+         result = urlparse(text)
+         return bool(result.scheme and result.netloc)
+     except Exception:
+         pass
+
+     return text.startswith(("/", "./", "../"))
+
+
+ def is_raw_base64(text: str) -> bool:
+     if is_valid_url(text):
+         return False
+
+     return len(text) > 20 and re.match(r"^[A-Za-z0-9+/]+=*$", text) is not None
+
+
+ def redact_base64_data_url(value: Any) -> Any:
+     if not isinstance(value, str):
+         return value
+
+     if is_base64_data_url(value):
+         return REDACTED_IMAGE_PLACEHOLDER
+
+     if is_raw_base64(value):
+         return REDACTED_IMAGE_PLACEHOLDER
+
+     return value
+
+
+ def process_messages(messages: Any, transform_content_func) -> Any:
+     if not messages:
+         return messages
+
+     def process_content(content: Any) -> Any:
+         if isinstance(content, str):
+             return content
+
+         if not content:
+             return content
+
+         if isinstance(content, list):
+             return [transform_content_func(item) for item in content]
+
+         return transform_content_func(content)
+
+     def process_message(msg: Any) -> Any:
+         if not isinstance(msg, dict) or "content" not in msg:
+             return msg
+         return {**msg, "content": process_content(msg["content"])}
+
+     if isinstance(messages, list):
+         return [process_message(msg) for msg in messages]
+
+     return process_message(messages)
+
+
+ def sanitize_openai_image(item: Any) -> Any:
+     if not isinstance(item, dict):
+         return item
+
+     if (
+         item.get("type") == "image_url"
+         and isinstance(item.get("image_url"), dict)
+         and "url" in item["image_url"]
+     ):
+         return {
+             **item,
+             "image_url": {
+                 **item["image_url"],
+                 "url": redact_base64_data_url(item["image_url"]["url"]),
+             },
+         }
+
+     return item
+
+
+ def sanitize_openai_response_image(item: Any) -> Any:
+     if not isinstance(item, dict):
+         return item
+
+     if item.get("type") == "input_image" and "image_url" in item:
+         return {
+             **item,
+             "image_url": redact_base64_data_url(item["image_url"]),
+         }
+
+     return item
+
+
+ def sanitize_anthropic_image(item: Any) -> Any:
+     if not isinstance(item, dict):
+         return item
+
+     if (
+         item.get("type") == "image"
+         and isinstance(item.get("source"), dict)
+         and item["source"].get("type") == "base64"
+         and "data" in item["source"]
+     ):
+         # For Anthropic, if the source type is "base64", we should always redact the data
+         # The provider is explicitly telling us this is base64 data
+         return {
+             **item,
+             "source": {
+                 **item["source"],
+                 "data": REDACTED_IMAGE_PLACEHOLDER,
+             },
+         }
+
+     return item
+
+
+ def sanitize_gemini_part(part: Any) -> Any:
+     if not isinstance(part, dict):
+         return part
+
+     if (
+         "inline_data" in part
+         and isinstance(part["inline_data"], dict)
+         and "data" in part["inline_data"]
+     ):
+         # For Gemini, the inline_data structure indicates base64 data
+         # We should redact any string data in this context
+         return {
+             **part,
+             "inline_data": {
+                 **part["inline_data"],
+                 "data": REDACTED_IMAGE_PLACEHOLDER,
+             },
+         }
+
+     return part
+
+
+ def process_gemini_item(item: Any) -> Any:
+     if not isinstance(item, dict):
+         return item
+
+     if "parts" in item and item["parts"]:
+         parts = item["parts"]
+         if isinstance(parts, list):
+             parts = [sanitize_gemini_part(part) for part in parts]
+         else:
+             parts = sanitize_gemini_part(parts)
+
+         return {**item, "parts": parts}
+
+     return item
+
+
+ def sanitize_langchain_image(item: Any) -> Any:
+     if not isinstance(item, dict):
+         return item
+
+     if (
+         item.get("type") == "image_url"
+         and isinstance(item.get("image_url"), dict)
+         and "url" in item["image_url"]
+     ):
+         return {
+             **item,
+             "image_url": {
+                 **item["image_url"],
+                 "url": redact_base64_data_url(item["image_url"]["url"]),
+             },
+         }
+
+     if item.get("type") == "image" and "data" in item:
+         return {**item, "data": redact_base64_data_url(item["data"])}
+
+     if (
+         item.get("type") == "image"
+         and isinstance(item.get("source"), dict)
+         and "data" in item["source"]
+     ):
+         # Anthropic style - raw base64 in structured format, always redact
+         return {
+             **item,
+             "source": {
+                 **item["source"],
+                 "data": REDACTED_IMAGE_PLACEHOLDER,
+             },
+         }
+
+     if item.get("type") == "media" and "data" in item:
+         return {**item, "data": redact_base64_data_url(item["data"])}
+
+     return item
+
+
+ def sanitize_openai(data: Any) -> Any:
+     return process_messages(data, sanitize_openai_image)
+
+
+ def sanitize_openai_response(data: Any) -> Any:
+     return process_messages(data, sanitize_openai_response_image)
+
+
+ def sanitize_anthropic(data: Any) -> Any:
+     return process_messages(data, sanitize_anthropic_image)
+
+
+ def sanitize_gemini(data: Any) -> Any:
+     if not data:
+         return data
+
+     if isinstance(data, list):
+         return [process_gemini_item(item) for item in data]
+
+     return process_gemini_item(data)
+
+
+ def sanitize_langchain(data: Any) -> Any:
+     return process_messages(data, sanitize_langchain_image)
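
A minimal usage sketch of the sanitization helpers added above, assuming they are importable (this diff does not show the file's path, so the import below is a hypothetical placeholder): sanitize_openai() replaces a base64 data URL with REDACTED_IMAGE_PLACEHOLDER while leaving an ordinary https URL untouched.

# Hypothetical import path; the diff does not name the module that defines these helpers.
from posthog.ai.sanitization import REDACTED_IMAGE_PLACEHOLDER, sanitize_openai

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA"},
            },
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/cat.png"},
            },
        ],
    }
]

sanitized = sanitize_openai(messages)

# The data URL matches is_base64_data_url() and is redacted; the https URL
# passes is_valid_url(), so is_raw_base64() returns False and it is kept as-is.
assert sanitized[0]["content"][1]["image_url"]["url"] == REDACTED_IMAGE_PLACEHOLDER
assert sanitized[0]["content"][2]["image_url"]["url"] == "https://example.com/cat.png"
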
posthog/ai/types.py ADDED
@@ -0,0 +1,142 @@
+ """
+ Common type definitions for PostHog AI SDK.
+
+ These types are used for formatting messages and responses across different AI providers
+ (Anthropic, OpenAI, Gemini, etc.) to ensure consistency in tracking and data structure.
+ """
+
+ from typing import Any, Dict, List, Optional, TypedDict, Union
+
+
+ class FormattedTextContent(TypedDict):
+     """Formatted text content item."""
+
+     type: str  # Literal["text"]
+     text: str
+
+
+ class FormattedFunctionCall(TypedDict, total=False):
+     """Formatted function/tool call content item."""
+
+     type: str  # Literal["function"]
+     id: Optional[str]
+     function: Dict[str, Any]  # Contains 'name' and 'arguments'
+
+
+ class FormattedImageContent(TypedDict):
+     """Formatted image content item."""
+
+     type: str  # Literal["image"]
+     image: str
+
+
+ # Union type for all formatted content items
+ FormattedContentItem = Union[
+     FormattedTextContent,
+     FormattedFunctionCall,
+     FormattedImageContent,
+     Dict[str, Any],  # Fallback for unknown content types
+ ]
+
+
+ class FormattedMessage(TypedDict):
+     """
+     Standardized message format for PostHog tracking.
+
+     Used across all providers to ensure consistent message structure
+     when sending events to PostHog.
+     """
+
+     role: str
+     content: Union[str, List[FormattedContentItem], Any]
+
+
+ class TokenUsage(TypedDict, total=False):
+     """
+     Token usage information for AI model responses.
+
+     Different providers may populate different fields.
+     """
+
+     input_tokens: int
+     output_tokens: int
+     cache_read_input_tokens: Optional[int]
+     cache_creation_input_tokens: Optional[int]
+     reasoning_tokens: Optional[int]
+
+
+ class ProviderResponse(TypedDict, total=False):
+     """
+     Standardized provider response format.
+
+     Used for consistent response formatting across all providers.
+     """
+
+     messages: List[FormattedMessage]
+     usage: TokenUsage
+     error: Optional[str]
+
+
+ class StreamingUsageStats(TypedDict, total=False):
+     """
+     Usage statistics collected during streaming.
+
+     Different providers populate different fields during streaming.
+     """
+
+     input_tokens: int
+     output_tokens: int
+     cache_read_input_tokens: Optional[int]
+     cache_creation_input_tokens: Optional[int]
+     reasoning_tokens: Optional[int]
+     # OpenAI-specific names
+     prompt_tokens: Optional[int]
+     completion_tokens: Optional[int]
+     total_tokens: Optional[int]
+
+
+ class StreamingContentBlock(TypedDict, total=False):
+     """
+     Content block used during streaming to accumulate content.
+
+     Used for tracking text and function calls as they stream in.
+     """
+
+     type: str  # "text" or "function"
+     text: Optional[str]
+     id: Optional[str]
+     function: Optional[Dict[str, Any]]
+
+
+ class ToolInProgress(TypedDict):
+     """
+     Tracks a tool/function call being accumulated during streaming.
+
+     Used by Anthropic to accumulate JSON input for tools.
+     """
+
+     block: StreamingContentBlock
+     input_string: str
+
+
+ class StreamingEventData(TypedDict):
+     """
+     Standardized data for streaming events across all providers.
+
+     This type ensures consistent data structure when capturing streaming events,
+     with all provider-specific formatting already completed.
+     """
+
+     provider: str  # "openai", "anthropic", "gemini"
+     model: str
+     base_url: str
+     kwargs: Dict[str, Any]  # Original kwargs for tool extraction and special handling
+     formatted_input: Any  # Provider-formatted input ready for tracking
+     formatted_output: Any  # Provider-formatted output ready for tracking
+     usage_stats: TokenUsage  # Standardized token counts
+     latency: float
+     distinct_id: Optional[str]
+     trace_id: Optional[str]
+     properties: Optional[Dict[str, Any]]
+     privacy_mode: bool
+     groups: Optional[Dict[str, Any]]
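
A minimal sketch of how the new TypedDicts compose, using only names defined in posthog/ai/types.py above. TypedDicts are plain dicts at runtime, so the annotations matter only to a static type checker such as mypy; the field values here are made-up illustration data.

from posthog.ai.types import (
    FormattedMessage,
    FormattedTextContent,
    ProviderResponse,
    TokenUsage,
)

text_item: FormattedTextContent = {"type": "text", "text": "Hello!"}

message: FormattedMessage = {
    "role": "assistant",
    "content": [text_item],
}

# TokenUsage is declared with total=False, so every field is optional.
usage: TokenUsage = {"input_tokens": 12, "output_tokens": 34}

response: ProviderResponse = {
    "messages": [message],
    "usage": usage,
}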