@flisk/analyze-tracking 0.7.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -61
- package/bin/cli.js +1 -1
- package/package.json +18 -3
- package/src/analyze/go/astTraversal.js +121 -0
- package/src/analyze/go/constants.js +20 -0
- package/src/analyze/go/eventDeduplicator.js +47 -0
- package/src/analyze/go/eventExtractor.js +156 -0
- package/src/analyze/go/goAstParser/constants.js +39 -0
- package/src/analyze/go/goAstParser/expressionParser.js +281 -0
- package/src/analyze/go/goAstParser/index.js +52 -0
- package/src/analyze/go/goAstParser/statementParser.js +387 -0
- package/src/analyze/go/goAstParser/tokenizer.js +196 -0
- package/src/analyze/go/goAstParser/typeParser.js +202 -0
- package/src/analyze/go/goAstParser/utils.js +99 -0
- package/src/analyze/go/index.js +55 -0
- package/src/analyze/go/propertyExtractor.js +670 -0
- package/src/analyze/go/trackingDetector.js +71 -0
- package/src/analyze/go/trackingExtractor.js +54 -0
- package/src/analyze/go/typeContext.js +88 -0
- package/src/analyze/go/utils.js +215 -0
- package/src/analyze/index.js +11 -7
- package/src/analyze/javascript/constants.js +115 -0
- package/src/analyze/javascript/detectors/analytics-source.js +119 -0
- package/src/analyze/javascript/detectors/index.js +10 -0
- package/src/analyze/javascript/extractors/event-extractor.js +179 -0
- package/src/analyze/javascript/extractors/index.js +13 -0
- package/src/analyze/javascript/extractors/property-extractor.js +172 -0
- package/src/analyze/javascript/index.js +38 -0
- package/src/analyze/javascript/parser.js +126 -0
- package/src/analyze/javascript/utils/function-finder.js +123 -0
- package/src/analyze/python/index.js +111 -0
- package/src/analyze/python/pythonTrackingAnalyzer.py +814 -0
- package/src/analyze/ruby/detectors.js +46 -0
- package/src/analyze/ruby/extractors.js +258 -0
- package/src/analyze/ruby/index.js +51 -0
- package/src/analyze/ruby/traversal.js +123 -0
- package/src/analyze/ruby/types.js +30 -0
- package/src/analyze/ruby/visitor.js +66 -0
- package/src/analyze/typescript/constants.js +109 -0
- package/src/analyze/typescript/detectors/analytics-source.js +120 -0
- package/src/analyze/typescript/detectors/index.js +10 -0
- package/src/analyze/typescript/extractors/event-extractor.js +269 -0
- package/src/analyze/typescript/extractors/index.js +14 -0
- package/src/analyze/typescript/extractors/property-extractor.js +395 -0
- package/src/analyze/typescript/index.js +48 -0
- package/src/analyze/typescript/parser.js +131 -0
- package/src/analyze/typescript/utils/function-finder.js +114 -0
- package/src/analyze/typescript/utils/type-resolver.js +193 -0
- package/src/generateDescriptions/index.js +81 -0
- package/src/generateDescriptions/llmUtils.js +33 -0
- package/src/generateDescriptions/promptUtils.js +62 -0
- package/src/generateDescriptions/schemaUtils.js +61 -0
- package/src/index.js +7 -2
- package/src/{fileProcessor.js → utils/fileProcessor.js} +5 -0
- package/src/{repoDetails.js → utils/repoDetails.js} +5 -0
- package/src/{yamlGenerator.js → utils/yamlGenerator.js} +5 -0
- package/.github/workflows/npm-publish.yml +0 -33
- package/.github/workflows/pr-check.yml +0 -17
- package/jest.config.js +0 -7
- package/src/analyze/analyzeGoFile.js +0 -1164
- package/src/analyze/analyzeJsFile.js +0 -72
- package/src/analyze/analyzePythonFile.js +0 -41
- package/src/analyze/analyzeRubyFile.js +0 -409
- package/src/analyze/analyzeTsFile.js +0 -69
- package/src/analyze/go2json.js +0 -1069
- package/src/analyze/helpers.js +0 -217
- package/src/analyze/pythonTrackingAnalyzer.py +0 -439
- package/src/generateDescriptions.js +0 -196
- package/tests/detectSource.test.js +0 -20
- package/tests/extractProperties.test.js +0 -109
- package/tests/findWrappingFunction.test.js +0 -30
|
@@ -0,0 +1,814 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Python Analytics Tracking Analyzer
|
|
4
|
+
|
|
5
|
+
This module analyzes Python source code to identify analytics tracking calls from various
|
|
6
|
+
libraries and extracts event information including event names, properties, and metadata.
|
|
7
|
+
|
|
8
|
+
Supported analytics libraries:
|
|
9
|
+
- Segment Analytics
|
|
10
|
+
- Mixpanel
|
|
11
|
+
- Amplitude
|
|
12
|
+
- PostHog
|
|
13
|
+
- Rudderstack
|
|
14
|
+
- Snowplow
|
|
15
|
+
- Custom tracking functions
|
|
16
|
+
|
|
17
|
+
The analyzer uses Python's AST (Abstract Syntax Tree) module to parse code and identify
|
|
18
|
+
tracking patterns specific to each library.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import ast
|
|
22
|
+
import json
|
|
23
|
+
from typing import Dict, List, Optional, Any, Union
|
|
24
|
+
|
|
25
|
+
# Aliases naming the data shapes produced by the analyzer.
PropertyType = Union[str, Dict[str, Any]]
EventProperties = Dict[str, Dict[str, PropertyType]]
AnalyticsEvent = Dict[str, Any]

# Recognition rules per analytics source: either an `object.method(...)`
# pair, or the name of the event class passed to a `track(...)` call.
ANALYTICS_SOURCES = dict(
    segment=dict(object='analytics', method='track'),
    mixpanel=dict(object='mp', method='track'),
    rudderstack=dict(object='rudder_analytics', method='track'),
    posthog=dict(object='posthog', method='capture'),
    amplitude=dict(event_class='BaseEvent'),
    snowplow=dict(event_class='StructuredEvent', tracker_object='tracker'),
)

# Python scalar type names -> JSON Schema type names.
TYPE_MAPPINGS = {
    **dict.fromkeys(('int', 'float'), 'number'),
    'str': 'string',
    'bool': 'boolean',
    **dict.fromkeys(('None', 'NoneType'), 'null'),
}

# Container names (typing and builtin spellings) reported as arrays.
ARRAY_TYPES = {
    spelling
    for base in ('List', 'Tuple', 'Set')
    for spelling in (base, base.lower())
}

# Container names reported as objects.
OBJECT_TYPES = {'Dict', 'dict'}
|
|
55
|
+
|
|
56
|
+
class TrackingVisitor(ast.NodeVisitor):
|
|
57
|
+
"""
|
|
58
|
+
AST visitor that identifies and extracts analytics tracking calls from Python code.
|
|
59
|
+
|
|
60
|
+
This visitor traverses the AST and looks for function calls that match known
|
|
61
|
+
analytics library patterns. It extracts event names, properties, and metadata
|
|
62
|
+
for each tracking call found.
|
|
63
|
+
|
|
64
|
+
Attributes:
|
|
65
|
+
events: List of analytics events found in the code
|
|
66
|
+
filepath: Path to the file being analyzed
|
|
67
|
+
current_function: Name of the current function being visited
|
|
68
|
+
function_stack: Stack of function contexts for nested functions
|
|
69
|
+
var_types: Dictionary of variable types in the current scope
|
|
70
|
+
var_types_stack: Stack of variable type scopes
|
|
71
|
+
custom_function: Optional name of a custom tracking function
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
def __init__(self, filepath: str, custom_function: Optional[str] = None):
|
|
75
|
+
"""
|
|
76
|
+
Initialize the tracking visitor.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
filepath: Path to the Python file being analyzed
|
|
80
|
+
custom_function: Optional name of a custom tracking function to detect
|
|
81
|
+
"""
|
|
82
|
+
self.events: List[AnalyticsEvent] = []
|
|
83
|
+
self.filepath = filepath
|
|
84
|
+
self.current_function = 'global'
|
|
85
|
+
self.function_stack: List[str] = []
|
|
86
|
+
self.var_types: Dict[str, PropertyType] = {}
|
|
87
|
+
self.var_types_stack: List[Dict[str, PropertyType]] = []
|
|
88
|
+
self.custom_function = custom_function
|
|
89
|
+
|
|
90
|
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
|
91
|
+
"""
|
|
92
|
+
Visit a function definition node and track context and variable types.
|
|
93
|
+
|
|
94
|
+
This method maintains the function context stack and creates a new scope
|
|
95
|
+
for variable types when entering a function. It also extracts type
|
|
96
|
+
annotations from function parameters.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
node: The function definition AST node
|
|
100
|
+
"""
|
|
101
|
+
# Save previous function context and variable types
|
|
102
|
+
self.function_stack.append(self.current_function)
|
|
103
|
+
self.var_types_stack.append(self.var_types)
|
|
104
|
+
|
|
105
|
+
# Create new scope for variable types
|
|
106
|
+
self.var_types = {}
|
|
107
|
+
self.current_function = node.name
|
|
108
|
+
|
|
109
|
+
# Extract parameter type annotations
|
|
110
|
+
for arg in node.args.args:
|
|
111
|
+
if arg.annotation:
|
|
112
|
+
# Store the type annotation for this parameter
|
|
113
|
+
self.var_types[arg.arg] = self.extract_type_annotation(arg.annotation)
|
|
114
|
+
|
|
115
|
+
# Visit children
|
|
116
|
+
self.generic_visit(node)
|
|
117
|
+
|
|
118
|
+
# Restore function context and variable types
|
|
119
|
+
self.current_function = self.function_stack.pop()
|
|
120
|
+
self.var_types = self.var_types_stack.pop()
|
|
121
|
+
|
|
122
|
+
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
|
123
|
+
"""
|
|
124
|
+
Visit a class definition node and track context.
|
|
125
|
+
|
|
126
|
+
Similar to function definitions, this maintains proper context
|
|
127
|
+
for methods within classes.
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
node: The class definition AST node
|
|
131
|
+
"""
|
|
132
|
+
# Track class context for methods
|
|
133
|
+
class_name = node.name
|
|
134
|
+
self.function_stack.append(self.current_function)
|
|
135
|
+
self.var_types_stack.append(self.var_types)
|
|
136
|
+
|
|
137
|
+
# Create new scope for the class
|
|
138
|
+
self.var_types = {}
|
|
139
|
+
self.current_function = class_name
|
|
140
|
+
|
|
141
|
+
self.generic_visit(node)
|
|
142
|
+
|
|
143
|
+
# Restore context
|
|
144
|
+
self.current_function = self.function_stack.pop()
|
|
145
|
+
self.var_types = self.var_types_stack.pop()
|
|
146
|
+
|
|
147
|
+
def extract_type_annotation(self, annotation: ast.AST) -> PropertyType:
|
|
148
|
+
"""
|
|
149
|
+
Extract type information from a type annotation node.
|
|
150
|
+
|
|
151
|
+
Converts Python type annotations to JSON Schema compatible types.
|
|
152
|
+
Handles simple types (int, str, bool) and generic types (List[int], Dict[str, int]).
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
annotation: The type annotation AST node
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
A string representing the JSON Schema type or a dictionary for complex types
|
|
159
|
+
"""
|
|
160
|
+
if isinstance(annotation, ast.Name):
|
|
161
|
+
# Simple types like int, str, bool
|
|
162
|
+
type_name = annotation.id
|
|
163
|
+
return TYPE_MAPPINGS.get(type_name, 'any')
|
|
164
|
+
|
|
165
|
+
elif isinstance(annotation, ast.Subscript):
|
|
166
|
+
# Handle generic types like List[int], Dict[str, int]
|
|
167
|
+
if hasattr(annotation.value, 'id'):
|
|
168
|
+
container_type = annotation.value.id
|
|
169
|
+
|
|
170
|
+
if container_type in ARRAY_TYPES:
|
|
171
|
+
# Try to get the type parameter for arrays
|
|
172
|
+
if isinstance(annotation.slice, ast.Name):
|
|
173
|
+
element_type = self.extract_type_annotation(annotation.slice)
|
|
174
|
+
return {
|
|
175
|
+
'type': 'array',
|
|
176
|
+
'items': {'type': element_type}
|
|
177
|
+
}
|
|
178
|
+
return 'array'
|
|
179
|
+
|
|
180
|
+
elif container_type in OBJECT_TYPES:
|
|
181
|
+
return 'object'
|
|
182
|
+
|
|
183
|
+
# Default for unknown or complex types
|
|
184
|
+
return 'any'
|
|
185
|
+
|
|
186
|
+
def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
|
|
187
|
+
"""
|
|
188
|
+
Visit variable assignments with type annotations.
|
|
189
|
+
|
|
190
|
+
Tracks variable types from annotated assignments like:
|
|
191
|
+
user_id: str = "123"
|
|
192
|
+
|
|
193
|
+
Args:
|
|
194
|
+
node: The annotated assignment AST node
|
|
195
|
+
"""
|
|
196
|
+
if isinstance(node.target, ast.Name) and node.annotation:
|
|
197
|
+
# Store the type annotation for this variable
|
|
198
|
+
self.var_types[node.target.id] = self.extract_type_annotation(node.annotation)
|
|
199
|
+
self.generic_visit(node)
|
|
200
|
+
|
|
201
|
+
def visit_Assign(self, node: ast.Assign) -> None:
|
|
202
|
+
"""
|
|
203
|
+
Visit regular assignments to track simple type inferences.
|
|
204
|
+
|
|
205
|
+
Attempts to infer types from literal values in assignments like:
|
|
206
|
+
user_id = "123" # Inferred as string
|
|
207
|
+
|
|
208
|
+
Args:
|
|
209
|
+
node: The assignment AST node
|
|
210
|
+
"""
|
|
211
|
+
if len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
|
|
212
|
+
var_name = node.targets[0].id
|
|
213
|
+
# Try to infer type from literal values
|
|
214
|
+
if isinstance(node.value, ast.Constant):
|
|
215
|
+
self.var_types[var_name] = self.get_value_type(node.value.value)
|
|
216
|
+
self.generic_visit(node)
|
|
217
|
+
|
|
218
|
+
def visit_Call(self, node: ast.Call) -> None:
|
|
219
|
+
"""
|
|
220
|
+
Visit function call nodes to detect analytics tracking calls.
|
|
221
|
+
|
|
222
|
+
This is the main method that identifies tracking calls from various
|
|
223
|
+
analytics libraries and extracts relevant information.
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
node: The function call AST node
|
|
227
|
+
"""
|
|
228
|
+
# Check if this is an analytics tracking call
|
|
229
|
+
source = self.detect_source(node)
|
|
230
|
+
if source:
|
|
231
|
+
event_name = self.extract_event_name(node, source)
|
|
232
|
+
if event_name:
|
|
233
|
+
properties = self.extract_properties(node, source)
|
|
234
|
+
|
|
235
|
+
# Create the event record
|
|
236
|
+
event = {
|
|
237
|
+
"eventName": event_name,
|
|
238
|
+
"source": source,
|
|
239
|
+
"properties": properties,
|
|
240
|
+
"filePath": self.filepath,
|
|
241
|
+
"line": node.lineno,
|
|
242
|
+
"functionName": self.current_function
|
|
243
|
+
}
|
|
244
|
+
self.events.append(event)
|
|
245
|
+
|
|
246
|
+
# Continue visiting child nodes
|
|
247
|
+
self.generic_visit(node)
|
|
248
|
+
|
|
249
|
+
def detect_source(self, node: ast.Call) -> Optional[str]:
|
|
250
|
+
"""
|
|
251
|
+
Detect which analytics library is being used in a function call.
|
|
252
|
+
|
|
253
|
+
Checks the function call against known patterns for supported analytics
|
|
254
|
+
libraries including Segment, Mixpanel, Amplitude, PostHog, Rudderstack,
|
|
255
|
+
Snowplow, and custom functions.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
node: The function call AST node
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
The name of the detected analytics source, or None if not recognized
|
|
262
|
+
"""
|
|
263
|
+
# Check for method calls (e.g., analytics.track())
|
|
264
|
+
if isinstance(node.func, ast.Attribute):
|
|
265
|
+
return self._detect_method_call_source(node)
|
|
266
|
+
|
|
267
|
+
# Check for direct function calls
|
|
268
|
+
elif isinstance(node.func, ast.Name):
|
|
269
|
+
return self._detect_function_call_source(node)
|
|
270
|
+
|
|
271
|
+
return None
|
|
272
|
+
|
|
273
|
+
def _detect_method_call_source(self, node: ast.Call) -> Optional[str]:
|
|
274
|
+
"""Helper method to detect analytics source from method calls."""
|
|
275
|
+
if not hasattr(node.func.value, 'id'):
|
|
276
|
+
return None
|
|
277
|
+
|
|
278
|
+
obj_id = node.func.value.id
|
|
279
|
+
method_name = node.func.attr
|
|
280
|
+
|
|
281
|
+
# Check standard analytics libraries
|
|
282
|
+
for source, config in ANALYTICS_SOURCES.items():
|
|
283
|
+
if 'object' in config and 'method' in config:
|
|
284
|
+
if obj_id == config['object'] and method_name == config['method']:
|
|
285
|
+
return source
|
|
286
|
+
|
|
287
|
+
# Special case: Amplitude with BaseEvent
|
|
288
|
+
if method_name == 'track' and self._is_amplitude_call(node):
|
|
289
|
+
return 'amplitude'
|
|
290
|
+
|
|
291
|
+
# Special case: Snowplow with StructuredEvent
|
|
292
|
+
if method_name == 'track' and self._is_snowplow_tracker_call(node):
|
|
293
|
+
return 'snowplow'
|
|
294
|
+
|
|
295
|
+
return None
|
|
296
|
+
|
|
297
|
+
def _detect_function_call_source(self, node: ast.Call) -> Optional[str]:
|
|
298
|
+
"""Helper method to detect analytics source from direct function calls."""
|
|
299
|
+
func_name = node.func.id
|
|
300
|
+
|
|
301
|
+
# Check for Snowplow direct functions
|
|
302
|
+
if func_name in ['trackStructEvent', 'buildStructEvent']:
|
|
303
|
+
return 'snowplow'
|
|
304
|
+
|
|
305
|
+
# Check for Snowplow's snowplow('trackStructEvent', {...}) pattern
|
|
306
|
+
if func_name == 'snowplow' and self._is_snowplow_function_call(node):
|
|
307
|
+
return 'snowplow'
|
|
308
|
+
|
|
309
|
+
# Check for custom tracking function
|
|
310
|
+
if self.custom_function and func_name == self.custom_function:
|
|
311
|
+
return 'custom'
|
|
312
|
+
|
|
313
|
+
return None
|
|
314
|
+
|
|
315
|
+
def _is_amplitude_call(self, node: ast.Call) -> bool:
|
|
316
|
+
"""Check if the call matches Amplitude's BaseEvent pattern."""
|
|
317
|
+
if len(node.args) < 1:
|
|
318
|
+
return False
|
|
319
|
+
|
|
320
|
+
first_arg = node.args[0]
|
|
321
|
+
if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
|
|
322
|
+
return first_arg.func.id == 'BaseEvent'
|
|
323
|
+
return False
|
|
324
|
+
|
|
325
|
+
def _is_snowplow_tracker_call(self, node: ast.Call) -> bool:
|
|
326
|
+
"""Check if the call matches Snowplow's tracker.track() pattern."""
|
|
327
|
+
if len(node.args) < 1:
|
|
328
|
+
return False
|
|
329
|
+
|
|
330
|
+
first_arg = node.args[0]
|
|
331
|
+
# Check if first argument is StructuredEvent
|
|
332
|
+
if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
|
|
333
|
+
return first_arg.func.id == 'StructuredEvent'
|
|
334
|
+
|
|
335
|
+
# Also check if it might be a variable (simple heuristic)
|
|
336
|
+
if isinstance(first_arg, ast.Name) and hasattr(node.func, 'value'):
|
|
337
|
+
return node.func.value.id == 'tracker'
|
|
338
|
+
|
|
339
|
+
return False
|
|
340
|
+
|
|
341
|
+
def _is_snowplow_function_call(self, node: ast.Call) -> bool:
|
|
342
|
+
"""Check if this is a snowplow('trackStructEvent', {...}) call."""
|
|
343
|
+
if len(node.args) >= 1 and isinstance(node.args[0], ast.Constant):
|
|
344
|
+
return node.args[0].value == 'trackStructEvent'
|
|
345
|
+
return False
|
|
346
|
+
|
|
347
|
+
def extract_event_name(self, node: ast.Call, source: str) -> Optional[str]:
|
|
348
|
+
"""
|
|
349
|
+
Extract the event name from an analytics tracking call.
|
|
350
|
+
|
|
351
|
+
Different analytics libraries have different patterns for specifying
|
|
352
|
+
event names. This method handles the extraction for each supported source.
|
|
353
|
+
|
|
354
|
+
Args:
|
|
355
|
+
node: The function call AST node
|
|
356
|
+
source: The detected analytics source
|
|
357
|
+
|
|
358
|
+
Returns:
|
|
359
|
+
The extracted event name, or None if not found
|
|
360
|
+
"""
|
|
361
|
+
try:
|
|
362
|
+
if source in ['segment', 'rudderstack', 'mixpanel']:
|
|
363
|
+
return self._extract_standard_event_name(node)
|
|
364
|
+
elif source == 'amplitude':
|
|
365
|
+
return self._extract_amplitude_event_name(node)
|
|
366
|
+
elif source == 'posthog':
|
|
367
|
+
return self._extract_posthog_event_name(node)
|
|
368
|
+
elif source == 'snowplow':
|
|
369
|
+
return self._extract_snowplow_event_name(node)
|
|
370
|
+
elif source == 'custom':
|
|
371
|
+
return self._extract_custom_event_name(node)
|
|
372
|
+
except Exception:
|
|
373
|
+
# Silently fail and return None for any extraction errors
|
|
374
|
+
pass
|
|
375
|
+
|
|
376
|
+
return None
|
|
377
|
+
|
|
378
|
+
def _extract_standard_event_name(self, node: ast.Call) -> Optional[str]:
|
|
379
|
+
"""Extract event name for Segment/Rudderstack/Mixpanel format."""
|
|
380
|
+
# Format: library.track(user_id/distinct_id, 'event_name', {...})
|
|
381
|
+
if len(node.args) >= 2 and isinstance(node.args[1], ast.Constant):
|
|
382
|
+
return node.args[1].value
|
|
383
|
+
return None
|
|
384
|
+
|
|
385
|
+
def _extract_amplitude_event_name(self, node: ast.Call) -> Optional[str]:
|
|
386
|
+
"""Extract event name for Amplitude format."""
|
|
387
|
+
# Format: client.track(BaseEvent(event_type='event_name', ...))
|
|
388
|
+
if len(node.args) < 1 or not isinstance(node.args[0], ast.Call):
|
|
389
|
+
return None
|
|
390
|
+
|
|
391
|
+
base_event_call = node.args[0]
|
|
392
|
+
# Look for event_type in keyword arguments
|
|
393
|
+
for keyword in base_event_call.keywords:
|
|
394
|
+
if keyword.arg == 'event_type' and isinstance(keyword.value, ast.Constant):
|
|
395
|
+
return keyword.value.value
|
|
396
|
+
return None
|
|
397
|
+
|
|
398
|
+
def _extract_posthog_event_name(self, node: ast.Call) -> Optional[str]:
|
|
399
|
+
"""Extract event name for PostHog format."""
|
|
400
|
+
# PostHog has multiple formats:
|
|
401
|
+
# 1. posthog.capture('distinct_id', 'event_name', {...})
|
|
402
|
+
# 2. posthog.capture('distinct_id', event='event_name', properties={...})
|
|
403
|
+
|
|
404
|
+
# Check for named parameters first (event='event_name')
|
|
405
|
+
for keyword in node.keywords:
|
|
406
|
+
if keyword.arg == 'event' and isinstance(keyword.value, ast.Constant):
|
|
407
|
+
return keyword.value.value
|
|
408
|
+
|
|
409
|
+
# If no named event parameter, check positional args (second arg is event name)
|
|
410
|
+
if len(node.args) >= 2 and isinstance(node.args[1], ast.Constant):
|
|
411
|
+
return node.args[1].value
|
|
412
|
+
|
|
413
|
+
return None
|
|
414
|
+
|
|
415
|
+
def _extract_snowplow_event_name(self, node: ast.Call) -> Optional[str]:
|
|
416
|
+
"""Extract event name for Snowplow format."""
|
|
417
|
+
# Pattern 1: tracker.track(StructuredEvent(action='event_name', ...))
|
|
418
|
+
if len(node.args) >= 1:
|
|
419
|
+
first_arg = node.args[0]
|
|
420
|
+
if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
|
|
421
|
+
if first_arg.func.id == 'StructuredEvent':
|
|
422
|
+
# Look for action in keyword arguments
|
|
423
|
+
for keyword in first_arg.keywords:
|
|
424
|
+
if keyword.arg == 'action' and isinstance(keyword.value, ast.Constant):
|
|
425
|
+
return keyword.value.value
|
|
426
|
+
|
|
427
|
+
# Pattern 2 & 3: Other Snowplow patterns would need additional handling
|
|
428
|
+
# For now, return None for these cases
|
|
429
|
+
return None
|
|
430
|
+
|
|
431
|
+
def _extract_custom_event_name(self, node: ast.Call) -> Optional[str]:
|
|
432
|
+
"""Extract event name for custom tracking function."""
|
|
433
|
+
# Standard format: customFunction('event_name', {...})
|
|
434
|
+
if len(node.args) >= 1 and isinstance(node.args[0], ast.Constant):
|
|
435
|
+
return node.args[0].value
|
|
436
|
+
return None
|
|
437
|
+
|
|
438
|
+
def extract_properties(self, node: ast.Call, source: str) -> EventProperties:
|
|
439
|
+
"""
|
|
440
|
+
Extract properties from an analytics tracking call.
|
|
441
|
+
|
|
442
|
+
This method extracts the properties/attributes passed to the tracking call,
|
|
443
|
+
handling the different formats used by various analytics libraries.
|
|
444
|
+
|
|
445
|
+
Args:
|
|
446
|
+
node: The function call AST node
|
|
447
|
+
source: The detected analytics source
|
|
448
|
+
|
|
449
|
+
Returns:
|
|
450
|
+
Dictionary of properties with their types
|
|
451
|
+
"""
|
|
452
|
+
properties = {}
|
|
453
|
+
|
|
454
|
+
try:
|
|
455
|
+
# Extract user/distinct ID if present
|
|
456
|
+
user_id_prop = self._extract_user_id(node, source)
|
|
457
|
+
if user_id_prop:
|
|
458
|
+
properties.update(user_id_prop)
|
|
459
|
+
|
|
460
|
+
# Special handling for Snowplow StructuredEvent
|
|
461
|
+
if source == 'snowplow':
|
|
462
|
+
snowplow_props = self._extract_snowplow_properties(node)
|
|
463
|
+
properties.update(snowplow_props)
|
|
464
|
+
else:
|
|
465
|
+
# Get the properties dictionary node for other sources
|
|
466
|
+
props_node = self._get_properties_node(node, source)
|
|
467
|
+
|
|
468
|
+
# Extract properties from the dictionary
|
|
469
|
+
if props_node and isinstance(props_node, ast.Dict):
|
|
470
|
+
extracted_props = self._extract_dict_properties(props_node, source)
|
|
471
|
+
properties.update(extracted_props)
|
|
472
|
+
|
|
473
|
+
except Exception:
|
|
474
|
+
# Silently fail and return what we have so far
|
|
475
|
+
pass
|
|
476
|
+
|
|
477
|
+
return properties
|
|
478
|
+
|
|
479
|
+
def _extract_user_id(self, node: ast.Call, source: str) -> EventProperties:
|
|
480
|
+
"""Extract user/distinct ID from tracking call if present."""
|
|
481
|
+
user_id_props = {}
|
|
482
|
+
|
|
483
|
+
if source in ['segment', 'rudderstack']:
|
|
484
|
+
# Format: analytics.track(user_id, ...)
|
|
485
|
+
if len(node.args) > 0:
|
|
486
|
+
user_id_node = node.args[0]
|
|
487
|
+
if self._is_non_null_value(user_id_node):
|
|
488
|
+
user_id_props["user_id"] = {"type": "string"}
|
|
489
|
+
|
|
490
|
+
elif source == 'mixpanel':
|
|
491
|
+
# Format: mp.track(distinct_id, ...)
|
|
492
|
+
if len(node.args) > 0:
|
|
493
|
+
distinct_id_node = node.args[0]
|
|
494
|
+
if self._is_non_null_value(distinct_id_node):
|
|
495
|
+
user_id_props["distinct_id"] = {"type": "string"}
|
|
496
|
+
|
|
497
|
+
elif source == 'amplitude':
|
|
498
|
+
# Check BaseEvent for user_id parameter
|
|
499
|
+
user_id_props.update(self._extract_amplitude_user_id(node))
|
|
500
|
+
|
|
501
|
+
elif source == 'posthog':
|
|
502
|
+
# Check if event is not anonymous and extract distinct_id
|
|
503
|
+
user_id_props.update(self._extract_posthog_user_id(node))
|
|
504
|
+
|
|
505
|
+
return user_id_props
|
|
506
|
+
|
|
507
|
+
def _is_non_null_value(self, node: ast.AST) -> bool:
|
|
508
|
+
"""Check if a node represents a non-null value."""
|
|
509
|
+
if isinstance(node, ast.Constant):
|
|
510
|
+
return node.value is not None
|
|
511
|
+
elif isinstance(node, ast.Name):
|
|
512
|
+
return True # Variable reference, assume non-null
|
|
513
|
+
return False
|
|
514
|
+
|
|
515
|
+
def _extract_amplitude_user_id(self, node: ast.Call) -> EventProperties:
|
|
516
|
+
"""Extract user_id from Amplitude BaseEvent call."""
|
|
517
|
+
if len(node.args) < 1 or not isinstance(node.args[0], ast.Call):
|
|
518
|
+
return {}
|
|
519
|
+
|
|
520
|
+
base_event_call = node.args[0]
|
|
521
|
+
for keyword in base_event_call.keywords:
|
|
522
|
+
if keyword.arg == 'user_id' and self._is_non_null_value(keyword.value):
|
|
523
|
+
return {"user_id": {"type": "string"}}
|
|
524
|
+
return {}
|
|
525
|
+
|
|
526
|
+
def _extract_posthog_user_id(self, node: ast.Call) -> EventProperties:
|
|
527
|
+
"""Extract distinct_id from PostHog call if not anonymous."""
|
|
528
|
+
# Check if event is anonymous by looking for $process_person_profile: False
|
|
529
|
+
props_node = self._get_properties_node(node, 'posthog')
|
|
530
|
+
|
|
531
|
+
if props_node and isinstance(props_node, ast.Dict):
|
|
532
|
+
for i, key_node in enumerate(props_node.keys):
|
|
533
|
+
if (isinstance(key_node, ast.Constant) and
|
|
534
|
+
key_node.value == '$process_person_profile'):
|
|
535
|
+
value_node = props_node.values[i]
|
|
536
|
+
if isinstance(value_node, ast.Constant) and value_node.value is False:
|
|
537
|
+
return {} # Anonymous event
|
|
538
|
+
|
|
539
|
+
# Extract distinct_id if not anonymous
|
|
540
|
+
if len(node.args) > 0 and isinstance(node.args[0], ast.Constant):
|
|
541
|
+
distinct_id = node.args[0].value
|
|
542
|
+
if distinct_id:
|
|
543
|
+
return {"distinct_id": {"type": "string"}}
|
|
544
|
+
return {}
|
|
545
|
+
|
|
546
|
+
def _get_properties_node(self, node: ast.Call, source: str) -> Optional[ast.Dict]:
|
|
547
|
+
"""Get the properties dictionary node based on the analytics source."""
|
|
548
|
+
if source in ['segment', 'rudderstack', 'mixpanel']:
|
|
549
|
+
# Properties are in the third argument
|
|
550
|
+
if len(node.args) > 2:
|
|
551
|
+
return node.args[2]
|
|
552
|
+
|
|
553
|
+
elif source == 'amplitude':
|
|
554
|
+
# Look for event_properties in BaseEvent
|
|
555
|
+
if len(node.args) >= 1 and isinstance(node.args[0], ast.Call):
|
|
556
|
+
base_event_call = node.args[0]
|
|
557
|
+
for keyword in base_event_call.keywords:
|
|
558
|
+
if keyword.arg == 'event_properties' and isinstance(keyword.value, ast.Dict):
|
|
559
|
+
return keyword.value
|
|
560
|
+
|
|
561
|
+
elif source == 'custom':
|
|
562
|
+
# Properties are in the second argument
|
|
563
|
+
if len(node.args) > 1:
|
|
564
|
+
return node.args[1]
|
|
565
|
+
|
|
566
|
+
elif source == 'posthog':
|
|
567
|
+
# Check named parameters first, then positional
|
|
568
|
+
for keyword in node.keywords:
|
|
569
|
+
if keyword.arg == 'properties' and isinstance(keyword.value, ast.Dict):
|
|
570
|
+
return keyword.value
|
|
571
|
+
if len(node.args) > 2:
|
|
572
|
+
return node.args[2]
|
|
573
|
+
|
|
574
|
+
elif source == 'snowplow':
|
|
575
|
+
# Handle StructuredEvent pattern
|
|
576
|
+
if len(node.args) >= 1:
|
|
577
|
+
first_arg = node.args[0]
|
|
578
|
+
if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
|
|
579
|
+
if first_arg.func.id == 'StructuredEvent':
|
|
580
|
+
# Return None as properties are handled differently for Snowplow
|
|
581
|
+
return None
|
|
582
|
+
|
|
583
|
+
return None
|
|
584
|
+
|
|
585
|
+
def _extract_dict_properties(self, dict_node: ast.Dict, source: str) -> EventProperties:
|
|
586
|
+
"""Extract properties from a dictionary node."""
|
|
587
|
+
properties = {}
|
|
588
|
+
|
|
589
|
+
for i, key_node in enumerate(dict_node.keys):
|
|
590
|
+
if isinstance(key_node, ast.Constant) and hasattr(key_node, 'value'):
|
|
591
|
+
key = key_node.value
|
|
592
|
+
value_node = dict_node.values[i]
|
|
593
|
+
|
|
594
|
+
# Special handling for PostHog $set and $set_once
|
|
595
|
+
if source == 'posthog' and key in ['$set', '$set_once']:
|
|
596
|
+
if isinstance(value_node, ast.Dict):
|
|
597
|
+
nested_props = self.extract_nested_dict(value_node)
|
|
598
|
+
for nested_key, nested_value in nested_props.items():
|
|
599
|
+
properties[f"{key}.{nested_key}"] = nested_value
|
|
600
|
+
continue
|
|
601
|
+
|
|
602
|
+
# Skip PostHog internal properties
|
|
603
|
+
if source == 'posthog' and key == '$process_person_profile':
|
|
604
|
+
continue
|
|
605
|
+
|
|
606
|
+
# Extract property type
|
|
607
|
+
prop_type = self._extract_property_type(value_node)
|
|
608
|
+
if prop_type:
|
|
609
|
+
properties[key] = prop_type
|
|
610
|
+
|
|
611
|
+
return properties
|
|
612
|
+
|
|
613
|
+
def _extract_snowplow_properties(self, node: ast.Call) -> EventProperties:
|
|
614
|
+
"""Extract properties from Snowplow tracking call."""
|
|
615
|
+
properties = {}
|
|
616
|
+
|
|
617
|
+
# Pattern: tracker.track(StructuredEvent(...))
|
|
618
|
+
if len(node.args) >= 1:
|
|
619
|
+
first_arg = node.args[0]
|
|
620
|
+
if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
|
|
621
|
+
if first_arg.func.id == 'StructuredEvent':
|
|
622
|
+
# Extract all keyword arguments except 'action'
|
|
623
|
+
for keyword in first_arg.keywords:
|
|
624
|
+
if keyword.arg and keyword.arg != 'action':
|
|
625
|
+
# Map property_ to property for consistency
|
|
626
|
+
prop_name = 'property' if keyword.arg == 'property_' else keyword.arg
|
|
627
|
+
prop_type = self._extract_property_type(keyword.value)
|
|
628
|
+
if prop_type:
|
|
629
|
+
properties[prop_name] = prop_type
|
|
630
|
+
|
|
631
|
+
return properties
|
|
632
|
+
|
|
633
|
+
def _extract_property_type(self, value_node: ast.AST) -> Optional[Dict[str, Any]]:
    """Extract the type information for a property value.

    Args:
        value_node: AST node of the expression used as the property value.

    Returns:
        A dict with a ``"type"`` key (plus ``"properties"`` for dict
        literals and ``"items"`` for list/tuple literals), or ``None`` when
        the expression is of a kind this method does not recognise.
    """
    if isinstance(value_node, ast.Constant):
        return {"type": self.get_value_type(value_node.value)}

    # Signed numeric literals such as ``-5`` or ``+2.5`` parse as a UnaryOp
    # wrapping a Constant rather than as a Constant, so handle them
    # explicitly instead of dropping the property.
    if (
        isinstance(value_node, ast.UnaryOp)
        and isinstance(value_node.op, (ast.USub, ast.UAdd))
        and isinstance(value_node.operand, ast.Constant)
        and isinstance(value_node.operand.value, (int, float))
    ):
        return {"type": "number"}

    # An f-string always evaluates to a str, whatever it interpolates.
    if isinstance(value_node, ast.JoinedStr):
        return {"type": "string"}

    if isinstance(value_node, ast.Name):
        # Use the recorded type of this variable when one is known.
        var_name = value_node.id
        if var_name not in self.var_types:
            return {"type": "any"}
        var_type = self.var_types[var_name]
        # A recorded dict is already a full type description; anything
        # else is wrapped as the "type" value.
        return var_type if isinstance(var_type, dict) else {"type": var_type}

    if isinstance(value_node, ast.Dict):
        # Nested dictionary literal
        return {
            "type": "object",
            "properties": self.extract_nested_dict(value_node)
        }

    if isinstance(value_node, (ast.List, ast.Tuple)):
        # Array/list/tuple literal
        return {
            "type": "array",
            "items": self.infer_sequence_item_type(value_node)
        }

    # Calls, attribute access, arithmetic, etc. are not resolved here.
    return None
|
|
668
|
+
|
|
669
|
+
def infer_sequence_item_type(self, seq_node: Union[ast.List, ast.Tuple]) -> Dict[str, str]:
    """
    Work out a single item type for a list or tuple literal.

    Args:
        seq_node: The list or tuple AST node

    Returns:
        Dictionary with a "type" key describing the items; "any" when the
        sequence is empty or its element types cannot be reconciled
    """
    members = getattr(seq_node, 'elts', None)
    if not members:
        return {"type": "any"}

    def classify(member):
        # Map one element node to a type name.
        if isinstance(member, ast.Constant):
            return self.get_value_type(member.value)
        if isinstance(member, ast.Name):
            if member.id in self.var_types:
                recorded = self.var_types[member.id]
                # Only plain string type names are usable as an item type.
                return recorded if isinstance(recorded, str) else "any"
            return "any"
        if isinstance(member, ast.Dict):
            return "object"
        if isinstance(member, (ast.List, ast.Tuple)):
            return "array"
        return "any"

    kinds = [classify(member) for member in members]
    distinct = set(kinds)

    # Homogeneous sequence: report that one type.
    if len(distinct) == 1:
        return {"type": kinds[0]}

    # Numbers mixed with strings are reported as strings.
    if distinct.issubset({"number", "string"}):
        return {"type": "string"}

    # Numbers mixed with booleans are reported as numbers.
    if distinct.issubset({"number", "boolean"}):
        return {"type": "number"}

    # Any other mixture
    return {"type": "any"}
|
|
714
|
+
|
|
715
|
+
def extract_nested_dict(self, dict_node: ast.Dict) -> EventProperties:
    """
    Build a property map from a dictionary literal.

    Args:
        dict_node: The dictionary AST node

    Returns:
        Dictionary mapping each constant key to the type dict returned by
        ``self._extract_property_type`` for its value; entries whose key is
        not a constant or whose value type is not resolved are left out
    """
    collected = {}

    for key_expr, value_expr in zip(dict_node.keys, dict_node.values):
        # Non-constant keys (variables, ``**spread`` entries whose key is
        # None, ...) are ignored.
        if not (isinstance(key_expr, ast.Constant) and hasattr(key_expr, 'value')):
            continue

        inferred = self._extract_property_type(value_expr)
        if inferred:
            collected[key_expr.value] = inferred

    return collected
|
|
737
|
+
|
|
738
|
+
def get_value_type(self, value: Any) -> str:
    """
    Map a Python literal value to a schema-style type name.

    Args:
        value: The Python value

    Returns:
        One of "boolean", "string", "number", "null", or "any" when the
        value matches none of the recognised kinds
    """
    if value is None:
        return "null"

    # bool must be tested before int: bool is a subclass of int.
    type_table = (
        (bool, "boolean"),
        (str, "string"),
        ((int, float), "number"),
    )
    for python_types, type_name in type_table:
        if isinstance(value, python_types):
            return type_name

    return "any"
|
|
757
|
+
|
|
758
|
+
def analyze_python_code(code: str, filepath: str, custom_function: Optional[str] = None) -> str:
    """
    Find analytics tracking calls in Python source and report them as JSON.

    The source is parsed into an AST and walked by a ``TrackingVisitor``;
    whatever the visitor gathers in its ``events`` attribute is serialised.

    Args:
        code: The Python source code to analyze
        filepath: Path to the file being analyzed (passed to the visitor)
        custom_function: Optional name of a custom tracking function

    Returns:
        JSON string containing array of tracking events. Any exception
        raised while parsing, visiting or serialising yields "[]" instead
        of propagating.
    """
    try:
        # Parse first, then build the visitor and walk the tree.
        syntax_tree = ast.parse(code)
        tracker = TrackingVisitor(filepath, custom_function)
        tracker.visit(syntax_tree)
        serialized = json.dumps(tracker.events)
    except Exception:
        # Best-effort: failures are reported as an empty event list.
        serialized = json.dumps([])

    return serialized
|
|
786
|
+
|
|
787
|
+
# Command-line interface
# Usage: <script> FILE [-c CUSTOM_FUNCTION]
# Prints the JSON result of analyze_python_code() to stdout; on failure
# prints a message to stderr and exits with status 1.
if __name__ == "__main__":
    import sys
    import argparse
    import tokenize

    parser = argparse.ArgumentParser(
        description='Analyze Python code for analytics tracking calls',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Example: %(prog)s myapp.py [--custom-function track_event]"
    )
    parser.add_argument('file', help='Python file to analyze')
    parser.add_argument(
        '-c', '--custom-function',
        help='Name of custom tracking function to detect'
    )
    args = parser.parse_args()

    try:
        # tokenize.open decodes the file the way Python itself would (UTF-8
        # BOM or PEP 263 coding cookie, UTF-8 by default) instead of relying
        # on the platform's locale encoding.
        with tokenize.open(args.file) as f:
            code = f.read()
        result = analyze_python_code(code, args.file, args.custom_function)
        print(result)
    except FileNotFoundError:
        print(f"Error: File '{args.file}' not found", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Covers unreadable files, undecodable content, and anything else.
        print(f"Error analyzing file: {str(e)}", file=sys.stderr)
        sys.exit(1)
|