@flisk/analyze-tracking 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/README.md +35 -61
  2. package/bin/cli.js +1 -1
  3. package/package.json +18 -3
  4. package/src/analyze/go/astTraversal.js +121 -0
  5. package/src/analyze/go/constants.js +20 -0
  6. package/src/analyze/go/eventDeduplicator.js +47 -0
  7. package/src/analyze/go/eventExtractor.js +156 -0
  8. package/src/analyze/go/goAstParser/constants.js +39 -0
  9. package/src/analyze/go/goAstParser/expressionParser.js +281 -0
  10. package/src/analyze/go/goAstParser/index.js +52 -0
  11. package/src/analyze/go/goAstParser/statementParser.js +387 -0
  12. package/src/analyze/go/goAstParser/tokenizer.js +196 -0
  13. package/src/analyze/go/goAstParser/typeParser.js +202 -0
  14. package/src/analyze/go/goAstParser/utils.js +99 -0
  15. package/src/analyze/go/index.js +55 -0
  16. package/src/analyze/go/propertyExtractor.js +670 -0
  17. package/src/analyze/go/trackingDetector.js +71 -0
  18. package/src/analyze/go/trackingExtractor.js +54 -0
  19. package/src/analyze/go/typeContext.js +88 -0
  20. package/src/analyze/go/utils.js +215 -0
  21. package/src/analyze/index.js +11 -7
  22. package/src/analyze/javascript/constants.js +115 -0
  23. package/src/analyze/javascript/detectors/analytics-source.js +119 -0
  24. package/src/analyze/javascript/detectors/index.js +10 -0
  25. package/src/analyze/javascript/extractors/event-extractor.js +179 -0
  26. package/src/analyze/javascript/extractors/index.js +13 -0
  27. package/src/analyze/javascript/extractors/property-extractor.js +172 -0
  28. package/src/analyze/javascript/index.js +38 -0
  29. package/src/analyze/javascript/parser.js +126 -0
  30. package/src/analyze/javascript/utils/function-finder.js +123 -0
  31. package/src/analyze/python/index.js +111 -0
  32. package/src/analyze/python/pythonTrackingAnalyzer.py +814 -0
  33. package/src/analyze/ruby/detectors.js +46 -0
  34. package/src/analyze/ruby/extractors.js +258 -0
  35. package/src/analyze/ruby/index.js +51 -0
  36. package/src/analyze/ruby/traversal.js +123 -0
  37. package/src/analyze/ruby/types.js +30 -0
  38. package/src/analyze/ruby/visitor.js +66 -0
  39. package/src/analyze/typescript/constants.js +109 -0
  40. package/src/analyze/typescript/detectors/analytics-source.js +120 -0
  41. package/src/analyze/typescript/detectors/index.js +10 -0
  42. package/src/analyze/typescript/extractors/event-extractor.js +269 -0
  43. package/src/analyze/typescript/extractors/index.js +14 -0
  44. package/src/analyze/typescript/extractors/property-extractor.js +395 -0
  45. package/src/analyze/typescript/index.js +48 -0
  46. package/src/analyze/typescript/parser.js +131 -0
  47. package/src/analyze/typescript/utils/function-finder.js +114 -0
  48. package/src/analyze/typescript/utils/type-resolver.js +193 -0
  49. package/src/generateDescriptions/index.js +81 -0
  50. package/src/generateDescriptions/llmUtils.js +33 -0
  51. package/src/generateDescriptions/promptUtils.js +62 -0
  52. package/src/generateDescriptions/schemaUtils.js +61 -0
  53. package/src/index.js +7 -2
  54. package/src/{fileProcessor.js → utils/fileProcessor.js} +5 -0
  55. package/src/{repoDetails.js → utils/repoDetails.js} +5 -0
  56. package/src/{yamlGenerator.js → utils/yamlGenerator.js} +5 -0
  57. package/.github/workflows/npm-publish.yml +0 -33
  58. package/.github/workflows/pr-check.yml +0 -17
  59. package/jest.config.js +0 -7
  60. package/src/analyze/analyzeGoFile.js +0 -1164
  61. package/src/analyze/analyzeJsFile.js +0 -72
  62. package/src/analyze/analyzePythonFile.js +0 -41
  63. package/src/analyze/analyzeRubyFile.js +0 -409
  64. package/src/analyze/analyzeTsFile.js +0 -69
  65. package/src/analyze/go2json.js +0 -1069
  66. package/src/analyze/helpers.js +0 -217
  67. package/src/analyze/pythonTrackingAnalyzer.py +0 -439
  68. package/src/generateDescriptions.js +0 -196
  69. package/tests/detectSource.test.js +0 -20
  70. package/tests/extractProperties.test.js +0 -109
  71. package/tests/findWrappingFunction.test.js +0 -30
@@ -0,0 +1,814 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Python Analytics Tracking Analyzer
4
+
5
+ This module analyzes Python source code to identify analytics tracking calls from various
6
+ libraries and extracts event information including event names, properties, and metadata.
7
+
8
+ Supported analytics libraries:
9
+ - Segment Analytics
10
+ - Mixpanel
11
+ - Amplitude
12
+ - PostHog
13
+ - Rudderstack
14
+ - Snowplow
15
+ - Custom tracking functions
16
+
17
+ The analyzer uses Python's AST (Abstract Syntax Tree) module to parse code and identify
18
+ tracking patterns specific to each library.
19
+ """
20
+
21
+ import ast
22
+ import json
23
+ from typing import Dict, List, Optional, Any, Union
24
+
25
# Shared type aliases used in the analyzer's signatures.
# A property type is either a JSON Schema type name or a nested schema dict.
PropertyType = Union[str, Dict[str, Any]]
# Maps a property name to its schema description.
EventProperties = Dict[str, Dict[str, PropertyType]]
# One detected tracking call, as emitted in the JSON output.
AnalyticsEvent = Dict[str, Any]

# Detection patterns per analytics library. Entries with 'object'/'method'
# describe an `object.method(...)` call; the remaining entries name the event
# class (and tracker object) associated with the library.
ANALYTICS_SOURCES = {
    'segment': dict(object='analytics', method='track'),
    'mixpanel': dict(object='mp', method='track'),
    'rudderstack': dict(object='rudder_analytics', method='track'),
    'posthog': dict(object='posthog', method='capture'),
    'amplitude': dict(event_class='BaseEvent'),
    'snowplow': dict(event_class='StructuredEvent', tracker_object='tracker'),
}

# Python scalar type names -> JSON Schema type names.
TYPE_MAPPINGS = dict(
    [
        ('int', 'number'),
        ('float', 'number'),
        ('str', 'string'),
        ('bool', 'boolean'),
        ('None', 'null'),
        ('NoneType', 'null'),
    ]
)

# Annotation names that are reported as JSON arrays.
ARRAY_TYPES = {
    'List', 'Tuple', 'Set',
    'list', 'tuple', 'set',
}

# Annotation names that are reported as JSON objects.
OBJECT_TYPES = {'Dict', 'dict'}
55
+
56
class TrackingVisitor(ast.NodeVisitor):
    """
    AST visitor that identifies and extracts analytics tracking calls from Python code.

    The visitor walks the tree, matches every call against the known analytics
    library patterns (and an optional custom function name) and records one
    event dictionary per recognised call.

    Attributes:
        events: List of analytics events found in the code
        filepath: Path to the file being analyzed
        current_function: Name of the function/class currently being visited
            ('global' at module level)
        function_stack: Stack of enclosing function/class names
        var_types: Known variable types in the current scope
        var_types_stack: Stack of variable type scopes of the enclosing contexts
        custom_function: Optional name of a custom tracking function
    """

    # Constant types that json.dumps accepts as object keys. Dictionary keys of
    # any other constant type (bytes, complex, Ellipsis) are skipped because
    # json.dumps raises TypeError for them.
    _JSON_KEY_TYPES = (str, int, float, bool, type(None))

    def __init__(self, filepath: str, custom_function: Optional[str] = None):
        """
        Initialize the tracking visitor.

        Args:
            filepath: Path to the Python file being analyzed
            custom_function: Optional name of a custom tracking function to detect
        """
        self.events: List["AnalyticsEvent"] = []
        self.filepath = filepath
        self.current_function = 'global'
        self.function_stack: List[str] = []
        self.var_types: Dict[str, "PropertyType"] = {}
        self.var_types_stack: List[Dict[str, "PropertyType"]] = []
        self.custom_function = custom_function

    # ------------------------------------------------------------------
    # Scope tracking
    # ------------------------------------------------------------------

    def visit_FunctionDef(self, node: Union[ast.FunctionDef, ast.AsyncFunctionDef]) -> None:
        """
        Visit a (sync or async) function definition and track its context.

        Pushes the current function name and variable scope, opens a fresh
        scope seeded with the annotated parameters, visits the children and
        then restores the previous context.

        Args:
            node: The function definition AST node
        """
        # Save previous function context and variable types
        self.function_stack.append(self.current_function)
        self.var_types_stack.append(self.var_types)

        # Create new scope for variable types
        self.var_types = {}
        self.current_function = node.name

        # Collect annotations from every named parameter kind, not just the
        # regular positional-or-keyword ones.
        arguments = node.args
        parameters = [
            *getattr(arguments, 'posonlyargs', []),
            *arguments.args,
            *arguments.kwonlyargs,
        ]
        for parameter in parameters:
            if parameter.annotation:
                self.var_types[parameter.arg] = self.extract_type_annotation(
                    parameter.annotation
                )

        # Visit children
        self.generic_visit(node)

        # Restore function context and variable types
        self.current_function = self.function_stack.pop()
        self.var_types = self.var_types_stack.pop()

    # `async def` bodies need exactly the same context handling; without this
    # alias NodeVisitor falls back to generic_visit and calls inside coroutines
    # are attributed to the enclosing scope.
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """
        Visit a class definition node and track context.

        The class name becomes the current context and a new variable scope is
        opened while the class body is visited.

        Args:
            node: The class definition AST node
        """
        self.function_stack.append(self.current_function)
        self.var_types_stack.append(self.var_types)

        # Create new scope for the class
        self.var_types = {}
        self.current_function = node.name

        self.generic_visit(node)

        # Restore context
        self.current_function = self.function_stack.pop()
        self.var_types = self.var_types_stack.pop()

    # ------------------------------------------------------------------
    # Type tracking
    # ------------------------------------------------------------------

    def extract_type_annotation(self, annotation: ast.AST) -> "PropertyType":
        """
        Convert a type annotation node to a JSON Schema compatible type.

        Handles simple names (int, str, bool, ...) and subscripted containers
        whose container is a bare name (List[int], Dict[str, int]).

        Args:
            annotation: The type annotation AST node

        Returns:
            A JSON Schema type name, or a dictionary for arrays with a known
            element type; 'any' when the annotation is not recognised
        """
        if isinstance(annotation, ast.Name):
            # Simple types like int, str, bool
            return TYPE_MAPPINGS.get(annotation.id, 'any')

        if isinstance(annotation, ast.Subscript) and isinstance(annotation.value, ast.Name):
            container_type = annotation.value.id

            if container_type in ARRAY_TYPES:
                # Only a single bare-name type parameter is resolved
                if isinstance(annotation.slice, ast.Name):
                    element_type = self.extract_type_annotation(annotation.slice)
                    return {
                        'type': 'array',
                        'items': {'type': element_type}
                    }
                return 'array'

            if container_type in OBJECT_TYPES:
                return 'object'

        # Default for unknown or complex types
        return 'any'

    def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
        """
        Record the declared type of an annotated assignment such as
        ``user_id: str = "123"``.

        Args:
            node: The annotated assignment AST node
        """
        if isinstance(node.target, ast.Name) and node.annotation:
            self.var_types[node.target.id] = self.extract_type_annotation(node.annotation)
        self.generic_visit(node)

    def visit_Assign(self, node: ast.Assign) -> None:
        """
        Infer a variable's type from a literal assignment such as
        ``user_id = "123"``.

        Only single-target assignments of a constant to a plain name are used.

        Args:
            node: The assignment AST node
        """
        if len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
            if isinstance(node.value, ast.Constant):
                self.var_types[node.targets[0].id] = self.get_value_type(node.value.value)
        self.generic_visit(node)

    # ------------------------------------------------------------------
    # Call detection
    # ------------------------------------------------------------------

    def visit_Call(self, node: ast.Call) -> None:
        """
        Visit a call node and record it when it is an analytics tracking call.

        Args:
            node: The function call AST node
        """
        source = self.detect_source(node)
        if source:
            event_name = self.extract_event_name(node, source)
            if event_name:
                self.events.append({
                    "eventName": event_name,
                    "source": source,
                    "properties": self.extract_properties(node, source),
                    "filePath": self.filepath,
                    "line": node.lineno,
                    "functionName": self.current_function
                })

        # Continue visiting child nodes (arguments may contain further calls)
        self.generic_visit(node)

    def detect_source(self, node: ast.Call) -> Optional[str]:
        """
        Detect which analytics library a call belongs to.

        Args:
            node: The function call AST node

        Returns:
            The name of the detected analytics source, or None if not recognized
        """
        if isinstance(node.func, ast.Attribute):
            # Method calls, e.g. analytics.track()
            return self._detect_method_call_source(node)
        if isinstance(node.func, ast.Name):
            # Direct function calls
            return self._detect_function_call_source(node)
        return None

    def _detect_method_call_source(self, node: ast.Call) -> Optional[str]:
        """Helper method to detect analytics source from method calls."""
        # Only `name.method(...)` is supported; chained receivers are ignored
        if not isinstance(node.func.value, ast.Name):
            return None

        obj_id = node.func.value.id
        method_name = node.func.attr

        # Check standard analytics libraries
        for source, config in ANALYTICS_SOURCES.items():
            if 'object' in config and 'method' in config:
                if obj_id == config['object'] and method_name == config['method']:
                    return source

        if method_name == 'track':
            # Special case: Amplitude with BaseEvent
            if self._is_amplitude_call(node):
                return 'amplitude'
            # Special case: Snowplow with StructuredEvent
            if self._is_snowplow_tracker_call(node):
                return 'snowplow'

        return None

    def _detect_function_call_source(self, node: ast.Call) -> Optional[str]:
        """Helper method to detect analytics source from direct function calls."""
        func_name = node.func.id

        # Snowplow direct functions
        if func_name in ('trackStructEvent', 'buildStructEvent'):
            return 'snowplow'

        # Snowplow's snowplow('trackStructEvent', {...}) pattern
        if func_name == 'snowplow' and self._is_snowplow_function_call(node):
            return 'snowplow'

        # Custom tracking function
        if self.custom_function and func_name == self.custom_function:
            return 'custom'

        return None

    def _is_amplitude_call(self, node: ast.Call) -> bool:
        """Check if the call matches Amplitude's track(BaseEvent(...)) pattern."""
        if not node.args:
            return False

        first_arg = node.args[0]
        if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
            return first_arg.func.id == 'BaseEvent'
        return False

    def _is_snowplow_tracker_call(self, node: ast.Call) -> bool:
        """Check if the call matches Snowplow's tracker.track() pattern."""
        if not node.args:
            return False

        first_arg = node.args[0]
        # First argument built inline: tracker.track(StructuredEvent(...))
        if isinstance(first_arg, ast.Call) and isinstance(first_arg.func, ast.Name):
            return first_arg.func.id == 'StructuredEvent'

        # First argument is a variable: accept it only on an object literally
        # named `tracker` (simple heuristic). The receiver is type-checked so
        # this helper cannot raise on non-name receivers.
        if isinstance(first_arg, ast.Name):
            func = node.func
            return (
                isinstance(func, ast.Attribute)
                and isinstance(func.value, ast.Name)
                and func.value.id == 'tracker'
            )

        return False

    def _is_snowplow_function_call(self, node: ast.Call) -> bool:
        """Check if this is a snowplow('trackStructEvent', {...}) call."""
        if node.args and isinstance(node.args[0], ast.Constant):
            return node.args[0].value == 'trackStructEvent'
        return False

    # ------------------------------------------------------------------
    # Event name extraction
    # ------------------------------------------------------------------

    @staticmethod
    def _string_constant(node: Optional[ast.AST]) -> Optional[str]:
        """Return the value of a string literal node, or None for anything else."""
        if isinstance(node, ast.Constant) and isinstance(node.value, str):
            return node.value
        return None

    def extract_event_name(self, node: ast.Call, source: str) -> Optional[str]:
        """
        Extract the event name from an analytics tracking call.

        Only string literals are accepted as event names; other constants
        (numbers, bytes, ...) yield None.

        Args:
            node: The function call AST node
            source: The detected analytics source

        Returns:
            The extracted event name, or None if not found
        """
        try:
            if source in ('segment', 'rudderstack', 'mixpanel'):
                return self._extract_standard_event_name(node)
            if source == 'amplitude':
                return self._extract_amplitude_event_name(node)
            if source == 'posthog':
                return self._extract_posthog_event_name(node)
            if source == 'snowplow':
                return self._extract_snowplow_event_name(node)
            if source == 'custom':
                return self._extract_custom_event_name(node)
        except Exception:
            # Best effort: an unexpected node shape means "no event name"
            pass

        return None

    def _extract_standard_event_name(self, node: ast.Call) -> Optional[str]:
        """Extract event name for Segment/Rudderstack/Mixpanel format."""
        # Format: library.track(user_id/distinct_id, 'event_name', {...})
        if len(node.args) >= 2:
            return self._string_constant(node.args[1])
        return None

    def _extract_amplitude_event_name(self, node: ast.Call) -> Optional[str]:
        """Extract event name for Amplitude format."""
        # Format: client.track(BaseEvent(event_type='event_name', ...))
        if not node.args or not isinstance(node.args[0], ast.Call):
            return None

        for keyword in node.args[0].keywords:
            if keyword.arg == 'event_type' and isinstance(keyword.value, ast.Constant):
                return self._string_constant(keyword.value)
        return None

    def _extract_posthog_event_name(self, node: ast.Call) -> Optional[str]:
        """Extract event name for PostHog format."""
        # PostHog has multiple formats:
        # 1. posthog.capture('distinct_id', 'event_name', {...})
        # 2. posthog.capture('distinct_id', event='event_name', properties={...})

        # Named parameter takes precedence (event='event_name')
        for keyword in node.keywords:
            if keyword.arg == 'event' and isinstance(keyword.value, ast.Constant):
                return self._string_constant(keyword.value)

        # Otherwise the second positional argument is the event name
        if len(node.args) >= 2:
            return self._string_constant(node.args[1])

        return None

    def _extract_snowplow_event_name(self, node: ast.Call) -> Optional[str]:
        """Extract event name for Snowplow format."""
        # Pattern 1: tracker.track(StructuredEvent(action='event_name', ...))
        if node.args:
            first_arg = node.args[0]
            if (isinstance(first_arg, ast.Call)
                    and isinstance(first_arg.func, ast.Name)
                    and first_arg.func.id == 'StructuredEvent'):
                for keyword in first_arg.keywords:
                    if keyword.arg == 'action' and isinstance(keyword.value, ast.Constant):
                        return self._string_constant(keyword.value)

        # Other Snowplow patterns are detected but not yet resolved to a name
        return None

    def _extract_custom_event_name(self, node: ast.Call) -> Optional[str]:
        """Extract event name for custom tracking function."""
        # Standard format: customFunction('event_name', {...})
        if node.args:
            return self._string_constant(node.args[0])
        return None

    # ------------------------------------------------------------------
    # Property extraction
    # ------------------------------------------------------------------

    def extract_properties(self, node: ast.Call, source: str) -> "EventProperties":
        """
        Extract properties from an analytics tracking call.

        Args:
            node: The function call AST node
            source: The detected analytics source

        Returns:
            Dictionary of properties with their types (possibly partial when an
            unexpected node shape interrupts extraction)
        """
        properties = {}

        try:
            # User/distinct ID, if the call carries one
            properties.update(self._extract_user_id(node, source))

            if source == 'snowplow':
                # Snowplow passes properties as StructuredEvent keyword arguments
                properties.update(self._extract_snowplow_properties(node))
            else:
                props_node = self._get_properties_node(node, source)
                if isinstance(props_node, ast.Dict):
                    properties.update(self._extract_dict_properties(props_node, source))

        except Exception:
            # Best effort: return what has been collected so far
            pass

        return properties

    def _extract_user_id(self, node: ast.Call, source: str) -> "EventProperties":
        """Extract user/distinct ID from tracking call if present."""
        user_id_props = {}

        if source in ('segment', 'rudderstack'):
            # Format: analytics.track(user_id, ...)
            if node.args and self._is_non_null_value(node.args[0]):
                user_id_props["user_id"] = {"type": "string"}

        elif source == 'mixpanel':
            # Format: mp.track(distinct_id, ...)
            if node.args and self._is_non_null_value(node.args[0]):
                user_id_props["distinct_id"] = {"type": "string"}

        elif source == 'amplitude':
            # Check BaseEvent for user_id parameter
            user_id_props.update(self._extract_amplitude_user_id(node))

        elif source == 'posthog':
            # Check if event is not anonymous and extract distinct_id
            user_id_props.update(self._extract_posthog_user_id(node))

        return user_id_props

    def _is_non_null_value(self, node: ast.AST) -> bool:
        """Check if a node represents a non-null value."""
        if isinstance(node, ast.Constant):
            return node.value is not None
        # A variable reference is assumed to be non-null
        return isinstance(node, ast.Name)

    def _extract_amplitude_user_id(self, node: ast.Call) -> "EventProperties":
        """Extract user_id from Amplitude BaseEvent call."""
        if not node.args or not isinstance(node.args[0], ast.Call):
            return {}

        for keyword in node.args[0].keywords:
            if keyword.arg == 'user_id' and self._is_non_null_value(keyword.value):
                return {"user_id": {"type": "string"}}
        return {}

    def _extract_posthog_user_id(self, node: ast.Call) -> "EventProperties":
        """Extract distinct_id from PostHog call if not anonymous."""
        # An event is anonymous when its properties contain
        # '$process_person_profile': False
        props_node = self._get_properties_node(node, 'posthog')

        if isinstance(props_node, ast.Dict):
            for key_node, value_node in zip(props_node.keys, props_node.values):
                if (isinstance(key_node, ast.Constant)
                        and key_node.value == '$process_person_profile'):
                    if isinstance(value_node, ast.Constant) and value_node.value is False:
                        return {}  # Anonymous event

        if node.args:
            distinct_id_node = node.args[0]
            if isinstance(distinct_id_node, ast.Constant):
                # Literal IDs must be truthy (rejects None, '' and 0)
                if distinct_id_node.value:
                    return {"distinct_id": {"type": "string"}}
            elif isinstance(distinct_id_node, ast.Name):
                # Variable reference: assumed non-null, consistent with the
                # Segment/Rudderstack/Mixpanel handling
                return {"distinct_id": {"type": "string"}}
        return {}

    def _get_properties_node(self, node: ast.Call, source: str) -> Optional[ast.AST]:
        """
        Get the node holding the event properties for the given source.

        The returned node is not guaranteed to be an ``ast.Dict``; callers
        check its type before reading keys.
        """
        if source in ('segment', 'rudderstack', 'mixpanel'):
            # Properties are in the third argument
            if len(node.args) > 2:
                return node.args[2]

        elif source == 'amplitude':
            # Look for event_properties in BaseEvent
            if node.args and isinstance(node.args[0], ast.Call):
                for keyword in node.args[0].keywords:
                    if keyword.arg == 'event_properties' and isinstance(keyword.value, ast.Dict):
                        return keyword.value

        elif source == 'custom':
            # Properties are in the second argument
            if len(node.args) > 1:
                return node.args[1]

        elif source == 'posthog':
            # Named parameter first, then third positional argument
            for keyword in node.keywords:
                if keyword.arg == 'properties' and isinstance(keyword.value, ast.Dict):
                    return keyword.value
            if len(node.args) > 2:
                return node.args[2]

        # Snowplow properties are StructuredEvent keyword arguments and are
        # handled by _extract_snowplow_properties, so there is no dict node.
        return None

    def _extract_dict_properties(self, dict_node: ast.Dict, source: str) -> "EventProperties":
        """Extract properties from a dictionary node."""
        properties = {}

        for key_node, value_node in zip(dict_node.keys, dict_node.values):
            # key_node is None for `**spread` entries; non-literal keys and
            # keys JSON cannot represent are skipped.
            if not isinstance(key_node, ast.Constant):
                continue
            key = key_node.value
            if not isinstance(key, self._JSON_KEY_TYPES):
                continue

            # PostHog $set / $set_once: flatten nested keys as "$set.<name>"
            if source == 'posthog' and key in ('$set', '$set_once'):
                if isinstance(value_node, ast.Dict):
                    for nested_key, nested_value in self.extract_nested_dict(value_node).items():
                        properties[f"{key}.{nested_key}"] = nested_value
                continue

            # Skip PostHog internal properties
            if source == 'posthog' and key == '$process_person_profile':
                continue

            prop_type = self._extract_property_type(value_node)
            if prop_type:
                properties[key] = prop_type

        return properties

    def _extract_snowplow_properties(self, node: ast.Call) -> "EventProperties":
        """Extract properties from Snowplow tracking call."""
        properties = {}

        # Pattern: tracker.track(StructuredEvent(...))
        if not node.args:
            return properties

        first_arg = node.args[0]
        if not (isinstance(first_arg, ast.Call)
                and isinstance(first_arg.func, ast.Name)
                and first_arg.func.id == 'StructuredEvent'):
            return properties

        # Every keyword argument except 'action' (the event name) is a property
        for keyword in first_arg.keywords:
            if keyword.arg and keyword.arg != 'action':
                # Map property_ to property for consistency
                prop_name = 'property' if keyword.arg == 'property_' else keyword.arg
                prop_type = self._extract_property_type(keyword.value)
                if prop_type:
                    properties[prop_name] = prop_type

        return properties

    def _extract_property_type(self, value_node: ast.AST) -> Optional[Dict[str, Any]]:
        """
        Extract the type information for a property value.

        Returns None for value expressions that are not literals, names,
        dictionaries, lists or tuples.
        """
        if isinstance(value_node, ast.Constant):
            return {"type": self.get_value_type(value_node.value)}

        if isinstance(value_node, ast.Name):
            # Use the recorded type of this variable when there is one
            var_type = self.var_types.get(value_node.id)
            if var_type is None:
                return {"type": "any"}
            return var_type if isinstance(var_type, dict) else {"type": var_type}

        if isinstance(value_node, ast.Dict):
            # Nested dictionary
            return {
                "type": "object",
                "properties": self.extract_nested_dict(value_node)
            }

        if isinstance(value_node, (ast.List, ast.Tuple)):
            # Array/list/tuple
            return {
                "type": "array",
                "items": self.infer_sequence_item_type(value_node)
            }

        return None

    def infer_sequence_item_type(self, seq_node: Union[ast.List, ast.Tuple]) -> Dict[str, str]:
        """
        Analyze a sequence (list or tuple) to determine the type of its items.

        Args:
            seq_node: The list or tuple AST node

        Returns:
            Dictionary representing the item type
        """
        if not getattr(seq_node, 'elts', None):
            return {"type": "any"}

        # Get types of all elements
        element_types = []
        for element in seq_node.elts:
            if isinstance(element, ast.Constant):
                element_types.append(self.get_value_type(element.value))
            elif isinstance(element, ast.Name):
                var_type = self.var_types.get(element.id)
                element_types.append(var_type if isinstance(var_type, str) else "any")
            elif isinstance(element, ast.Dict):
                element_types.append("object")
            elif isinstance(element, (ast.List, ast.Tuple)):
                element_types.append("array")
            else:
                element_types.append("any")

        # Determine the common type
        unique_types = set(element_types)

        if len(unique_types) == 1:
            return {"type": element_types[0]}
        if unique_types <= {"number", "string"}:
            # Common mixed case - numbers and strings
            return {"type": "string"}
        if unique_types <= {"number", "boolean"}:
            # Numbers and booleans
            return {"type": "number"}
        # Mixed types
        return {"type": "any"}

    def extract_nested_dict(self, dict_node: ast.Dict) -> "EventProperties":
        """
        Extract properties from a nested dictionary.

        Args:
            dict_node: The dictionary AST node

        Returns:
            Dictionary of properties with their types
        """
        nested_props = {}

        for key_node, value_node in zip(dict_node.keys, dict_node.values):
            # Same key filtering as for top-level property dictionaries
            if not isinstance(key_node, ast.Constant):
                continue
            if not isinstance(key_node.value, self._JSON_KEY_TYPES):
                continue

            prop_type = self._extract_property_type(value_node)
            if prop_type:
                nested_props[key_node.value] = prop_type

        return nested_props

    def get_value_type(self, value: Any) -> str:
        """
        Determine the JSON Schema type for a Python value.

        Args:
            value: The Python value

        Returns:
            String representing the JSON Schema type
        """
        # bool must be tested before int because bool is a subclass of int
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, str):
            return "string"
        if isinstance(value, (int, float)):
            return "number"
        if value is None:
            return "null"
        return "any"
758
def analyze_python_code(code: str, filepath: str, custom_function: Optional[str] = None) -> str:
    """
    Analyze Python code for analytics tracking calls.

    The source is parsed into an AST and walked with a TrackingVisitor; the
    events it collects are serialized to JSON.

    Args:
        code: The Python source code to analyze
        filepath: Path to the file being analyzed (recorded on each event)
        custom_function: Optional name of a custom tracking function

    Returns:
        JSON string containing an array of tracking events; the empty array
        "[]" when parsing, traversal or serialization raises
    """
    try:
        syntax_tree = ast.parse(code)
        tracker = TrackingVisitor(filepath, custom_function)
        tracker.visit(syntax_tree)
        serialized = json.dumps(tracker.events)
    except Exception:
        # Best effort: any failure is reported as "no events found"
        serialized = json.dumps([])
    return serialized
786
+
787
# Command-line interface: analyze one file and print its events as JSON.
if __name__ == "__main__":
    import argparse
    import sys
    import tokenize

    parser = argparse.ArgumentParser(
        description='Analyze Python code for analytics tracking calls',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Example: %(prog)s myapp.py [--custom-function track_event]"
    )
    parser.add_argument('file', help='Python file to analyze')
    parser.add_argument(
        '-c', '--custom-function',
        help='Name of custom tracking function to detect'
    )
    args = parser.parse_args()

    try:
        # tokenize.open decodes the file the way the interpreter would (BOM or
        # PEP 263 coding cookie, UTF-8 otherwise) instead of relying on the
        # platform's locale encoding.
        with tokenize.open(args.file) as f:
            code = f.read()
        result = analyze_python_code(code, args.file, args.custom_function)
        print(result)
    except FileNotFoundError:
        print(f"Error: File '{args.file}' not found", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Any other failure (unreadable file, undecodable bytes, ...) is
        # reported on stderr with a non-zero exit status.
        print(f"Error analyzing file: {e}", file=sys.stderr)
        sys.exit(1)