@flisk/analyze-tracking 0.5.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,9 @@ const ts = require('typescript');
3
3
  const { getAllFiles } = require('../fileProcessor');
4
4
  const { analyzeJsFile } = require('./analyzeJsFile');
5
5
  const { analyzeTsFile } = require('./analyzeTsFile');
6
+ const { analyzePythonFile } = require('./analyzePythonFile');
6
7
  const { analyzeRubyFile } = require('./analyzeRubyFile');
8
+ const { analyzeGoFile } = require('./analyzeGoFile');
7
9
 
8
10
  async function analyzeDirectory(dirPath, customFunction) {
9
11
  const allEvents = {};
@@ -20,14 +22,20 @@ async function analyzeDirectory(dirPath, customFunction) {
20
22
 
21
23
  const isJsFile = /\.(jsx?)$/.test(file);
22
24
  const isTsFile = /\.(tsx?)$/.test(file);
25
+ const isPythonFile = /\.(py)$/.test(file);
23
26
  const isRubyFile = /\.(rb)$/.test(file);
27
+ const isGoFile = /\.(go)$/.test(file);
24
28
 
25
29
  if (isJsFile) {
26
30
  events = analyzeJsFile(file, customFunction);
27
31
  } else if (isTsFile) {
28
32
  events = analyzeTsFile(file, tsProgram, customFunction);
33
+ } else if (isPythonFile) {
34
+ events = await analyzePythonFile(file, customFunction);
29
35
  } else if (isRubyFile) {
30
- events = await analyzeRubyFile(file);
36
+ events = await analyzeRubyFile(file, customFunction);
37
+ } else if (isGoFile) {
38
+ events = await analyzeGoFile(file, customFunction);
31
39
  } else {
32
40
  console.info(`Skipping file ${file} because it is not a supported file type`);
33
41
  continue;
@@ -0,0 +1,439 @@
1
+ import ast
2
+ import json
3
+
4
class TrackingVisitor(ast.NodeVisitor):
    """AST visitor that collects analytics tracking calls from Python source.

    Each recognised call is appended to ``self.events`` as a dict with the keys
    ``eventName``, ``source``, ``properties``, ``filePath``, ``line`` and
    ``functionName``. Property types are inferred from literals, from
    annotated parameters/variables and from simple constant assignments in the
    current function or class scope.
    """

    # (object name, method name) -> source label for ``obj.method(...)`` calls.
    _METHOD_SOURCES = {
        ('analytics', 'track'): 'segment',
        ('mixpanel', 'track'): 'mixpanel',
        ('amplitude', 'track'): 'amplitude',
        ('rudder_analytics', 'track'): 'rudderstack',
        ('mParticle', 'logEvent'): 'mparticle',
        ('posthog', 'capture'): 'posthog',
        ('pendo', 'track'): 'pendo',
        ('heap', 'track'): 'heap',
    }

    # Bare function names that are treated as Snowplow struct-event calls.
    _SNOWPLOW_FUNCTIONS = ('trackStructEvent', 'buildStructEvent')

    # Sources whose call shape is ``fn('event_name', {properties})``.
    _NAME_FIRST_SOURCES = (
        'segment', 'mixpanel', 'amplitude', 'rudderstack',
        'mparticle', 'pendo', 'heap', 'custom',
    )

    # Annotation name -> schema type name.
    _SIMPLE_TYPES = {
        'int': 'number',
        'float': 'number',
        'str': 'string',
        'bool': 'boolean',
        'None': 'null',
        'NoneType': 'null',
    }

    _SEQUENCE_CONTAINERS = ('List', 'Tuple', 'Set', 'list', 'tuple', 'set')
    _MAPPING_CONTAINERS = ('Dict', 'dict')

    def __init__(self, filepath, custom_function=None):
        """
        Args:
            filepath: Path recorded in the ``filePath`` field of every event.
            custom_function: Optional name of a bare function to treat as a
                tracking call with the ``fn('event_name', {...})`` shape.
        """
        self.events = []
        self.filepath = filepath
        self.current_function = 'global'
        self.function_stack = []
        # Variable name -> inferred type for the scope being visited.
        self.var_types = {}
        # Saved var_types of the enclosing scopes.
        self.var_types_stack = []
        self.custom_function = custom_function

    # ------------------------------------------------------------------
    # Scope handling
    # ------------------------------------------------------------------

    def visit_FunctionDef(self, node):
        """Enter a function scope, record annotated parameters, visit the body."""
        self.function_stack.append(self.current_function)
        self.var_types_stack.append(self.var_types)

        self.var_types = {}
        self.current_function = node.name

        # Cover positional-only and keyword-only parameters as well as the
        # regular ones; ``posonlyargs`` is read defensively for older ASTs.
        arguments = node.args
        parameters = (
            list(getattr(arguments, 'posonlyargs', []))
            + list(arguments.args)
            + list(arguments.kwonlyargs)
        )
        for parameter in parameters:
            if parameter.annotation:
                self.var_types[parameter.arg] = self.extract_type_annotation(
                    parameter.annotation
                )

        self.generic_visit(node)

        self.current_function = self.function_stack.pop()
        self.var_types = self.var_types_stack.pop()

    # ``async def`` has the same node layout; without this alias calls inside
    # coroutines would be attributed to the enclosing scope.
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_ClassDef(self, node):
        """Enter a class scope; the class name is used as the function context."""
        self.function_stack.append(self.current_function)
        self.var_types_stack.append(self.var_types)

        self.var_types = {}
        self.current_function = node.name

        self.generic_visit(node)

        self.current_function = self.function_stack.pop()
        self.var_types = self.var_types_stack.pop()

    # ------------------------------------------------------------------
    # Type tracking
    # ------------------------------------------------------------------

    def extract_type_annotation(self, annotation):
        """Map an annotation node to a schema type.

        Returns:
            A type name (``'number'``, ``'string'``, ``'boolean'``, ``'null'``,
            ``'array'``, ``'object'`` or ``'any'``), or for a sequence with a
            simple element type a dict ``{'type': 'array', 'items': {...}}``.
        """
        # ``None`` in an annotation is parsed as a constant, not as a name.
        if isinstance(annotation, ast.Constant) and annotation.value is None:
            return 'null'

        if isinstance(annotation, ast.Name):
            return self._SIMPLE_TYPES.get(annotation.id, 'any')

        if isinstance(annotation, ast.Subscript) and isinstance(annotation.value, ast.Name):
            container = annotation.value.id
            if container in self._SEQUENCE_CONTAINERS:
                element = annotation.slice
                # Python 3.8 wraps the subscript in an ``Index`` node; compare
                # by class name so newer versions never touch ``ast.Index``.
                if type(element).__name__ == 'Index':
                    element = element.value
                if isinstance(element, ast.Name):
                    return {
                        'type': 'array',
                        'items': {'type': self.extract_type_annotation(element)},
                    }
                return 'array'
            if container in self._MAPPING_CONTAINERS:
                return 'object'

        # Unknown or complex annotations.
        return 'any'

    def visit_AnnAssign(self, node):
        """Record the annotated type of ``name: T`` / ``name: T = value``."""
        if isinstance(node.target, ast.Name) and node.annotation:
            self.var_types[node.target.id] = self.extract_type_annotation(node.annotation)
        self.generic_visit(node)

    def visit_Assign(self, node):
        """Record the type of ``name = <literal>`` assignments."""
        if len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
            if isinstance(node.value, ast.Constant):
                self.var_types[node.targets[0].id] = self.get_value_type(node.value.value)
        self.generic_visit(node)

    def get_value_type(self, value):
        """Return the schema type name for a Python literal value."""
        # bool must be tested before int: ``bool`` is a subclass of ``int``.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, str):
            return "string"
        if isinstance(value, (int, float)):
            return "number"
        if value is None:
            return "null"
        return "any"

    # ------------------------------------------------------------------
    # Call detection
    # ------------------------------------------------------------------

    def visit_Call(self, node):
        """Record an event when the call is a recognised tracking call."""
        source = self.detect_source(node)
        if source:
            event_name = self.extract_event_name(node, source)
            if event_name:
                self.events.append({
                    "eventName": event_name,
                    "source": source,
                    "properties": self.extract_properties(node, source),
                    "filePath": self.filepath,
                    "line": node.lineno,
                    "functionName": self.current_function
                })

        # Tracking calls may be nested inside arguments of other calls.
        self.generic_visit(node)

    def detect_source(self, node):
        """Return the source label for a call node, or ``None`` if unrecognised."""
        func = node.func

        if isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name):
            label = self._METHOD_SOURCES.get((func.value.id, func.attr))
            if label:
                return label

        if isinstance(func, ast.Name):
            if func.id in self._SNOWPLOW_FUNCTIONS:
                return 'snowplow'

            # snowplow('trackStructEvent', {...})
            if (func.id == 'snowplow' and node.args
                    and isinstance(node.args[0], ast.Constant)
                    and node.args[0].value == 'trackStructEvent'):
                return 'snowplow'

            if self.custom_function and func.id == self.custom_function:
                return 'custom'

        return None

    def _snowplow_props_node(self, node):
        """Return the argument holding the Snowplow struct-event dict, if any."""
        if isinstance(node.func, ast.Name):
            if node.func.id in self._SNOWPLOW_FUNCTIONS and len(node.args) >= 1:
                return node.args[0]
            if node.func.id == 'snowplow' and len(node.args) >= 2:
                return node.args[1]
        return None

    def extract_event_name(self, node, source):
        """Return the literal event name of a tracking call, or ``None``.

        Only constant (literal) names are extracted; names computed at runtime
        yield ``None`` and the call is then not recorded by ``visit_Call``.
        """
        try:
            if source in self._NAME_FIRST_SOURCES:
                # library.track('event_name', {...})
                if node.args and isinstance(node.args[0], ast.Constant):
                    return node.args[0].value

            elif source == 'posthog':
                # posthog.capture('distinct_id', event='event_name', ...)
                for keyword in node.keywords:
                    if keyword.arg == 'event' and isinstance(keyword.value, ast.Constant):
                        return keyword.value.value
                # posthog.capture('distinct_id', 'event_name', {...})
                if len(node.args) >= 2 and isinstance(node.args[1], ast.Constant):
                    return node.args[1].value

            elif source == 'snowplow':
                # Snowplow struct events use the 'action' entry as event name.
                props_node = self._snowplow_props_node(node)
                if isinstance(props_node, ast.Dict):
                    for key_node, value_node in zip(props_node.keys, props_node.values):
                        if (isinstance(key_node, ast.Constant)
                                and key_node.value == 'action'
                                and isinstance(value_node, ast.Constant)):
                            return value_node.value
        except Exception:
            # Best effort: an unexpected node shape must not abort the scan.
            pass
        return None

    # ------------------------------------------------------------------
    # Property extraction
    # ------------------------------------------------------------------

    def extract_properties(self, node, source):
        """Return ``{property name: schema}`` for the call's properties dict.

        Only dict literals are inspected. Entries whose value is not a
        literal, a variable name, a dict or a list/tuple are omitted.
        """
        properties = {}
        try:
            props_node = None

            if source in self._NAME_FIRST_SOURCES:
                if len(node.args) > 1:
                    props_node = node.args[1]

            elif source == 'posthog':
                # Named ``properties={...}`` wins over the third positional arg.
                for keyword in node.keywords:
                    if keyword.arg == 'properties' and isinstance(keyword.value, ast.Dict):
                        props_node = keyword.value
                if props_node is None and len(node.args) > 2:
                    props_node = node.args[2]

                # The distinct_id is reported as a property unless the event
                # opts out of person profiles, whichever way the properties
                # dict was passed.
                if (not self._is_anonymous_posthog_event(props_node)
                        and node.args
                        and isinstance(node.args[0], ast.Constant)
                        and node.args[0].value):
                    properties["distinct_id"] = {"type": "string"}

            elif source == 'snowplow':
                props_node = self._snowplow_props_node(node)

            if isinstance(props_node, ast.Dict):
                for key_node, value_node in zip(props_node.keys, props_node.values):
                    # ``**spread`` entries have a ``None`` key and are skipped.
                    if not isinstance(key_node, ast.Constant):
                        continue
                    key = key_node.value

                    if source == 'posthog':
                        # $set / $set_once are flattened to "<key>.<nested>".
                        if key in ('$set', '$set_once'):
                            if isinstance(value_node, ast.Dict):
                                nested = self.extract_nested_dict(value_node)
                                for nested_key, nested_schema in nested.items():
                                    properties[f"{key}.{nested_key}"] = nested_schema
                            continue
                        # Internal flag, not an event property.
                        if key == '$process_person_profile':
                            continue

                    schema = self._describe_value(value_node)
                    if schema is not None:
                        properties[key] = schema
        except Exception:
            # Best effort: keep whatever was collected before the failure.
            pass
        return properties

    def _is_anonymous_posthog_event(self, props_node):
        """True when the dict literal sets ``$process_person_profile`` to False."""
        if not isinstance(props_node, ast.Dict):
            return False
        for key_node, value_node in zip(props_node.keys, props_node.values):
            if (isinstance(key_node, ast.Constant)
                    and key_node.value == '$process_person_profile'
                    and isinstance(value_node, ast.Constant)
                    and value_node.value is False):
                return True
        return False

    def _describe_value(self, value_node):
        """Return the schema dict for a property value node, or ``None``."""
        if isinstance(value_node, ast.Constant):
            return {"type": self.get_value_type(value_node.value)}

        if isinstance(value_node, ast.Name):
            known_type = self.var_types.get(value_node.id, "any")
            # Structured types (e.g. typed arrays) already are a full schema.
            if isinstance(known_type, dict):
                return dict(known_type)
            return {"type": known_type}

        if isinstance(value_node, ast.Dict):
            return {
                "type": "object",
                "properties": self.extract_nested_dict(value_node)
            }

        if isinstance(value_node, (ast.List, ast.Tuple)):
            return {
                "type": "array",
                "items": self.infer_sequence_item_type(value_node)
            }

        return None

    def infer_sequence_item_type(self, seq_node):
        """Return ``{"type": ...}`` describing the items of a list/tuple literal."""
        elements = getattr(seq_node, 'elts', None)
        if not elements:
            return {"type": "any"}

        element_types = []
        for element in elements:
            if isinstance(element, ast.Constant):
                element_types.append(self.get_value_type(element.value))
            elif isinstance(element, ast.Name):
                known_type = self.var_types.get(element.id, "any")
                # Structured types are dicts (unhashable); reduce them to
                # their type name so they can take part in the set logic.
                if isinstance(known_type, dict):
                    known_type = known_type.get("type", "any")
                element_types.append(known_type)
            elif isinstance(element, ast.Dict):
                element_types.append("object")
            elif isinstance(element, (ast.List, ast.Tuple)):
                element_types.append("array")
            else:
                element_types.append("any")

        distinct = set(element_types)
        if len(distinct) == 1:
            return {"type": element_types[0]}
        # Common mixed cases are widened to the more general type.
        if distinct <= {"number", "string"}:
            return {"type": "string"}
        if distinct <= {"number", "boolean"}:
            return {"type": "number"}
        return {"type": "any"}

    def extract_nested_dict(self, dict_node):
        """Return ``{key: schema}`` for a nested dict literal (recursive)."""
        nested_props = {}
        for key_node, value_node in zip(dict_node.keys, dict_node.values):
            if not isinstance(key_node, ast.Constant):
                continue
            schema = self._describe_value(value_node)
            if schema is not None:
                nested_props[key_node.value] = schema
        return nested_props
410
+
411
def analyze_python_code(code, filepath, custom_function=None):
    """Scan Python source text for tracking calls and return them as JSON.

    Args:
        code: Python source code to parse.
        filepath: Path passed on to the visitor alongside the source.
        custom_function: Optional custom tracking function name passed on to
            the visitor.

    Returns:
        A JSON string serialising the visitor's ``events`` list.
    """
    syntax_tree = ast.parse(code)
    tracker = TrackingVisitor(filepath, custom_function)
    tracker.visit(syntax_tree)
    return json.dumps(tracker.events)
419
+
420
if __name__ == "__main__":
    # Command-line entry point: analyze one file and print the events as JSON.
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='Analyze Python code for tracking calls')
    parser.add_argument('file', help='Python file to analyze')
    parser.add_argument('-c', '--custom-function', help='Name of custom tracking function')
    args = parser.parse_args()

    try:
        # Python source files are UTF-8 by default; reading with the platform
        # locale encoding would fail on non-ASCII sources on some systems.
        with open(args.file, 'r', encoding='utf-8') as f:
            code = f.read()
        result = analyze_python_code(code, args.file, args.custom_function)
        print(result)
    except FileNotFoundError:
        print(f"Error: File '{args.file}' not found")
        sys.exit(1)
    except Exception as e:
        # Covers syntax errors in the analyzed file and any other failure.
        print(f"Error analyzing file: {str(e)}")
        sys.exit(1)