code-analyser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,512 @@
1
+ """
2
+ Similarity analysis for detecting potential plagiarism in code submissions
3
+ """
4
+
5
+ import ast
6
+ import difflib
7
+ from dataclasses import dataclass
8
+ from enum import Enum
9
+ from typing import Any
10
+
11
+ import structlog
12
+
13
# Module-level structured logger used by the analyzer and detector classes below.
logger = structlog.get_logger()
14
+
15
+
16
class SimilarityMethod(Enum):
    """Identifiers for the similarity analysis methods a caller may request."""

    # Comparison of features extracted from the parsed syntax tree.
    AST_STRUCTURAL = "ast_structural"
    # Comparison of the token sequences of both sources.
    TOKEN_BASED = "token_based"
    # Comparison of the stripped, non-empty source lines.
    LINE_BASED = "line_based"
    # Comparison of function names, arguments and body sizes.
    FUNCTION_SIGNATURE = "function_signature"
    # Variable naming patterns.
    VARIABLE_PATTERN = "variable_pattern"
23
+
24
+
25
@dataclass
class SimilarityMatch:
    """One finding produced by a single similarity method for a pair of sources."""

    # Method that produced this finding.
    method: SimilarityMethod
    # Similarity score, 0.0 to 1.0.
    score: float
    # Confidence in the score, 0.0 to 1.0.
    confidence: float
    # Method-specific evidence (for example shared names, tokens or lines).
    matched_sections: dict[str, Any]
    # Human-readable summary of the finding.
    explanation: str
33
+
34
+
35
@dataclass
class SimilarityResult:
    """Outcome of comparing two code submissions with one or more methods."""

    # Combined score across methods, 0.0 to 1.0.
    overall_score: float
    # Individual findings that contributed to the result.
    matches: list[SimilarityMatch]
    # Whether the pair was flagged.
    flagged: bool
    # Threshold that was in effect for the flagging decision.
    threshold_used: float
    # Methods that were requested for this comparison.
    methods_used: list[SimilarityMethod]

    # Per-method breakdown; each stays at 0.0 unless a value is supplied.
    structural_similarity: float = 0.0
    token_similarity: float = 0.0
    line_similarity: float = 0.0
    function_similarity: float = 0.0
49
+
50
+
51
class PythonSimilarityAnalyzer:
    """Similarity analyzer specifically for Python code.

    Runs several independent comparisons (AST features, tokens, lines and
    function signatures) and combines the scores of the comparisons that
    produced a match into one weighted overall score, which is then compared
    with ``threshold`` to decide whether the pair is flagged.
    """

    def __init__(self, threshold: float = 0.8):
        """Store the overall score at or above which a pair is flagged."""
        self.threshold = threshold

    def analyze_similarity(
        self,
        code1: str,
        code2: str,
        methods: list[SimilarityMethod] | None = None
    ) -> SimilarityResult:
        """
        Analyze similarity between two Python code submissions

        Args:
            code1: First code submission
            code2: Second code submission
            methods: Similarity methods to use (default: all). Only the AST,
                token, line and function-signature methods have an analyzer
                in this class; any other requested method contributes nothing.

        Returns:
            SimilarityResult with detailed similarity analysis. If an
            unexpected error occurs, it is logged and an unflagged result
            with an overall score of 0.0 is returned.
        """
        if methods is None:
            methods = list(SimilarityMethod)

        matches = []
        scores = {}

        try:
            # (requested method, key in ``scores``, analyzer) in evaluation order.
            analyzers = (
                (SimilarityMethod.AST_STRUCTURAL, 'structural', self._analyze_ast_similarity),
                (SimilarityMethod.TOKEN_BASED, 'token', self._analyze_token_similarity),
                (SimilarityMethod.LINE_BASED, 'line', self._analyze_line_similarity),
                (SimilarityMethod.FUNCTION_SIGNATURE, 'function', self._analyze_function_similarity),
            )

            for method, score_key, analyzer in analyzers:
                if method not in methods:
                    continue
                match = analyzer(code1, code2)
                # Analyzers return None when they found nothing worth reporting.
                if match is not None:
                    matches.append(match)
                    scores[score_key] = match.score

            # Calculate overall score (weighted average)
            overall_score = self._calculate_overall_score(scores)

            result = SimilarityResult(
                overall_score=overall_score,
                matches=matches,
                flagged=overall_score >= self.threshold,
                threshold_used=self.threshold,
                methods_used=methods,
                structural_similarity=scores.get('structural', 0.0),
                token_similarity=scores.get('token', 0.0),
                line_similarity=scores.get('line', 0.0),
                function_similarity=scores.get('function', 0.0)
            )

            logger.info("Similarity analysis completed",
                        overall_score=overall_score,
                        flagged=result.flagged,
                        matches_count=len(matches))

            return result

        except Exception as e:
            # Best-effort boundary: report the failure and return a neutral result.
            logger.error("Similarity analysis failed", error=str(e))
            return SimilarityResult(
                overall_score=0.0,
                matches=[],
                flagged=False,
                threshold_used=self.threshold,
                methods_used=methods
            )

    def _analyze_ast_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze structural similarity using AST comparison.

        Returns:
            A match when the feature similarity exceeds 0.1, otherwise None.
            None is also returned (after logging) when either source fails to
            parse or any other error occurs.
        """
        try:
            features1 = self._extract_ast_features(ast.parse(code1))
            features2 = self._extract_ast_features(ast.parse(code2))

            similarity = self._compare_ast_features(features1, features2)

            if similarity > 0.1:  # Only report significant similarities
                return SimilarityMatch(
                    method=SimilarityMethod.AST_STRUCTURAL,
                    score=similarity,
                    confidence=0.9,
                    matched_sections={
                        "common_patterns": self._find_common_ast_patterns(features1, features2)
                    },
                    explanation=f"Structural similarity: {similarity:.2f} based on AST analysis"
                )

        except SyntaxError:
            logger.warning("Syntax error in code - skipping AST analysis")
        except Exception as e:
            logger.error("AST similarity analysis failed", error=str(e))

        return None

    def _extract_ast_features(self, tree: ast.AST) -> dict[str, Any]:
        """Extract structural features from AST.

        Returns:
            A dict of lists keyed by feature name: every node type name,
            function names (sync and async), class names, control structure
            type names (If/While/For), the sorted distinct identifiers, and
            imported module names. ``nesting_pattern`` is present but is not
            populated by this method.
        """
        features: dict[str, Any] = {
            'node_types': [],
            'function_names': [],
            'class_names': [],
            'control_structures': [],
            'nesting_pattern': [],
            'variable_names': set(),
            'import_modules': []
        }

        for node in ast.walk(tree):
            features['node_types'].append(type(node).__name__)

            # ``async def`` has the same ``name`` field as ``def`` and is
            # treated as a function as well.
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                features['function_names'].append(node.name)
            elif isinstance(node, ast.ClassDef):
                features['class_names'].append(node.name)
            elif isinstance(node, (ast.If, ast.While, ast.For)):
                features['control_structures'].append(type(node).__name__)
            elif isinstance(node, ast.Name):
                features['variable_names'].add(node.id)
            elif isinstance(node, ast.Import):
                features['import_modules'].extend(alias.name for alias in node.names)
            elif isinstance(node, ast.ImportFrom):
                if node.module:
                    features['import_modules'].append(node.module)

        # Sorted so the resulting list does not depend on set iteration order.
        features['variable_names'] = sorted(features['variable_names'])

        return features

    def _compare_ast_features(self, features1: dict[str, Any], features2: dict[str, Any]) -> float:
        """Compare AST features and return a similarity score in [0.0, 1.0].

        Each weighted feature is compared with Jaccard similarity on its set
        of distinct values. Features that are empty on both sides carry no
        evidence either way, so they are left out and the remaining weights
        are renormalized; this lets identical code score 1.0 even when it
        has, for example, no classes or imports.
        """
        weights = {
            'node_types': 0.3,
            'function_names': 0.2,
            'class_names': 0.15,
            'control_structures': 0.15,
            'variable_names': 0.1,
            'import_modules': 0.1
        }

        weighted_sum = 0.0
        active_weight = 0.0

        for feature, weight in weights.items():
            set1 = set(features1.get(feature, ()))
            set2 = set(features2.get(feature, ()))
            union = set1 | set2
            if not union:
                continue

            weighted_sum += weight * (len(set1 & set2) / len(union))
            active_weight += weight

        return weighted_sum / active_weight if active_weight > 0 else 0.0

    def _find_common_ast_patterns(self, features1: dict[str, Any], features2: dict[str, Any]) -> dict[str, list]:
        """Find common patterns in AST features.

        Returns:
            For function names, class names and control structures, the sorted
            values present in both feature dicts; feature types with nothing
            in common are omitted.
        """
        common_patterns = {}

        for feature_type in ('function_names', 'class_names', 'control_structures'):
            shared = set(features1.get(feature_type, ())) & set(features2.get(feature_type, ()))
            if shared:
                common_patterns[feature_type] = sorted(shared)

        return common_patterns

    def _analyze_token_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze similarity based on code tokens.

        Returns:
            A match when the token sequence ratio exceeds 0.3, otherwise None
            (also None, after logging, if an error occurs).
        """
        try:
            tokens1 = self._tokenize_code(code1)
            tokens2 = self._tokenize_code(code2)

            # autojunk is disabled: with it, sequences of 200+ items have their
            # frequent elements ignored, which can hide matches in repetitive code.
            matcher = difflib.SequenceMatcher(None, tokens1, tokens2, autojunk=False)
            similarity = matcher.ratio()

            if similarity > 0.3:  # Only report significant token similarities
                common_tokens = self._find_common_token_sequences(tokens1, tokens2)

                return SimilarityMatch(
                    method=SimilarityMethod.TOKEN_BASED,
                    score=similarity,
                    confidence=0.7,
                    matched_sections={
                        "common_tokens": common_tokens[:10],  # First 10 common sequences
                        "total_common_tokens": len(common_tokens)
                    },
                    explanation=f"Token similarity: {similarity:.2f} based on code token analysis"
                )

        except Exception as e:
            logger.error("Token similarity analysis failed", error=str(e))

        return None

    def _tokenize_code(self, code: str) -> list[str]:
        """Split code into word and single-symbol tokens, dropping common keywords."""
        import re
        # Runs of word characters, or any single non-word, non-space character.
        tokens = re.findall(r'\w+|[^\w\s]', code)
        # Very common keywords are removed (case-insensitively).
        common_tokens = {'def', 'class', 'if', 'else', 'for', 'while', 'import', 'return'}
        return [token for token in tokens if token.lower() not in common_tokens]

    def _find_common_token_sequences(self, tokens1: list[str], tokens2: list[str]) -> list[str]:
        """Return the matching runs of three or more tokens, each joined by spaces."""
        matcher = difflib.SequenceMatcher(None, tokens1, tokens2, autojunk=False)

        return [
            ' '.join(tokens1[block.a:block.a + block.size])
            for block in matcher.get_matching_blocks()
            if block.size > 2  # Only sequences of 3+ tokens
        ]

    def _analyze_line_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze similarity based on code lines.

        Lines are stripped and blank lines are discarded before comparison.

        Returns:
            A match when the line sequence ratio exceeds 0.4, otherwise None
            (also None, after logging, if an error occurs).
        """
        try:
            lines1 = [line.strip() for line in code1.split('\n') if line.strip()]
            lines2 = [line.strip() for line in code2.split('\n') if line.strip()]

            # autojunk disabled for the same reason as in the token comparison.
            matcher = difflib.SequenceMatcher(None, lines1, lines2, autojunk=False)
            similarity = matcher.ratio()

            if similarity > 0.4:  # Only report significant line similarities
                common_lines = []
                for block in matcher.get_matching_blocks():
                    if block.size > 1:  # Sequences of 2+ lines
                        common_lines.extend(lines1[block.a:block.a + block.size])

                return SimilarityMatch(
                    method=SimilarityMethod.LINE_BASED,
                    score=similarity,
                    confidence=0.6,
                    matched_sections={
                        "common_lines": common_lines[:5],  # First 5 common lines
                        "total_common_lines": len(common_lines)
                    },
                    explanation=f"Line similarity: {similarity:.2f} based on exact line matching"
                )

        except Exception as e:
            logger.error("Line similarity analysis failed", error=str(e))

        return None

    def _analyze_function_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze similarity based on function signatures and structure.

        Every function of ``code1`` is compared with every function of
        ``code2``; pairs scoring above 0.7 are kept and the match score is the
        average over the kept pairs only.

        Returns:
            A match when at least one pair was kept, otherwise None (also
            None when either source has no functions or an error occurs).
        """
        try:
            functions1 = self._extract_functions(code1)
            functions2 = self._extract_functions(code2)

            if not functions1 or not functions2:
                return None

            common_functions = []
            for func1 in functions1:
                for func2 in functions2:
                    similarity = self._compare_functions(func1, func2)
                    if similarity > 0.7:
                        common_functions.append({
                            'name1': func1['name'],
                            'name2': func2['name'],
                            'similarity': similarity
                        })

            if common_functions:
                avg_similarity = sum(f['similarity'] for f in common_functions) / len(common_functions)

                return SimilarityMatch(
                    method=SimilarityMethod.FUNCTION_SIGNATURE,
                    score=avg_similarity,
                    confidence=0.8,
                    matched_sections={
                        "common_functions": common_functions,
                        "total_functions": len(common_functions)
                    },
                    explanation=f"Function similarity: {avg_similarity:.2f} based on function structure"
                )

        except Exception as e:
            logger.error("Function similarity analysis failed", error=str(e))

        return None

    def _extract_functions(self, code: str) -> list[dict[str, Any]]:
        """Extract function information from code.

        Returns:
            One dict per ``def`` / ``async def`` found anywhere in the code,
            holding its name, positional argument names, number of top-level
            body statements, and whether a ``return`` appears anywhere inside
            it. An empty list is returned when the code does not parse.
        """
        try:
            tree = ast.parse(code)
        except SyntaxError:
            return []

        return [
            {
                'name': node.name,
                'args': [arg.arg for arg in node.args.args],
                'body_length': len(node.body),
                'returns': any(isinstance(n, ast.Return) for n in ast.walk(node))
            }
            for node in ast.walk(tree)
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]

    def _compare_functions(self, func1: dict[str, Any], func2: dict[str, Any]) -> float:
        """Compare two function descriptions and return a score in [0.0, 1.0].

        The score is a weighted sum of name similarity (0.3), shared argument
        names (0.4), body length closeness (0.2) and agreement on having a
        return statement (0.1).
        """
        score: float = 0.0

        # Name similarity
        name_sim: float = difflib.SequenceMatcher(
            None, func1['name'], func2['name'], autojunk=False
        ).ratio()
        score += name_sim * 0.3

        # Argument similarity. Two functions that both take no arguments agree
        # completely on this component rather than scoring zero.
        args1, args2 = set(func1['args']), set(func2['args'])
        if args1 or args2:
            arg_sim = len(args1 & args2) / max(len(args1), len(args2))
        else:
            arg_sim = 1.0
        score += arg_sim * 0.4

        # Body length similarity
        len1, len2 = func1['body_length'], func2['body_length']
        len_sim = 1 - abs(len1 - len2) / max(len1, len2, 1)
        score += len_sim * 0.2

        # Return statement similarity
        ret_sim = 1.0 if func1['returns'] == func2['returns'] else 0.0
        score += ret_sim * 0.1

        return score

    def _calculate_overall_score(self, scores: dict[str, float]) -> float:
        """Calculate weighted overall similarity score.

        Only the score types present in ``scores`` take part, and their
        weights are renormalized; unknown score types get a weight of 0.1.
        Returns 0.0 when ``scores`` is empty.
        """
        if not scores:
            return 0.0

        weights = {
            'structural': 0.4,
            'token': 0.3,
            'line': 0.2,
            'function': 0.1
        }

        weighted_sum = 0.0
        total_weight = 0.0

        for score_type, score in scores.items():
            weight = weights.get(score_type, 0.1)
            weighted_sum += score * weight
            total_weight += weight

        return weighted_sum / total_weight if total_weight > 0 else 0.0
434
+
435
+
436
class SimilarityDetector:
    """Main similarity detection service.

    Dispatches a pair of submissions to the analyzer for their language;
    only Python is handled, any other language yields an unflagged result.
    """

    def __init__(self, threshold: float = 0.8):
        """Create the detector and its Python analyzer with the given threshold."""
        self.threshold = threshold
        self.python_analyzer = PythonSimilarityAnalyzer(threshold)

    def compare_submissions(
        self,
        submission1: dict[str, Any],
        submission2: dict[str, Any],
        methods: list[SimilarityMethod] | None = None
    ) -> SimilarityResult:
        """
        Compare two code submissions for similarity

        Args:
            submission1: First submission with 'code' and 'language'
            submission2: Second submission with 'code' and 'language'
            methods: Similarity methods to use

        Returns:
            SimilarityResult with similarity analysis

        Note:
            The language is read from ``submission1`` only and defaults to
            'python'. A missing or ``None`` 'code' is treated as empty source
            and a ``None`` 'language' as the default.
        """
        # ``or`` also covers keys that are present but explicitly None.
        code1 = submission1.get('code') or ''
        code2 = submission2.get('code') or ''
        language = (submission1.get('language') or 'python').lower()

        if language == 'python':
            return self.python_analyzer.analyze_similarity(code1, code2, methods)

        logger.warning("Similarity analysis not implemented for language", language=language)
        return SimilarityResult(
            overall_score=0.0,
            matches=[],
            flagged=False,
            threshold_used=self.threshold,
            methods_used=methods or []
        )

    def batch_similarity_check(
        self,
        submissions: list[dict[str, Any]],
        methods: list[SimilarityMethod] | None = None,
        report_threshold: float = 0.3
    ) -> list[tuple[int, int, SimilarityResult]]:
        """
        Perform pairwise similarity checks on a batch of submissions

        Args:
            submissions: List of submissions to compare
            methods: Similarity methods to use
            report_threshold: Pairs whose overall score is strictly above
                this value are reported even when they are not flagged

        Returns:
            List of (index1, index2, SimilarityResult) for every pair that is
            flagged or scores above ``report_threshold``, with index1 < index2
            and pairs ordered by index1, then index2
        """
        from itertools import combinations

        results = []

        # combinations() yields each unordered pair once, in index order.
        for (i, first), (j, second) in combinations(enumerate(submissions), 2):
            similarity = self.compare_submissions(first, second, methods)

            if similarity.flagged or similarity.overall_score > report_threshold:
                results.append((i, j, similarity))

        # Reported pairs include moderate similarities, so the flagged count
        # is logged separately instead of being equated with len(results).
        logger.info("Batch similarity check completed",
                    total_comparisons=len(submissions) * (len(submissions) - 1) // 2,
                    reported_pairs=len(results),
                    flagged_pairs=sum(1 for _, _, result in results if result.flagged))

        return results
509
+
510
+
511
# Shared module-level detector instance, built with the default threshold.
similarity_detector = SimilarityDetector()
@@ -0,0 +1 @@
1
+ """API routes and endpoints"""
@@ -0,0 +1 @@
1
+ """API route modules"""