code-analyser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_analyser-0.1.0.dist-info/METADATA +283 -0
- code_analyser-0.1.0.dist-info/RECORD +34 -0
- code_analyser-0.1.0.dist-info/WHEEL +4 -0
- code_analyser-0.1.0.dist-info/licenses/LICENSE +21 -0
- codelens/__init__.py +7 -0
- codelens/__main__.py +19 -0
- codelens/analyzers/__init__.py +30 -0
- codelens/analyzers/base.py +139 -0
- codelens/analyzers/manager.py +207 -0
- codelens/analyzers/python_analyzer.py +344 -0
- codelens/analyzers/similarity_analyzer.py +512 -0
- codelens/api/__init__.py +1 -0
- codelens/api/routes/__init__.py +1 -0
- codelens/api/routes/analysis.py +441 -0
- codelens/api/routes/reports.py +438 -0
- codelens/api/routes/rubrics.py +349 -0
- codelens/api/schemas.py +305 -0
- codelens/cli.py +297 -0
- codelens/core/__init__.py +1 -0
- codelens/core/config.py +91 -0
- codelens/db/__init__.py +1 -0
- codelens/db/database.py +57 -0
- codelens/main.py +111 -0
- codelens/models/__init__.py +14 -0
- codelens/models/assignments.py +105 -0
- codelens/models/reports.py +172 -0
- codelens/models/rubrics.py +76 -0
- codelens/services/__init__.py +37 -0
- codelens/services/batch_processor.py +508 -0
- codelens/services/code_executor.py +310 -0
- codelens/services/sandbox.py +375 -0
- codelens/services/similarity_service.py +449 -0
- codelens/utils/__init__.py +29 -0
- codelens/utils/helpers.py +217 -0
|
@@ -0,0 +1,512 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Similarity analysis for detecting potential plagiarism in code submissions
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import difflib
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import structlog
|
|
12
|
+
|
|
13
|
+
logger = structlog.get_logger()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SimilarityMethod(Enum):
    """Names of the comparison strategies a caller can request.

    The member values are plain strings, so a method can be looked up from
    its serialized form with ``SimilarityMethod("token_based")``.
    """

    # Compare features collected from the parsed syntax trees.
    AST_STRUCTURAL = "ast_structural"
    # Compare the sequences of lexical tokens.
    TOKEN_BASED = "token_based"
    # Compare the sequences of stripped, non-blank source lines.
    LINE_BASED = "line_based"
    # Compare per-function name / argument / size summaries.
    FUNCTION_SIGNATURE = "function_signature"
    # Compare identifier usage patterns.
    VARIABLE_PATTERN = "variable_pattern"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class SimilarityMatch:
    """A single finding reported by one similarity method."""

    # Method that produced this finding.
    method: SimilarityMethod
    # Similarity measured by that method, from 0.0 to 1.0.
    score: float
    # How much weight the finding deserves, from 0.0 to 1.0.
    confidence: float
    # Method-specific evidence (for example the shared lines or tokens).
    matched_sections: dict[str, Any]
    # Human-readable summary of the finding.
    explanation: str
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class SimilarityResult:
    """Outcome of comparing two code submissions."""

    # Combined similarity across the methods that reported, 0.0 to 1.0.
    overall_score: float
    # Individual findings, one per method that reported a match.
    matches: list[SimilarityMatch]
    # True when the pair should be surfaced as suspicious.
    flagged: bool
    # Threshold that was in force when ``flagged`` was decided.
    threshold_used: float
    # Methods that were requested for this comparison.
    methods_used: list[SimilarityMethod]

    # Per-method breakdown; a value stays at 0.0 when that method did not
    # report a match.
    structural_similarity: float = 0.0
    token_similarity: float = 0.0
    line_similarity: float = 0.0
    function_similarity: float = 0.0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class PythonSimilarityAnalyzer:
    """Similarity analyzer specifically for Python code.

    Every requested method yields at most one SimilarityMatch. The scores of
    the matches that were produced are folded into one weighted overall
    score, and the pair is flagged when that score reaches ``threshold``.
    """

    # Keywords dropped by _tokenize_code (matched case-insensitively) because
    # they appear in nearly every program and would inflate token similarity.
    _IGNORED_TOKENS = frozenset(
        {'def', 'class', 'if', 'else', 'for', 'while', 'import', 'return'}
    )

    # Relative weight of each AST feature in _compare_ast_features.
    _AST_FEATURE_WEIGHTS = {
        'node_types': 0.3,
        'function_names': 0.2,
        'class_names': 0.15,
        'control_structures': 0.15,
        'variable_names': 0.1,
        'import_modules': 0.1,
    }

    def __init__(self, threshold: float = 0.8):
        """Store the overall-score threshold at which a pair is flagged."""
        self.threshold = threshold

    def analyze_similarity(
        self,
        code1: str,
        code2: str,
        methods: list[SimilarityMethod] | None = None
    ) -> SimilarityResult:
        """
        Analyze similarity between two Python code submissions

        Args:
            code1: First code submission
            code2: Second code submission
            methods: Similarity methods to use (default: all)

        Returns:
            SimilarityResult with detailed similarity analysis. Any
            unexpected failure is logged and reported as an unflagged
            result with a score of 0.0 rather than raised.
        """
        if methods is None:
            methods = list(SimilarityMethod)

        matches: list[SimilarityMatch] = []
        scores: dict[str, float] = {}

        try:
            # Fixed evaluation order, independent of the order of `methods`.
            # VARIABLE_PATTERN has no analysis routine and is therefore absent.
            pipeline = (
                (SimilarityMethod.AST_STRUCTURAL, 'structural', self._analyze_ast_similarity),
                (SimilarityMethod.TOKEN_BASED, 'token', self._analyze_token_similarity),
                (SimilarityMethod.LINE_BASED, 'line', self._analyze_line_similarity),
                (SimilarityMethod.FUNCTION_SIGNATURE, 'function', self._analyze_function_similarity),
            )

            # A blank submission carries nothing to compare. Without this
            # guard two blank inputs match each other perfectly on the
            # token and line methods.
            if code1.strip() and code2.strip():
                for method, score_key, analyze in pipeline:
                    if method not in methods:
                        continue
                    match = analyze(code1, code2)
                    if match is not None:
                        matches.append(match)
                        scores[score_key] = match.score

            # Calculate overall score (weighted average)
            overall_score = self._calculate_overall_score(scores)

            result = SimilarityResult(
                overall_score=overall_score,
                matches=matches,
                flagged=overall_score >= self.threshold,
                threshold_used=self.threshold,
                methods_used=methods,
                structural_similarity=scores.get('structural', 0.0),
                token_similarity=scores.get('token', 0.0),
                line_similarity=scores.get('line', 0.0),
                function_similarity=scores.get('function', 0.0)
            )

            logger.info("Similarity analysis completed",
                        overall_score=overall_score,
                        flagged=result.flagged,
                        matches_count=len(matches))

            return result

        except Exception as e:
            logger.error("Similarity analysis failed", error=str(e))
            return self._empty_result(methods)

    def _empty_result(self, methods: list[SimilarityMethod]) -> SimilarityResult:
        """Build the unflagged, zero-score result used when analysis fails."""
        return SimilarityResult(
            overall_score=0.0,
            matches=[],
            flagged=False,
            threshold_used=self.threshold,
            methods_used=methods
        )

    def _analyze_ast_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze structural similarity using AST comparison.

        Returns None when either input cannot be parsed, when the comparison
        fails, or when the similarity does not exceed 0.1.
        """
        try:
            tree1 = ast.parse(code1)
            tree2 = ast.parse(code2)

            # Extract structural features
            features1 = self._extract_ast_features(tree1)
            features2 = self._extract_ast_features(tree2)

            # Compare structures
            similarity = self._compare_ast_features(features1, features2)

            if similarity > 0.1:  # Only report significant similarities
                return SimilarityMatch(
                    method=SimilarityMethod.AST_STRUCTURAL,
                    score=similarity,
                    confidence=0.9,  # AST comparison is highly reliable
                    matched_sections={
                        "common_patterns": self._find_common_ast_patterns(features1, features2)
                    },
                    explanation=f"Structural similarity: {similarity:.2f} based on AST analysis"
                )

        except SyntaxError:
            logger.warning("Syntax error in code - skipping AST analysis")
        except Exception as e:
            logger.error("AST similarity analysis failed", error=str(e))

        return None

    def _extract_ast_features(self, tree: ast.AST) -> dict[str, Any]:
        """Extract structural features from AST.

        Returns a dict of lists keyed by feature name. ``nesting_pattern``
        is always present but is never populated by this method.
        """
        node_types: list[str] = []
        function_names: list[str] = []
        class_names: list[str] = []
        control_structures: list[str] = []
        variable_names: set[str] = set()
        import_modules: list[str] = []

        for node in ast.walk(tree):
            node_types.append(type(node).__name__)

            # `async def` is a separate node type; count it as a function too.
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                function_names.append(node.name)
            elif isinstance(node, ast.ClassDef):
                class_names.append(node.name)
            elif isinstance(node, (ast.If, ast.While, ast.For)):
                control_structures.append(type(node).__name__)
            elif isinstance(node, ast.Name):
                variable_names.add(node.id)
            elif isinstance(node, ast.Import):
                import_modules.extend(alias.name for alias in node.names)
            elif isinstance(node, ast.ImportFrom):
                # node.module is None for `from . import x`
                if node.module:
                    import_modules.append(node.module)

        return {
            'node_types': node_types,
            'function_names': function_names,
            'class_names': class_names,
            'control_structures': control_structures,
            'nesting_pattern': [],
            # Sorted so the output does not depend on set iteration order.
            'variable_names': sorted(variable_names),
            'import_modules': import_modules,
        }

    def _compare_ast_features(self, features1: dict[str, Any], features2: dict[str, Any]) -> float:
        """Compare AST features and return a similarity score in [0.0, 1.0].

        Each weighted feature is compared with Jaccard similarity over its
        distinct values. A feature that is empty on both sides is skipped and
        the remaining weights are renormalized, so that two identical programs
        score 1.0 even when neither defines, for example, any class.
        Returns 0.0 when no feature has any value on either side.
        """
        weighted_total = 0.0
        compared_weight = 0.0

        for feature, weight in self._AST_FEATURE_WEIGHTS.items():
            values1 = set(features1.get(feature, []))
            values2 = set(features2.get(feature, []))
            union = values1 | values2
            if not union:
                # Nothing to compare: neither evidence for nor against.
                continue

            weighted_total += weight * len(values1 & values2) / len(union)
            compared_weight += weight

        return weighted_total / compared_weight if compared_weight > 0 else 0.0

    def _find_common_ast_patterns(self, features1: dict[str, Any], features2: dict[str, Any]) -> dict[str, list]:
        """Find function names, class names and control structures shared by both sides.

        Feature types with nothing in common are omitted from the result.
        """
        common_patterns = {}

        for feature_type in ('function_names', 'class_names', 'control_structures'):
            shared = set(features1.get(feature_type, [])) & set(features2.get(feature_type, []))
            if shared:
                # Sorted for a stable, reproducible report.
                common_patterns[feature_type] = sorted(shared)

        return common_patterns

    def _analyze_token_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze similarity based on code tokens.

        Returns None when the ratio does not exceed 0.3 or on failure.
        """
        try:
            # Tokenize code (simplified - split by whitespace and symbols)
            tokens1 = self._tokenize_code(code1)
            tokens2 = self._tokenize_code(code2)

            # Calculate token similarity using sequence matching
            matcher = difflib.SequenceMatcher(None, tokens1, tokens2)
            similarity = matcher.ratio()

            if similarity > 0.3:  # Only report significant token similarities
                # Reuse the matcher so the alignment is not computed twice.
                common_tokens = self._find_common_token_sequences(tokens1, tokens2, matcher)

                return SimilarityMatch(
                    method=SimilarityMethod.TOKEN_BASED,
                    score=similarity,
                    confidence=0.7,
                    matched_sections={
                        "common_tokens": common_tokens[:10],  # First 10 common sequences
                        "total_common_tokens": len(common_tokens)
                    },
                    explanation=f"Token similarity: {similarity:.2f} based on code token analysis"
                )

        except Exception as e:
            logger.error("Token similarity analysis failed", error=str(e))

        return None

    def _tokenize_code(self, code: str) -> list[str]:
        """Split code into word and single-symbol tokens, minus common keywords."""
        import re

        # A token is a run of word characters or one non-space symbol.
        tokens = re.findall(r'\w+|[^\w\s]', code)
        return [token for token in tokens if token.lower() not in self._IGNORED_TOKENS]

    def _find_common_token_sequences(
        self,
        tokens1: list[str],
        tokens2: list[str],
        matcher: difflib.SequenceMatcher | None = None
    ) -> list[str]:
        """Find runs of three or more tokens shared by both token lists.

        Args:
            tokens1: Tokens of the first submission
            tokens2: Tokens of the second submission
            matcher: Optional SequenceMatcher already built over
                (tokens1, tokens2); a new one is created when omitted

        Returns:
            Each shared run, joined with single spaces, in order of
            appearance in tokens1
        """
        if matcher is None:
            matcher = difflib.SequenceMatcher(None, tokens1, tokens2)

        return [
            ' '.join(tokens1[block.a:block.a + block.size])
            for block in matcher.get_matching_blocks()
            if block.size > 2  # Only sequences of 3+ tokens
        ]

    def _analyze_line_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze similarity based on code lines.

        Lines are stripped and blank lines ignored before matching. Returns
        None when the ratio does not exceed 0.4 or on failure.
        """
        try:
            lines1 = [line.strip() for line in code1.split('\n') if line.strip()]
            lines2 = [line.strip() for line in code2.split('\n') if line.strip()]

            # Calculate line-based similarity
            matcher = difflib.SequenceMatcher(None, lines1, lines2)
            similarity = matcher.ratio()

            if similarity > 0.4:  # Only report significant line similarities
                common_lines: list[str] = []
                for block in matcher.get_matching_blocks():
                    if block.size > 1:  # Sequences of 2+ lines
                        common_lines.extend(lines1[block.a:block.a + block.size])

                return SimilarityMatch(
                    method=SimilarityMethod.LINE_BASED,
                    score=similarity,
                    confidence=0.6,
                    matched_sections={
                        "common_lines": common_lines[:5],  # First 5 common lines
                        "total_common_lines": len(common_lines)
                    },
                    explanation=f"Line similarity: {similarity:.2f} based on exact line matching"
                )

        except Exception as e:
            logger.error("Line similarity analysis failed", error=str(e))

        return None

    def _analyze_function_similarity(self, code1: str, code2: str) -> SimilarityMatch | None:
        """Analyze similarity based on function signatures and structure.

        Every function of code1 is compared with every function of code2;
        pairs scoring above 0.7 are kept and the match score is their mean.
        Returns None when either side has no functions, when no pair
        qualifies, or on failure.
        """
        try:
            functions1 = self._extract_functions(code1)
            functions2 = self._extract_functions(code2)

            if not functions1 or not functions2:
                return None

            # Compare function signatures
            common_functions = []
            for func1 in functions1:
                for func2 in functions2:
                    similarity = self._compare_functions(func1, func2)
                    if similarity > 0.7:
                        common_functions.append({
                            'name1': func1['name'],
                            'name2': func2['name'],
                            'similarity': similarity
                        })

            if common_functions:
                avg_similarity = sum(f['similarity'] for f in common_functions) / len(common_functions)

                return SimilarityMatch(
                    method=SimilarityMethod.FUNCTION_SIGNATURE,
                    score=avg_similarity,
                    confidence=0.8,
                    matched_sections={
                        "common_functions": common_functions,
                        "total_functions": len(common_functions)
                    },
                    explanation=f"Function similarity: {avg_similarity:.2f} based on function structure"
                )

        except Exception as e:
            logger.error("Function similarity analysis failed", error=str(e))

        return None

    def _extract_functions(self, code: str) -> list[dict[str, Any]]:
        """Extract function information from code.

        Returns one dict per ``def`` / ``async def`` found anywhere in the
        code, with keys ``name``, ``args`` (positional parameter names),
        ``body_length`` (number of top-level statements in the body) and
        ``returns`` (whether a ``return`` appears anywhere inside, nested
        definitions included). Returns an empty list for unparsable code.
        """
        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError):
            # ValueError: source containing null bytes on Python < 3.12.
            return []

        return [
            {
                'name': node.name,
                'args': [arg.arg for arg in node.args.args],
                'body_length': len(node.body),
                'returns': any(isinstance(n, ast.Return) for n in ast.walk(node))
            }
            for node in ast.walk(tree)
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]

    def _compare_functions(self, func1: dict[str, Any], func2: dict[str, Any]) -> float:
        """Compare two function summaries produced by _extract_functions.

        The score is a weighted sum: name similarity (0.3), shared argument
        names (0.4), body length closeness (0.2) and whether both do or do
        not contain a return (0.1).
        """
        # Name similarity (less weight if names are obviously different)
        name_sim: float = difflib.SequenceMatcher(None, func1['name'], func2['name']).ratio()

        # Argument similarity. Two parameterless functions have identical
        # argument lists, so they score 1.0 rather than 0/1.
        args1, args2 = set(func1['args']), set(func2['args'])
        if args1 or args2:
            arg_sim = len(args1 & args2) / max(len(args1), len(args2))
        else:
            arg_sim = 1.0

        # Body length similarity
        len1, len2 = func1['body_length'], func2['body_length']
        len_sim = 1 - abs(len1 - len2) / max(len1, len2, 1)

        # Return statement similarity
        ret_sim = 1.0 if func1['returns'] == func2['returns'] else 0.0

        return name_sim * 0.3 + arg_sim * 0.4 + len_sim * 0.2 + ret_sim * 0.1

    def _calculate_overall_score(self, scores: dict[str, float]) -> float:
        """Calculate weighted overall similarity score.

        Only the score types present in ``scores`` take part, and their
        weights are renormalized to sum to one. Unknown score types get a
        weight of 0.1. Returns 0.0 for an empty mapping.

        NOTE(review): a method that ran but reported no match is absent from
        ``scores`` and so does not pull the average down - confirm this is
        the intended policy.
        """
        if not scores:
            return 0.0

        weights = {
            'structural': 0.4,
            'token': 0.3,
            'line': 0.2,
            'function': 0.1
        }

        weighted_sum = 0.0
        total_weight = 0.0

        for score_type, score in scores.items():
            weight = weights.get(score_type, 0.1)
            weighted_sum += score * weight
            total_weight += weight

        return weighted_sum / total_weight if total_weight > 0 else 0.0
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
class SimilarityDetector:
    """Main similarity detection service.

    Dispatches a pair of submissions to the analyzer for their language.
    Only Python has an analyzer; any other language yields an unflagged
    result with a score of 0.0.
    """

    def __init__(self, threshold: float = 0.8):
        """Create the detector and its Python analyzer with a shared threshold."""
        self.threshold = threshold
        self.python_analyzer = PythonSimilarityAnalyzer(threshold)

    def compare_submissions(
        self,
        submission1: dict[str, Any],
        submission2: dict[str, Any],
        methods: list[SimilarityMethod] | None = None
    ) -> SimilarityResult:
        """
        Compare two code submissions for similarity

        Args:
            submission1: First submission with 'code' and 'language'
            submission2: Second submission with 'code' and 'language'
            methods: Similarity methods to use

        Returns:
            SimilarityResult with similarity analysis

        The language is taken from submission1 only and defaults to Python
        when the key is missing or None. Missing or None code is treated as
        an empty string.
        """
        # dict.get's default only covers a missing key; an explicit None
        # would otherwise crash on .lower().
        language = submission1.get('language')
        language = 'python' if language is None else language.lower()

        if language != 'python':
            logger.warning("Similarity analysis not implemented for language", language=language)
            return SimilarityResult(
                overall_score=0.0,
                matches=[],
                flagged=False,
                threshold_used=self.threshold,
                methods_used=methods or []
            )

        code1 = submission1.get('code') or ''
        code2 = submission2.get('code') or ''
        return self.python_analyzer.analyze_similarity(code1, code2, methods)

    def batch_similarity_check(
        self,
        submissions: list[dict[str, Any]],
        methods: list[SimilarityMethod] | None = None,
        report_threshold: float = 0.3
    ) -> list[tuple[int, int, SimilarityResult]]:
        """
        Perform pairwise similarity checks on a batch of submissions

        Args:
            submissions: List of submissions to compare
            methods: Similarity methods to use
            report_threshold: Unflagged pairs are still reported when their
                overall score is strictly above this value

        Returns:
            List of (index1, index2, SimilarityResult), with index1 < index2,
            for every pair that is flagged or scores above report_threshold
        """
        from itertools import combinations

        results = []
        total_comparisons = 0

        # combinations() yields pairs in the same (i, j) order as a nested
        # `for i ... for j in range(i + 1, ...)` loop.
        for (i, first), (j, second) in combinations(enumerate(submissions), 2):
            similarity = self.compare_submissions(first, second, methods)
            total_comparisons += 1

            if similarity.flagged or similarity.overall_score > report_threshold:
                results.append((i, j, similarity))

        # `flagged_pairs` counts every reported pair, including moderate
        # similarities that are not flagged.
        logger.info("Batch similarity check completed",
                    total_comparisons=total_comparisons,
                    flagged_pairs=len(results))

        return results
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
# Shared module-level SimilarityDetector, built with its default threshold
|
|
512
|
+
similarity_detector = SimilarityDetector()
|
codelens/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""API routes and endpoints"""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""API route modules"""
|