code-analyser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,449 @@
1
+ """
2
+ Similarity analysis service for detecting plagiarism across submissions
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
+ import structlog
9
+ from sqlalchemy import and_, select
10
+ from sqlalchemy.ext.asyncio import AsyncSession
11
+
12
+ from codelens.analyzers import SimilarityMethod, SimilarityResult, similarity_detector
13
+ from codelens.core.config import settings
14
+ from codelens.models import AnalysisReport
15
+ from codelens.models import SimilarityMatch as SimilarityMatchModel
16
+
17
+ logger = structlog.get_logger()
18
+
19
+
20
class SimilarityService:
    """Service for managing similarity analysis and plagiarism detection.

    Single-submission checks (``check_submission_similarity``) are a
    placeholder: only a file hash of each stored report is compared, via
    ``_simulate_similarity_check``. Batch checks delegate to
    ``self.detector.batch_similarity_check``.
    """

    # A single-submission score must exceed this value to be reported and
    # persisted; anything at or below it is treated as noise.
    MIN_REPORTED_SCORE = 0.3

    def __init__(self) -> None:
        """Read detector, enabled flag, threshold and methods from settings."""
        self.detector = similarity_detector
        self.enabled = settings.similarity.enabled
        self.threshold = settings.similarity.threshold
        self.methods = [
            SimilarityMethod(method) for method in settings.similarity.methods
        ]

    async def check_submission_similarity(
        self,
        submission_code: str,
        submission_id: str,
        assignment_id: int,
        language: str,
        db: AsyncSession,
        student_id: str | None = None
    ) -> dict[str, Any]:
        """
        Check similarity of a new submission against existing submissions

        Args:
            submission_code: Code content to check
            submission_id: Unique submission identifier
            assignment_id: Assignment ID
            language: Programming language (lower-cased before querying)
            db: Database session
            student_id: Optional student ID to exclude from comparison

        Returns:
            Dictionary with similarity analysis results. On any failure the
            exception is logged and a non-flagged result carrying an
            ``"error"`` message is returned instead of raising.
        """
        if not self.enabled:
            logger.info("Similarity checking disabled")
            return {
                "enabled": False,
                "matches": [],
                "highest_similarity": 0.0,
                "flagged": False
            }

        try:
            # Existing reports for the same assignment and language,
            # excluding the submission being checked.
            query = select(AnalysisReport).where(
                and_(
                    AnalysisReport.assignment_id == assignment_id,
                    AnalysisReport.language == language.lower(),
                    AnalysisReport.submission_id != submission_id
                )
            )

            # Exclude same student's previous submissions if student_id provided
            if student_id:
                query = query.where(AnalysisReport.student_id != student_id)

            result = await db.execute(query)
            existing_reports = result.scalars().all()

            logger.info("Checking similarity against existing submissions",
                        submission_id=submission_id,
                        existing_count=len(existing_reports))

            if not existing_reports:
                return {
                    "enabled": True,
                    "matches": [],
                    "highest_similarity": 0.0,
                    "flagged": False,
                    "comparison_count": 0
                }

            similarity_matches = []
            # (matched report id, result) pairs to persist after the loop.
            pending: list[tuple[Any, SimilarityResult]] = []
            highest_similarity = 0.0

            for report in existing_reports:
                # The original code is not stored, so the stored file hash is
                # used as a proxy and the score is simulated.
                simulated_similarity = self._simulate_similarity_check(
                    submission_code,
                    report.file_hash,
                    language
                )

                if simulated_similarity.overall_score > self.MIN_REPORTED_SCORE:
                    similarity_matches.append({
                        "report_id": report.id,
                        "matched_submission_id": report.submission_id,
                        "student_id": report.student_id,
                        "similarity_score": simulated_similarity.overall_score,
                        "methods_used": [m.value for m in simulated_similarity.methods_used],
                        "flagged": simulated_similarity.flagged
                    })

                    highest_similarity = max(highest_similarity, simulated_similarity.overall_score)
                    pending.append((report.id, simulated_similarity))

            # Persist after the loop, in one transaction. Committing inside
            # the loop would expire the loaded reports on a session using the
            # default expire_on_commit=True, and reading their attributes
            # afterwards would need implicit IO.
            if pending:
                await self._store_similarity_matches(db, submission_id, pending)

            # Check if flagged based on highest similarity
            flagged = highest_similarity >= self.threshold

            result_data = {
                "enabled": True,
                "matches": similarity_matches,
                "highest_similarity": highest_similarity,
                "flagged": flagged,
                "comparison_count": len(existing_reports),
                "threshold_used": self.threshold,
                "methods_used": [m.value for m in self.methods]
            }

            logger.info("Similarity check completed",
                        submission_id=submission_id,
                        matches_found=len(similarity_matches),
                        highest_similarity=highest_similarity,
                        flagged=flagged)

            return result_data

        except Exception as e:
            logger.error("Similarity check failed",
                         submission_id=submission_id,
                         error=str(e))
            return {
                "enabled": True,
                "matches": [],
                "highest_similarity": 0.0,
                "flagged": False,
                "error": f"Similarity check failed: {str(e)}"
            }

    async def batch_similarity_analysis(
        self,
        submissions: list[dict[str, Any]],
        assignment_id: int,
        db: AsyncSession
    ) -> list[dict[str, Any]]:
        """
        Perform batch similarity analysis on multiple submissions

        Args:
            submissions: List of submission dictionaries
            assignment_id: Assignment ID (used for logging only)
            db: Database session

        Returns:
            One dictionary per pair returned by the detector. An empty list
            is returned when the service is disabled, fewer than two
            submissions are given, or the analysis fails (failure is logged).
        """
        if not self.enabled or len(submissions) < 2:
            return []

        try:
            logger.info("Starting batch similarity analysis",
                        assignment_id=assignment_id,
                        submission_count=len(submissions))

            # The detector yields (index, index, result) triples that refer
            # back into ``submissions``.
            flagged_pairs = self.detector.batch_similarity_check(
                submissions, self.methods
            )

            batch_results = []

            for i, j, similarity_result in flagged_pairs:
                sub1, sub2 = submissions[i], submissions[j]

                result_data = {
                    "submission1_id": sub1.get("submission_id"),
                    "submission2_id": sub2.get("submission_id"),
                    "student1_id": sub1.get("student_id"),
                    "student2_id": sub2.get("student_id"),
                    "similarity_score": similarity_result.overall_score,
                    "flagged": similarity_result.flagged,
                    "methods_used": [m.value for m in similarity_result.methods_used],
                    "matches": [
                        {
                            "method": match.method.value,
                            "score": match.score,
                            "confidence": match.confidence,
                            "explanation": match.explanation
                        }
                        for match in similarity_result.matches
                    ]
                }

                batch_results.append(result_data)

                # Store in database if both submissions have IDs
                if sub1.get("report_id") and sub2.get("report_id"):
                    await self._store_batch_similarity_match(
                        db, sub1["report_id"], sub2["report_id"], similarity_result
                    )

            logger.info("Batch similarity analysis completed",
                        flagged_pairs=len(batch_results))

            return batch_results

        except Exception as e:
            logger.error("Batch similarity analysis failed", error=str(e))
            return []

    def _simulate_similarity_check(
        self,
        code: str,
        existing_hash: str,
        language: str
    ) -> SimilarityResult:
        """
        Simulate similarity checking (placeholder implementation)

        Identical hashes score 1.0. Otherwise the score is the Jaccard
        overlap of the two hashes' character sets scaled by 0.3, which is
        not a meaningful similarity measure. A missing ``existing_hash``
        scores 0.0 instead of raising. ``language`` is currently unused.

        Returns:
            SimilarityResult flagged when the score reaches ``self.threshold``.
        """
        # Imported locally, as in the original; presumably to avoid an
        # import cycle — TODO confirm.
        from codelens.analyzers.similarity_analyzer import (
            SimilarityMatch,
            SimilarityResult,
        )
        from codelens.utils import calculate_file_hash

        current_hash = calculate_file_hash(code)

        if not existing_hash:
            # Nothing to compare against (e.g. a report stored without a hash).
            similarity_score = 0.0
        elif current_hash == existing_hash:
            # Identical files
            similarity_score = 1.0
        else:
            current_chars = set(current_hash)
            existing_chars = set(existing_hash)
            hash_similarity = (
                len(current_chars & existing_chars)
                / len(current_chars | existing_chars)
            )
            # Scale down since hash similarity is not meaningful
            similarity_score = hash_similarity * 0.3

        matches = []
        if similarity_score > 0.5:
            matches.append(SimilarityMatch(
                method=SimilarityMethod.TOKEN_BASED,
                score=similarity_score,
                confidence=0.6,
                matched_sections={"simulated": True},
                explanation=f"Simulated similarity: {similarity_score:.2f}"
            ))

        return SimilarityResult(
            overall_score=similarity_score,
            matches=matches,
            flagged=similarity_score >= self.threshold,
            threshold_used=self.threshold,
            methods_used=self.methods
        )

    @staticmethod
    def _build_match_record(
        report_id: Any,
        matched_report_id: Any,
        similarity_result: SimilarityResult,
        similarity_method: str,
        extra_sections: dict[str, Any],
        default_confidence: float
    ) -> SimilarityMatchModel:
        """Build one SimilarityMatch row; confidence is the best per-method value."""
        return SimilarityMatchModel(
            report_id=report_id,
            matched_report_id=matched_report_id,
            similarity_score=similarity_result.overall_score,
            similarity_method=similarity_method,
            matched_sections={
                "methods": [m.method.value for m in similarity_result.matches],
                **extra_sections
            },
            confidence=max(
                (m.confidence for m in similarity_result.matches),
                default=default_confidence
            ),
            flagged=similarity_result.flagged
        )

    async def _store_similarity_matches(
        self,
        db: AsyncSession,
        submission_id: str,
        pending: list[tuple[Any, SimilarityResult]]
    ) -> None:
        """Persist several matches for one submission in a single commit.

        Args:
            db: Database session
            submission_id: Submission whose report owns the matches
            pending: (matched report id, similarity result) pairs

        Failures are logged and rolled back; nothing is raised.
        """
        try:
            # Get report ID for current submission (once, not per match)
            current_report_query = select(AnalysisReport).where(
                AnalysisReport.submission_id == submission_id
            )
            current_report_result = await db.execute(current_report_query)
            current_report = current_report_result.scalar_one_or_none()

            if not current_report:
                logger.warning("Could not find report for similarity storage",
                               submission_id=submission_id)
                return

            # Read the id before committing so it is a plain value afterwards.
            current_report_id = current_report.id

            for matched_report_id, similarity_result in pending:
                db.add(self._build_match_record(
                    current_report_id,
                    matched_report_id,
                    similarity_result,
                    "combined",  # Multiple methods combined
                    {"scores": {m.method.value: m.score for m in similarity_result.matches}},
                    0.5
                ))

            await db.commit()

            for matched_report_id, similarity_result in pending:
                logger.debug("Stored similarity match",
                             report_id=current_report_id,
                             matched_report_id=matched_report_id,
                             score=similarity_result.overall_score)

        except Exception as e:
            logger.error("Failed to store similarity match", error=str(e))
            await db.rollback()

    async def _store_similarity_match(
        self,
        db: AsyncSession,
        submission_id: str,
        matched_report: Any,
        similarity_result: SimilarityResult
    ) -> None:
        """Store a single similarity match in database (never raises)."""
        try:
            # Captured up front: after a commit this attribute may be expired.
            matched_report_id = matched_report.id
        except Exception as e:
            logger.error("Failed to store similarity match", error=str(e))
            await db.rollback()
            return

        await self._store_similarity_matches(
            db, submission_id, [(matched_report_id, similarity_result)]
        )

    async def _store_batch_similarity_match(
        self,
        db: AsyncSession,
        report1_id: int,
        report2_id: int,
        similarity_result: SimilarityResult
    ) -> None:
        """Store batch similarity match in database, once per direction.

        Failures are logged and rolled back; nothing is raised.
        """
        try:
            # Create similarity match records (bidirectional)
            for source_id, target_id in ((report1_id, report2_id), (report2_id, report1_id)):
                db.add(self._build_match_record(
                    source_id,
                    target_id,
                    similarity_result,
                    "batch_analysis",
                    {"batch_analysis": True},
                    0.7
                ))

            await db.commit()

        except Exception as e:
            logger.error("Failed to store batch similarity match", error=str(e))
            await db.rollback()

    async def get_submission_similarities(
        self,
        report_id: int,
        db: AsyncSession
    ) -> list[dict[str, Any]]:
        """Get all similarity matches for a specific report.

        Returns an empty list when the query fails (the error is logged).
        """
        try:
            query = select(SimilarityMatchModel).where(
                SimilarityMatchModel.report_id == report_id
            )
            result = await db.execute(query)
            matches = result.scalars().all()

            return [
                {
                    "id": match.id,
                    "matched_report_id": match.matched_report_id,
                    "similarity_score": match.similarity_score,
                    "similarity_method": match.similarity_method,
                    "confidence": match.confidence,
                    "flagged": match.flagged,
                    "reviewed": match.reviewed,
                    "detected_at": match.detected_at
                }
                for match in matches
            ]

        except Exception as e:
            logger.error("Failed to get submission similarities",
                         report_id=report_id, error=str(e))
            return []

    async def review_similarity_match(
        self,
        match_id: int,
        decision: str,
        reviewer_notes: str | None,
        db: AsyncSession
    ) -> bool:
        """Review and mark a similarity match.

        Returns:
            True when the match was found and updated; False when it does
            not exist or the update failed (failure is logged, rolled back).
        """
        try:
            query = select(SimilarityMatchModel).where(
                SimilarityMatchModel.id == match_id
            )
            result = await db.execute(query)
            match = result.scalar_one_or_none()

            if not match:
                return False

            match.reviewed = True
            match.review_decision = decision
            match.reviewer_notes = reviewer_notes
            # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
            # kept because the column's timezone handling is not visible here.
            match.reviewed_at = datetime.utcnow()

            await db.commit()

            logger.info("Similarity match reviewed",
                        match_id=match_id,
                        decision=decision)

            return True

        except Exception as e:
            logger.error("Failed to review similarity match",
                         match_id=match_id, error=str(e))
            await db.rollback()
            return False
446
+
447
+
448
+ # Global similarity service instance
449
+ similarity_service = SimilarityService()
@@ -0,0 +1,29 @@
1
+ """Utility functions"""
2
+
3
+ from .helpers import (
4
+ calculate_file_hash,
5
+ calculate_grade_letter,
6
+ detect_language_from_extension,
7
+ extract_classes_from_python,
8
+ extract_functions_from_python,
9
+ format_file_size,
10
+ generate_submission_id,
11
+ is_supported_file_type,
12
+ parse_batch_files,
13
+ sanitize_code_for_display,
14
+ validate_student_id,
15
+ )
16
+
17
+ __all__ = [
18
+ "generate_submission_id",
19
+ "calculate_file_hash",
20
+ "detect_language_from_extension",
21
+ "is_supported_file_type",
22
+ "format_file_size",
23
+ "extract_functions_from_python",
24
+ "extract_classes_from_python",
25
+ "sanitize_code_for_display",
26
+ "calculate_grade_letter",
27
+ "validate_student_id",
28
+ "parse_batch_files"
29
+ ]
@@ -0,0 +1,217 @@
1
+ """
2
+ Utility functions and helpers
3
+ """
4
+
5
+ import hashlib
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
def generate_submission_id() -> str:
    """Return a new random (version 4) UUID rendered as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
14
+
15
+
16
def calculate_file_hash(content: str) -> str:
    """Return the hex SHA-256 digest of ``content`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
19
+
20
+
21
+ def detect_language_from_extension(filename: str) -> str | None:
22
+ """Detect programming language from file extension"""
23
+ extension_map = {
24
+ '.py': 'python',
25
+ '.js': 'javascript',
26
+ '.html': 'html',
27
+ '.htm': 'html',
28
+ '.css': 'css',
29
+ '.java': 'java',
30
+ '.cpp': 'cpp',
31
+ '.c': 'c',
32
+ '.cs': 'csharp',
33
+ '.php': 'php',
34
+ '.rb': 'ruby',
35
+ '.go': 'go',
36
+ '.rs': 'rust',
37
+ '.ts': 'typescript',
38
+ '.jsx': 'javascript',
39
+ '.tsx': 'typescript'
40
+ }
41
+
42
+ ext = Path(filename).suffix.lower()
43
+ return extension_map.get(ext)
44
+
45
+
46
def is_supported_file_type(filename: str) -> bool:
    """Return True when the filename's extension maps to a known language."""
    detected_language = detect_language_from_extension(filename)
    return detected_language is not None
49
+
50
+
51
def format_file_size(size_bytes: int) -> str:
    """Render a byte count with one decimal in B, KB, MB or GB.

    Zero is rendered as "0 B"; values beyond the GB range stay in GB.
    """
    if size_bytes == 0:
        return "0 B"

    amount = float(size_bytes)
    unit = "B"
    # Step up one unit at a time while the value still fills the next unit.
    for larger_unit in ("KB", "MB", "GB"):
        if amount < 1024:
            break
        amount /= 1024.0
        unit = larger_unit

    return f"{amount:.1f} {unit}"
64
+
65
+
66
def extract_functions_from_python(code: str) -> list[dict[str, Any]]:
    """Extract function information from Python code.

    Every ``def`` and ``async def`` in the module is reported, including
    nested functions and methods, with its name, line number, plain
    positional argument names, and whether its body starts with a string
    literal (a docstring).

    Returns:
        A list of dictionaries, or an empty list when the code cannot be
        parsed.
    """
    import ast

    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError, RecursionError):
        # Submitted code is untrusted: besides syntax errors, ast.parse can
        # reject null bytes with ValueError on some Python versions and can
        # hit the recursion limit on very deeply nested input.
        return []

    functions = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        first_statement = node.body[0] if node.body else None
        functions.append({
            'name': node.name,
            'line': node.lineno,
            'args': [arg.arg for arg in node.args.args],
            'has_docstring': (
                isinstance(first_statement, ast.Expr) and
                isinstance(first_statement.value, ast.Constant) and
                isinstance(first_statement.value.value, str)
            )
        })

    return functions
91
+
92
+
93
def extract_classes_from_python(code: str) -> list[dict[str, Any]]:
    """Extract class information from Python code.

    Every class in the module (including nested ones) is reported with its
    name, line number, directly defined methods (``def`` and ``async def``)
    and base classes. Only bases written as a bare name are listed; dotted
    or subscripted bases such as ``abc.ABC`` are omitted.

    Returns:
        A list of dictionaries, or an empty list when the code cannot be
        parsed.
    """
    import ast

    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError, RecursionError):
        # Submitted code is untrusted: besides syntax errors, ast.parse can
        # reject null bytes with ValueError on some Python versions and can
        # hit the recursion limit on very deeply nested input.
        return []

    classes = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.ClassDef):
            continue

        methods = [
            {'name': child.name, 'line': child.lineno}
            for child in node.body
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]
        classes.append({
            'name': node.name,
            'line': node.lineno,
            'methods': methods,
            'base_classes': [base.id for base in node.bases if isinstance(base, ast.Name)]
        })

    return classes
121
+
122
+
123
def sanitize_code_for_display(code: str, max_lines: int = 100) -> str:
    """Truncate code to at most ``max_lines`` lines for display.

    When lines are dropped, a trailing marker line states how many were
    omitted. No escaping is performed; callers rendering into HTML or
    similar must escape the result themselves.

    Args:
        code: Source text, split on newline characters
        max_lines: Maximum number of source lines to keep; negative values
            are treated as 0

    Returns:
        The original text when it fits, otherwise the kept lines followed
        by the truncation marker.
    """
    lines = code.split('\n')
    total_lines = len(lines)
    # A negative limit would otherwise slice from the end of the list and
    # report a wrong "more lines" count.
    limit = max(max_lines, 0)

    if total_lines > limit:
        lines = lines[:limit]
        lines.append(f"... (truncated, {total_lines - limit} more lines)")

    return '\n'.join(lines)
132
+
133
+
134
def calculate_grade_letter(score: float) -> str:
    """Convert a numeric score to a letter grade.

    The first cutoff (checked from highest to lowest) that the score meets
    decides the grade; anything below 60 is an "F".
    """
    cutoffs = (
        (97, "A+"),
        (93, "A"),
        (90, "A-"),
        (87, "B+"),
        (83, "B"),
        (80, "B-"),
        (77, "C+"),
        (73, "C"),
        (70, "C-"),
        (67, "D+"),
        (63, "D"),
        (60, "D-"),
    )
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    return "F"
162
+
163
+
164
def validate_student_id(student_id: str) -> bool:
    """Validate student ID format (basic validation).

    After surrounding whitespace is stripped, the ID must be 3 to 50
    characters long and consist only of ASCII letters, digits, hyphens and
    underscores. Empty or missing IDs are invalid.
    """
    import re

    if not student_id:
        return False

    candidate = student_id.strip()
    if not 3 <= len(candidate) <= 50:
        return False

    return re.match(r'^[a-zA-Z0-9_-]+$', candidate) is not None
179
+
180
+
181
def parse_batch_files(files_data: list[dict[str, str]]) -> list[dict[str, Any]]:
    """Parse and validate batch file data.

    Entries lacking a ``code`` or ``path`` field, or whose path has no
    recognised extension, are skipped. Entries that raise while being
    processed are skipped too, and a warning naming the entry's index and
    the error is logged.

    Args:
        files_data: Entries with ``code`` and ``path``, and optionally
            ``student_id`` and ``student_name``

    Returns:
        One dictionary per accepted entry with its original index, code,
        path, detected language, UTF-8 size in bytes, content hash and the
        optional student fields.
    """
    import logging

    log = logging.getLogger(__name__)
    parsed_files = []

    for i, file_data in enumerate(files_data):
        try:
            # Validate required fields
            if 'code' not in file_data or 'path' not in file_data:
                continue

            code = file_data['code']
            file_path = file_data['path']

            # Detect language
            language = detect_language_from_extension(file_path)
            if not language:
                continue

            # Extract metadata
            parsed_files.append({
                'index': i,
                'code': code,
                'path': file_path,
                'language': language,
                'size': len(code.encode('utf-8')),
                'hash': calculate_file_hash(code),
                'student_id': file_data.get('student_id'),
                'student_name': file_data.get('student_name')
            })

        except Exception as exc:
            # Best effort: one malformed entry must not abort the batch, but
            # the skip is recorded so it can be diagnosed.
            log.warning("Skipping batch file %d: %s", i, exc)
            continue

    return parsed_files