code-analyser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,508 @@
1
+ """
2
+ Batch processing service for handling multiple code submissions
3
+ """
4
+
5
+ import asyncio
6
+ import re
7
+ from dataclasses import dataclass
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+
11
+ import structlog
12
+
13
+ from codelens.analyzers import analyzer_manager
14
+ from codelens.api.schemas import AnalysisRequest, AnalysisResponse
15
+ from codelens.utils import (
16
+ calculate_file_hash,
17
+ detect_language_from_extension,
18
+ generate_submission_id,
19
+ parse_batch_files,
20
+ )
21
+
22
# Module-level structlog logger used by every function and class in this file.
logger = structlog.get_logger()
23
+
24
+
25
+ @dataclass
26
+ class BatchProcessingConfig:
27
+ """Configuration for batch processing"""
28
+ parallel_processing: bool = True
29
+ max_concurrent: int = 5
30
+ skip_unsupported_files: bool = True
31
+ extract_student_info: bool = True
32
+ default_language: str = "python"
33
+
34
+ # Student ID extraction patterns
35
+ student_id_patterns: list[str] | None = None
36
+
37
+ def __post_init__(self) -> None:
38
+ if self.student_id_patterns is None:
39
+ self.student_id_patterns = [
40
+ r'(\d{6,12})', # 6-12 digit student IDs
41
+ r'([a-z]{2,3}\d{3,6})', # Letters followed by numbers (e.g., cs123456)
42
+ r'(\w+)_assignment', # Username before _assignment
43
+ r'(\w+)\.py', # Filename without extension
44
+ ]
45
+
46
+
47
@dataclass
class BatchFile:
    """One submission file in a batch, together with its derived metadata."""
    path: Path
    content: str
    language: str
    student_id: str | None = None
    student_name: str | None = None
    file_size: int = 0
    file_hash: str = ""

    def __post_init__(self) -> None:
        # Both derived fields are always recomputed from `content`, so any
        # value passed for them to the constructor is overwritten.
        utf8_bytes = self.content.encode('utf-8')
        self.file_size = len(utf8_bytes)  # size in bytes, not characters
        self.file_hash = calculate_file_hash(self.content)
61
+
62
+
63
@dataclass
class BatchProcessingResult:
    """Result of batch processing operation.

    Bundles the per-file analysis responses of one batch run with the
    aggregate counts, timing and error messages derived from them.
    """
    # True only when no file in the batch failed
    success: bool
    # Identifier generated for this batch run
    batch_id: str
    # Number of files that were submitted for processing
    total_files: int
    # Number of files whose analysis response reported success
    processed_files: int
    # total_files minus processed_files
    failed_files: int
    # One analysis response per processed file
    results: list[AnalysisResponse]
    # Elapsed time of the batch run, in seconds
    processing_time: float
    # Error messages collected from the responses, or a single batch-level message
    errors: list[str]

    # Statistics
    # Mean of the responses' total_score values; None when not computed
    average_score: float | None = None
    # Count of scores per grade-band label; None when not computed
    score_distribution: dict[str, int] | None = None
78
+
79
+
80
class BatchProcessor:
    """Service for processing multiple code submissions in batch.

    Files can come from a directory tree (`process_directory`) or from an
    in-memory list of file dictionaries (`process_files_list`).  Each file is
    sent through `analyzer_manager.analyze_code`, either concurrently
    (bounded by `config.max_concurrent`) or one after another, and the
    per-file responses are folded into a `BatchProcessingResult`.
    """

    def __init__(self, config: BatchProcessingConfig | None = None):
        """Create a processor; a default configuration is used when none is given."""
        self.config = config or BatchProcessingConfig()

    async def process_directory(
        self,
        directory_path: str,
        assignment_id: int | None = None,
        rubric_id: int | None = None,
        language: str | None = None
    ) -> BatchProcessingResult:
        """
        Process all supported files in a directory

        Args:
            directory_path: Path to directory containing student submissions
            assignment_id: Optional assignment ID
            rubric_id: Optional rubric ID for grading
            language: Optional language filter; when given, only files whose
                detected language equals it are processed

        Returns:
            BatchProcessingResult with processing results.  Failures
            (including a missing directory) are reported through
            ``success=False`` and ``errors`` rather than raised.
        """
        # The event loop clock is monotonic, so elapsed times cannot go
        # negative or jump when the wall clock is adjusted.
        clock = asyncio.get_running_loop().time
        start_time = clock()
        batch_id = generate_submission_id()

        try:
            # Discover files in directory
            files = await self._discover_files(directory_path, language)

            if not files:
                return self._failed_batch(
                    batch_id, 0.0, "No supported files found in directory"
                )

            logger.info("Starting batch processing",
                        batch_id=batch_id,
                        directory=directory_path,
                        file_count=len(files))

            # Process files
            results = await self._process_files(
                files=files,
                assignment_id=assignment_id,
                rubric_id=rubric_id
            )

            result = self._summarize_batch(
                batch_id, len(files), results, clock() - start_time
            )

            logger.info("Batch processing completed",
                        batch_id=batch_id,
                        processed=result.processed_files,
                        failed=result.failed_files,
                        processing_time=result.processing_time)

            return result

        except Exception as e:
            # Top-level boundary: callers always get a result object back.
            logger.error("Batch processing failed",
                         batch_id=batch_id,
                         error=str(e))
            return self._failed_batch(
                batch_id,
                clock() - start_time,
                f"Batch processing failed: {str(e)}"
            )

    async def process_files_list(
        self,
        files_data: list[dict[str, str]],
        assignment_id: int | None = None,
        rubric_id: int | None = None,
        language: str = "python"
    ) -> BatchProcessingResult:
        """
        Process a list of file data (code + metadata)

        Args:
            files_data: List of file dictionaries with code and metadata
            assignment_id: Optional assignment ID
            rubric_id: Optional rubric ID
            language: Programming language used for entries that do not
                carry their own ``language`` value

        Returns:
            BatchProcessingResult with processing results.  Failures are
            reported through ``success=False`` and ``errors`` rather than
            raised.
        """
        clock = asyncio.get_running_loop().time
        start_time = clock()
        batch_id = generate_submission_id()

        try:
            # Parse and validate file data
            parsed_files = parse_batch_files(files_data)

            if not parsed_files:
                return self._failed_batch(batch_id, 0.0, "No valid files provided")

            logger.info("Processing files list",
                        batch_id=batch_id,
                        file_count=len(parsed_files))

            # Convert to BatchFile objects.  An entry without a language
            # falls back to the `language` argument instead of aborting the
            # whole batch with a KeyError.
            batch_files = [
                BatchFile(
                    path=Path(file_data['path']),
                    content=file_data['code'],
                    language=file_data.get('language') or language,
                    student_id=file_data.get('student_id'),
                    student_name=file_data.get('student_name')
                )
                for file_data in parsed_files
            ]

            # Process files
            results = await self._process_files(
                files=batch_files,
                assignment_id=assignment_id,
                rubric_id=rubric_id
            )

            return self._summarize_batch(
                batch_id, len(batch_files), results, clock() - start_time
            )

        except Exception as e:
            # Top-level boundary: callers always get a result object back.
            logger.error("Files list processing failed",
                         batch_id=batch_id,
                         error=str(e))
            return self._failed_batch(
                batch_id,
                clock() - start_time,
                f"Processing failed: {str(e)}"
            )

    @staticmethod
    def _failed_batch(
        batch_id: str,
        processing_time: float,
        message: str
    ) -> BatchProcessingResult:
        """Build the result returned when a batch produced no per-file results."""
        return BatchProcessingResult(
            success=False,
            batch_id=batch_id,
            total_files=0,
            processed_files=0,
            failed_files=0,
            results=[],
            processing_time=processing_time,
            errors=[message]
        )

    def _summarize_batch(
        self,
        batch_id: str,
        total_files: int,
        results: list[AnalysisResponse],
        processing_time: float
    ) -> BatchProcessingResult:
        """Fold per-file responses into counts, error list and score statistics."""
        processed_count = sum(1 for r in results if r.success)
        failed_count = total_files - processed_count

        # Calculate scores if available
        scores = [r.total_score for r in results if r.total_score is not None]
        average_score = sum(scores) / len(scores) if scores else None
        score_distribution = (
            self._calculate_score_distribution(scores) if scores else None
        )

        return BatchProcessingResult(
            success=failed_count == 0,
            batch_id=batch_id,
            total_files=total_files,
            processed_files=processed_count,
            failed_files=failed_count,
            results=results,
            processing_time=processing_time,
            errors=[r.error_message for r in results if r.error_message],
            average_score=average_score,
            score_distribution=score_distribution
        )

    async def _discover_files(
        self,
        directory_path: str,
        language_filter: str | None = None
    ) -> list[BatchFile]:
        """Discover and read all supported files in directory

        The directory walk and file reads are blocking, so they run in a
        worker thread to keep the event loop responsive.

        Raises:
            ValueError: If `directory_path` is not an existing directory.
        """
        return await asyncio.to_thread(
            self._scan_directory, directory_path, language_filter
        )

    def _scan_directory(
        self,
        directory_path: str,
        language_filter: str | None = None
    ) -> list[BatchFile]:
        """Synchronously walk `directory_path` and load every supported file."""
        directory = Path(directory_path)

        # is_dir() is already False for a path that does not exist.
        if not directory.is_dir():
            raise ValueError(f"Directory does not exist: {directory_path}")

        files: list[BatchFile] = []

        # Sorted so the order of results does not depend on the filesystem.
        for file_path in sorted(directory.rglob("*")):
            if not file_path.is_file():
                continue

            # Check if file type is supported
            detected_language = detect_language_from_extension(file_path.name)

            if not detected_language:
                # NOTE(review): unsupported files are skipped either way;
                # the flag only decides whether a warning is logged, and
                # config.default_language is not consulted here - confirm
                # that this is the intended behaviour.
                if not self.config.skip_unsupported_files:
                    logger.warning("Unsupported file type", file=str(file_path))
                continue

            # Apply language filter if specified
            if language_filter and detected_language != language_filter:
                continue

            try:
                # Undecodable bytes are dropped rather than failing the file.
                content = file_path.read_text(encoding='utf-8', errors='ignore')

                # Extract student information
                student_id, student_name = self._extract_student_info(file_path)

                files.append(
                    BatchFile(
                        path=file_path,
                        content=content,
                        language=detected_language,
                        student_id=student_id,
                        student_name=student_name
                    )
                )

                logger.debug("Discovered file",
                             file=str(file_path),
                             language=detected_language,
                             student_id=student_id)

            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                logger.warning("Failed to read file",
                               file=str(file_path),
                               error=str(e))
                continue

        return files

    def _extract_student_info(self, file_path: Path) -> tuple[str | None, str | None]:
        """Extract student ID and name from file path

        The file stem is examined first, then every path component from
        the root downwards.  Within one component the configured patterns
        are tried in order and the first non-empty match is used as the
        student ID.  A pattern without a capture group contributes its
        whole match.

        Returns:
            ``(student_id, student_name)``; either element may be None.
            The name is only looked for in the component that produced the ID.
        """
        if not self.config.extract_student_info:
            return None, None

        patterns = self.config.student_id_patterns or []

        for part in [file_path.stem, *file_path.parts]:
            lowered = part.lower()

            for pattern in patterns:
                match = re.search(pattern, lowered)
                if match is None:
                    continue

                # group(1) would raise IndexError for a group-less pattern.
                student_id = match.group(1) if match.re.groups else match.group(0)
                if not student_id:
                    continue

                # Try to extract name from the same part
                name_match = re.search(r'([a-z]+_[a-z]+)', lowered)
                student_name = (
                    name_match.group(1).replace('_', ' ').title()
                    if name_match else None
                )
                return student_id, student_name

        return None, None

    async def _process_files(
        self,
        files: list[BatchFile],
        assignment_id: int | None = None,
        rubric_id: int | None = None
    ) -> list[AnalysisResponse]:
        """Process multiple files, optionally in parallel

        Returns:
            One response per input file, in the same order as `files`.
        """
        if not self.config.parallel_processing:
            # Process files sequentially
            return [
                await self._process_single_file(file, assignment_id, rubric_id)
                for file in files
            ]

        # Process files in parallel with limited concurrency.  A limit
        # below 1 would leave every task waiting on the semaphore forever.
        semaphore = asyncio.Semaphore(max(1, self.config.max_concurrent))
        outcomes = await asyncio.gather(
            *(
                self._process_single_file_with_semaphore(
                    semaphore, file, assignment_id, rubric_id
                )
                for file in files
            ),
            return_exceptions=True
        )

        responses: list[AnalysisResponse] = []
        for file, outcome in zip(files, outcomes):
            # BaseException also covers a cancelled child task, so no
            # outcome is dropped and results stay aligned with `files`.
            if isinstance(outcome, BaseException):
                logger.error("File processing failed",
                             file=str(file.path),
                             error=str(outcome))
                responses.append(self._error_response(outcome))
            else:
                responses.append(outcome)

        return responses

    async def _process_single_file_with_semaphore(
        self,
        semaphore: asyncio.Semaphore,
        file: BatchFile,
        assignment_id: int | None,
        rubric_id: int | None
    ) -> AnalysisResponse:
        """Process a single file with semaphore for concurrency control"""
        async with semaphore:
            return await self._process_single_file(file, assignment_id, rubric_id)

    async def _process_single_file(
        self,
        file: BatchFile,
        assignment_id: int | None,
        rubric_id: int | None
    ) -> AnalysisResponse:
        """Process a single file through the analysis pipeline

        Never raises for ordinary errors: any `Exception` is logged and
        turned into an unsuccessful response.  `assignment_id` and
        `rubric_id` are accepted but not used by the analysis call.
        """
        try:
            # Run analysis
            clock = asyncio.get_running_loop().time
            start_time = clock()

            analysis_result = await analyzer_manager.analyze_code(
                code=file.content,
                language=file.language,
                file_path=str(file.path),
                analyzer_config=None
            )

            processing_time = clock() - start_time

            # Convert to response format (simplified).
            # NOTE(review): imported locally, presumably to avoid a
            # circular import with the API routes module - confirm.
            from codelens.api.routes.analysis import (
                convert_analysis_issues,
                convert_metrics,
            )

            response = AnalysisResponse(
                success=analysis_result.success,
                submission_id=generate_submission_id(),
                syntax_valid=analysis_result.success,
                issues=convert_analysis_issues(analysis_result.issues),
                metrics=convert_metrics(analysis_result.metrics),
                analysis_version=analysis_result.analyzer_version,
                processing_time=processing_time,
                tools_used={"analyzer": analysis_result.analyzer_version},
                total_score=0.0,
                max_score=100.0
            )

            logger.info("Processed file",
                        file=str(file.path),
                        success=response.success,
                        processing_time=processing_time)

            return response

        except Exception as e:
            logger.error("Single file processing failed",
                         file=str(file.path),
                         error=str(e))
            return self._error_response(e)

    @staticmethod
    def _error_response(error: BaseException) -> AnalysisResponse:
        """Build the unsuccessful response recorded for a file that failed."""
        return AnalysisResponse(
            success=False,
            submission_id=generate_submission_id(),
            error_message=f"Processing failed: {str(error)}",
            processing_time=0.0,
            total_score=0.0,
            max_score=100.0
        )

    def _calculate_score_distribution(self, scores: list[float]) -> dict[str, int]:
        """Calculate score distribution by grade ranges

        Returns:
            A mapping from grade-band label to the number of scores in that
            band, or an empty dict when `scores` is empty.  Scores of 90 and
            above count as A; anything below 60 counts as F.
        """
        if not scores:
            return {}

        # (label, inclusive lower bound), highest band first.
        bands = (
            ("A (90-100)", 90),
            ("B (80-89)", 80),
            ("C (70-79)", 70),
            ("D (60-69)", 60),
        )
        fallback = "F (0-59)"

        distribution = {label: 0 for label, _ in bands}
        distribution[fallback] = 0

        for score in scores:
            label = next(
                (name for name, floor in bands if score >= floor),
                fallback
            )
            distribution[label] += 1

        return distribution
505
+
506
+
507
# Global batch processor instance, created at import time with the default
# BatchProcessingConfig (no explicit config is passed).
batch_processor = BatchProcessor()