cr-proc 0.1.11__tar.gz → 0.1.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: cr_proc
- Version: 0.1.11
+ Version: 0.1.13
  Summary: A tool for processing BYU CS code recording files.
  Author: Ethan Dye
  Author-email: mrtops03@gmail.com
@@ -79,6 +79,14 @@ When processing multiple files:
  specified file instead of stdout. For single files only.
  - `--output-dir OUTPUT_DIR`: (Optional) Directory to write reconstructed code
  files in batch mode. Files are named based on input recording filenames.
+ - `--submitted-file SUBMITTED_FILE`: (Optional) Path to the submitted final file
+ to verify against the reconstructed output. If provided, the reconstructed code
+ will be compared to this file and differences will be reported.
+ - `--submitted-dir SUBMITTED_DIR`: (Optional) Directory containing submitted files
+ to verify against the reconstructed output. For each recording file, the
+ corresponding submitted file will be found by matching the filename
+ (e.g., `homework0-ISC.recording.jsonl.gz` will match `homework0-ISC.py`).
+ Cannot be used with `--submitted-file`.
  - `-s, --show-autocomplete-details`: (Optional) Show individual auto-complete
  events in addition to aggregate statistics.
  - `-p, --playback`: (Optional) Play back the recording in real-time, showing
@@ -112,6 +120,18 @@ Save JSON results:
  poetry run cr_proc student1.jsonl.gz student2.jsonl.gz template.py -o results/
  ```

+ Verify against a single submitted file:
+
+ ```bash
+ poetry run cr_proc homework0.recording.jsonl.gz homework0.py --submitted-file submitted_homework0.py
+ ```
+
+ Verify against submitted files in a directory (batch mode):
+
+ ```bash
+ poetry run cr_proc recordings/*.jsonl.gz template.py --submitted-dir submissions/
+ ```
+
  This will process each recording independently and flag any that exceed 30
  minutes.

@@ -67,6 +67,14 @@ When processing multiple files:
  specified file instead of stdout. For single files only.
  - `--output-dir OUTPUT_DIR`: (Optional) Directory to write reconstructed code
  files in batch mode. Files are named based on input recording filenames.
+ - `--submitted-file SUBMITTED_FILE`: (Optional) Path to the submitted final file
+ to verify against the reconstructed output. If provided, the reconstructed code
+ will be compared to this file and differences will be reported.
+ - `--submitted-dir SUBMITTED_DIR`: (Optional) Directory containing submitted files
+ to verify against the reconstructed output. For each recording file, the
+ corresponding submitted file will be found by matching the filename
+ (e.g., `homework0-ISC.recording.jsonl.gz` will match `homework0-ISC.py`).
+ Cannot be used with `--submitted-file`.
  - `-s, --show-autocomplete-details`: (Optional) Show individual auto-complete
  events in addition to aggregate statistics.
  - `-p, --playback`: (Optional) Play back the recording in real-time, showing
@@ -100,6 +108,18 @@ Save JSON results:
  poetry run cr_proc student1.jsonl.gz student2.jsonl.gz template.py -o results/
  ```

+ Verify against a single submitted file:
+
+ ```bash
+ poetry run cr_proc homework0.recording.jsonl.gz homework0.py --submitted-file submitted_homework0.py
+ ```
+
+ Verify against submitted files in a directory (batch mode):
+
+ ```bash
+ poetry run cr_proc recordings/*.jsonl.gz template.py --submitted-dir submissions/
+ ```
+
  This will process each recording independently and flag any that exceed 30
  minutes.

@@ -1,6 +1,6 @@
  [project]
  name = "cr_proc"
- version = "0.1.11"
+ version = "0.1.13"
  description = "A tool for processing BYU CS code recording files."
  authors = [
  {name = "Ethan Dye",email = "mrtops03@gmail.com"}
@@ -5,9 +5,38 @@ from pathlib import Path, PureWindowsPath, PurePosixPath
  from typing import Any


+ def normalize_path_string(path_str: str) -> str:
+ """
+ Normalize a path string to use forward slashes (POSIX style).
+
+ Handles both Windows-style (backslash) and Unix-style (forward slash) paths
+ regardless of the current platform. Useful for cross-platform consistency
+ when files are created on Windows but processed on other systems.
+
+ Parameters
+ ----------
+ path_str : str
+ Path string (may use Windows or Unix separators)
+
+ Returns
+ -------
+ str
+ Normalized path string using forward slashes
+ """
+ # Try to detect if this is a Windows path (contains backslashes)
+ if "\\" in path_str:
+ # Windows-style path
+ path_obj = PureWindowsPath(path_str)
+ else:
+ # Unix-style path (or just a filename)
+ path_obj = PurePosixPath(path_str)
+
+ return path_obj.as_posix()
+
+
  def _normalize_document_path(doc_path: str) -> tuple[str, str]:
  """
- Normalize a document path to extract filename and stem.
+ Extract filename and stem from a document path.

  Handles both Windows-style (backslash) and Unix-style (forward slash) paths
  regardless of the current platform.
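
For reference, a small standalone sketch of what the new helper does. The package's import path for this module is not shown in the diff, so the logic is copied inline rather than imported; the paths are illustrative.

```python
from pathlib import PurePosixPath, PureWindowsPath

def normalize_path_string(path_str: str) -> str:
    # Same rule as the function added above: backslashes imply a Windows path.
    if "\\" in path_str:
        return PureWindowsPath(path_str).as_posix()
    return PurePosixPath(path_str).as_posix()

print(normalize_path_string(r"C:\Users\student\homework0-ISC.py"))  # C:/Users/student/homework0-ISC.py
print(normalize_path_string("src/homework0-ISC.py"))                 # src/homework0-ISC.py
print(normalize_path_string("homework0-ISC.py"))                     # homework0-ISC.py
```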
@@ -22,14 +51,9 @@ def _normalize_document_path(doc_path: str) -> tuple[str, str]:
  tuple[str, str]
  (filename, stem) extracted from the path
  """
- # Try to detect if this is a Windows path (contains backslashes)
- if "\\" in doc_path:
- # Windows-style path
- path_obj = PureWindowsPath(doc_path)
- else:
- # Unix-style path (or just a filename)
- path_obj = PurePosixPath(doc_path)
-
+ # Normalize to forward slashes first, then parse
+ normalized = normalize_path_string(doc_path)
+ path_obj = PurePosixPath(normalized)
  return path_obj.name, path_obj.stem


@@ -4,6 +4,8 @@ import sys
  from pathlib import Path
  from typing import Any

+ from .document import normalize_path_string
+

  def write_batch_json_output(
  output_path: Path,
@@ -36,15 +38,21 @@ def write_batch_json_output(
  # Convert results to JSON-serializable format
  files_data = []
  for r in results:
- files_data.append({
- "jsonl_file": str(r["jsonl_file"]),
+ file_result = {
+ "jsonl_file": normalize_path_string(str(r["jsonl_file"])),
  "document": r["target_document"],
  "verified": r["verified"],
  "time_info": r["time_info"],
  "suspicious_events": r["suspicious_events"],
  "template_diff": r.get("template_diff", ""),
  "reconstructed_code": r["reconstructed"],
- })
+ }
+
+ # Add submitted_comparison if present
+ if r.get("submitted_comparison") is not None:
+ file_result["submitted_comparison"] = r["submitted_comparison"]
+
+ files_data.append(file_result)

  # Use consistent format for both single and batch modes
  output_data = {
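
Putting the hunk above together, each entry in `files_data` now has roughly the shape below. Only the key names come from the diff; the values here are placeholders for illustration.

```python
# Illustrative shape of one per-file record (placeholder values).
file_result = {
    "jsonl_file": "recordings/homework0-ISC.recording.jsonl.gz",  # normalized to forward slashes
    "document": "homework0-ISC.py",
    "verified": True,
    "time_info": {"minutes_elapsed": 42.5},  # structure abbreviated
    "suspicious_events": [],
    "template_diff": "",
    "reconstructed_code": "print('hello')\n",
    # Only present when a submitted file was compared:
    "submitted_comparison": {
        "matches": True,
        "submitted_file": "submissions/homework0-ISC.py",
        "diff": "",
        "whitespace_only": False,
    },
}
```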
@@ -1,6 +1,7 @@
  from typing import Any
  from datetime import datetime
  import difflib
+ from .document import normalize_path_string

  # ============================================================================
  # Constants for detection thresholds
@@ -837,15 +838,19 @@ def verify(template: str, jsonData: tuple[dict[str, Any], ...]) -> tuple[str, li


  def combine_time_info(
- time_infos: list[dict[str, Any] | None], time_limit_minutes: int | None
+ all_events: list[tuple[dict[str, Any], ...]], time_limit_minutes: int | None
  ) -> dict[str, Any] | None:
  """
- Combine time information from multiple recording files.
+ Combine time information from multiple recording files, avoiding double-counting overlapping time.
+
+ Merges all events from multiple recordings, then calculates the actual time spent editing
+ using the same logic as check_time_limit (gap analysis with focus awareness). This ensures
+ overlapping editing sessions are not double-counted.

  Parameters
  ----------
- time_infos : list[dict[str, Any] | None]
- List of time information dictionaries from multiple files
+ all_events : list[tuple[dict[str, Any], ...]]
+ List of event tuples from multiple recording files
  time_limit_minutes : int | None
  Time limit to check against

@@ -854,40 +859,94 @@ def combine_time_info(
  dict[str, Any] | None
  Combined time information, or None if no valid data
  """
- valid_infos = [info for info in time_infos if info is not None]
- if not valid_infos:
+ # Filter out empty event sets
+ valid_event_sets = [events for events in all_events if events]
+ if not valid_event_sets:
  return None

- # Sum elapsed times across all sessions
- total_elapsed = sum(info["minutes_elapsed"] for info in valid_infos)
+ # Merge all events from all recordings into a single tuple
+ merged_events = tuple(
+ event
+ for event_set in valid_event_sets
+ for event in event_set
+ )

- # Find overall first and last timestamps
- all_timestamps = []
- for info in valid_infos:
- all_timestamps.append(
- datetime.fromisoformat(info["first_timestamp"].replace("Z", "+00:00"))
- )
- all_timestamps.append(
- datetime.fromisoformat(info["last_timestamp"].replace("Z", "+00:00"))
- )
+ # Use check_time_limit on the merged events to calculate time properly
+ # This handles overlapping periods automatically since we're now analyzing
+ # all events together chronologically
+ combined_result = check_time_limit(merged_events, time_limit_minutes)

- first_ts = min(all_timestamps)
- last_ts = max(all_timestamps)
- overall_span = (last_ts - first_ts).total_seconds() / 60
+ if combined_result is None:
+ return None

- result = {
- "time_limit_minutes": time_limit_minutes,
- "minutes_elapsed": round(total_elapsed, 2),
- "first_timestamp": first_ts.isoformat().replace("+00:00", "Z"),
- "last_timestamp": last_ts.isoformat().replace("+00:00", "Z"),
- "file_count": len(valid_infos),
- "overall_span_minutes": round(overall_span, 2),
- }
+ # Add file_count to the result
+ combined_result["file_count"] = len(valid_event_sets)

- # For time limit check in combined mode, use the sum of elapsed times
- if time_limit_minutes is not None:
- result["exceeds_limit"] = total_elapsed > time_limit_minutes
- else:
- result["exceeds_limit"] = False
+ return combined_result

- return result
+
+ def compare_submitted_file(reconstructed_code: str, submitted_file_path) -> dict[str, Any]:
+ """
+ Compare reconstructed code from recording with a submitted final file.
+
+ Parameters
+ ----------
+ reconstructed_code : str
+ The code reconstructed from the recording
+ submitted_file_path : Path
+ Path to the submitted file
+
+ Returns
+ -------
+ dict[str, Any]
+ Dictionary containing:
+ - matches: bool indicating if the files match
+ - submitted_file: path to the submitted file
+ - diff: unified diff string if files don't match
+ - whitespace_only: bool indicating if only whitespace differs
+ """
+ try:
+ submitted_content = submitted_file_path.read_text()
+ except Exception as e:
+ return {
+ "matches": False,
+ "submitted_file": normalize_path_string(str(submitted_file_path)),
+ "error": f"Failed to read submitted file: {e}",
+ "diff": "",
+ "whitespace_only": False,
+ }
+
+ # Normalize newlines for comparison
+ reconstructed_normalized = _normalize_newlines(reconstructed_code)
+ submitted_normalized = _normalize_newlines(submitted_content)
+
+ # Check exact match
+ matches = reconstructed_normalized == submitted_normalized
+
+ # Check if only whitespace differs
+ whitespace_only = False
+ if not matches:
+ whitespace_only = is_only_whitespace_differences(
+ submitted_normalized, reconstructed_normalized
+ )
+
+ # Generate diff if they don't match
+ diff_text = ""
+ if not matches:
+ reconstructed_lines = reconstructed_normalized.splitlines(keepends=True)
+ submitted_lines = submitted_normalized.splitlines(keepends=True)
+ diff = difflib.unified_diff(
+ reconstructed_lines,
+ submitted_lines,
+ fromfile="reconstructed",
+ tofile="submitted",
+ lineterm="",
+ )
+ diff_text = "".join(diff)
+
+ return {
+ "matches": matches,
+ "submitted_file": normalize_path_string(str(submitted_file_path)),
+ "diff": diff_text,
+ "whitespace_only": whitespace_only,
+ }
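
The rewritten `combine_time_info` merges the events from all recordings before measuring time, so overlapping sessions are counted once. Below is a standalone sketch of that idea using made-up editing windows; the real implementation delegates to `check_time_limit`'s gap analysis rather than this simple interval merge.

```python
from datetime import datetime

# Hypothetical editing windows from two overlapping recordings (not real data).
sessions = [
    (datetime(2025, 1, 1, 10, 0), datetime(2025, 1, 1, 10, 30)),   # recording A
    (datetime(2025, 1, 1, 10, 15), datetime(2025, 1, 1, 10, 45)),  # recording B
]

# Summing per-recording elapsed times double-counts the 10:15-10:30 overlap.
summed = sum((end - start).total_seconds() / 60 for start, end in sessions)

# Merging the timelines first counts the overlapping window only once.
merged: list[list[datetime]] = []
for start, end in sorted(sessions):
    if merged and start <= merged[-1][1]:
        merged[-1][1] = max(merged[-1][1], end)
    else:
        merged.append([start, end])
combined = sum((end - start).total_seconds() / 60 for start, end in merged)

print(summed, combined)  # 60.0 45.0
```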
@@ -18,11 +18,13 @@ from .api.output import write_batch_json_output
  from .api.verify import (
  check_time_limit,
  combine_time_info,
+ compare_submitted_file,
  detect_external_copypaste,
  template_diff,
  verify,
  )
  from .display import (
+ display_submitted_file_comparison,
  display_suspicious_events,
  display_template_diff,
  display_time_info,
@@ -102,6 +104,21 @@ def create_parser() -> argparse.ArgumentParser:
  help="Directory to write reconstructed code files in batch mode (one file per recording). "
  "Files are named based on input recording filenames.",
  )
+ parser.add_argument(
+ "--submitted-file",
+ type=Path,
+ default=None,
+ help="Path to the submitted final file to verify against the reconstructed output. "
+ "If provided, the reconstructed code will be compared to this file.",
+ )
+ parser.add_argument(
+ "--submitted-dir",
+ type=Path,
+ default=None,
+ help="Directory containing submitted files to compare against. "
+ "For each recording, the corresponding submitted file will be found by matching the filename. "
+ "For example, 'homework0-ISC.recording.jsonl.gz' will match 'homework0-ISC.py' in the directory.",
+ )
  parser.add_argument(
  "-s",
  "--show-autocomplete-details",
@@ -169,36 +186,81 @@ def expand_file_patterns(patterns: list[str]) -> list[Path]:
  return existing_files


+ def find_submitted_file(
+ jsonl_file: Path,
+ submitted_dir: Path,
+ target_document: str | None,
+ ) -> Path | None:
+ """
+ Find the submitted file corresponding to a recording file.
+
+ Matches by replacing '.recording.jsonl.gz' with the extension of the
+ target document (or '.py' if not specified).
+
+ Parameters
+ ----------
+ jsonl_file : Path
+ Path to the JSONL recording file
+ submitted_dir : Path
+ Directory containing submitted files
+ target_document : str | None
+ Target document path (to extract extension)
+
+ Returns
+ -------
+ Path | None
+ Path to the submitted file if found, None otherwise
+ """
+ # Determine the file extension from target_document or default to .py
+ extension = ".py"
+ if target_document:
+ extension = Path(target_document).suffix or ".py"
+
+ # Remove '.recording.jsonl.gz' and add the appropriate extension
+ base_name = jsonl_file.name.replace(".recording.jsonl.gz", "")
+ submitted_filename = base_name + extension
+
+ submitted_file = submitted_dir / submitted_filename
+ if submitted_file.exists():
+ return submitted_file
+
+ return None
+
+
  def process_single_file(
  jsonl_path: Path,
+ json_data: tuple[dict[str, Any], ...],
  template_data: str,
  target_document: str | None,
  time_limit: int | None,
- ) -> tuple[bool, str, list[dict[str, Any]], dict[str, Any] | None, str]:
+ submitted_file: Path | None = None,
+ submitted_dir: Path | None = None,
+ ) -> tuple[bool, str, list[dict[str, Any]], dict[str, Any] | None, str, tuple[dict[str, Any], ...], dict[str, Any] | None]:
  """
  Process a single JSONL recording file.

  Parameters
  ----------
  jsonl_path : Path
- Path to the JSONL file
+ Path to the JSONL file (used for error reporting and file matching)
+ json_data : tuple[dict[str, Any], ...]
+ Pre-loaded JSON events from the recording file
  template_data : str
  Template file content
  target_document : str | None
  Document to process
  time_limit : int | None
  Time limit in minutes
+ submitted_file : Path | None
+ Path to the submitted file to compare against
+ submitted_dir : Path | None
+ Directory containing submitted files to compare against

  Returns
  -------
  tuple
- (verified, reconstructed_code, suspicious_events, time_info, template_diff_text)
+ (verified, reconstructed_code, suspicious_events, time_info, template_diff_text, doc_events, submitted_comparison)
  """
- try:
- json_data = load_jsonl(jsonl_path)
- except (FileNotFoundError, ValueError, IOError) as e:
- print(f"Error loading {jsonl_path}: {e}", file=sys.stderr)
- return False, "", [], None, ""

  # Filter events for target document
  doc_events = filter_events_by_document(json_data, target_document)
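
A standalone sketch of the filename-matching rule that `find_submitted_file` applies (logic copied from the hunk above; the file names are illustrative):

```python
from pathlib import Path

def match_submitted_name(recording_name: str, target_document: str | None) -> str:
    # Mirror of find_submitted_file's rule: strip ".recording.jsonl.gz" and
    # append the target document's extension, defaulting to ".py".
    extension = ".py"
    if target_document:
        extension = Path(target_document).suffix or ".py"
    return recording_name.replace(".recording.jsonl.gz", "") + extension

print(match_submitted_name("homework0-ISC.recording.jsonl.gz", "homework0-ISC.py"))  # homework0-ISC.py
print(match_submitted_name("lab3.recording.jsonl.gz", "lab3.cpp"))                   # lab3.cpp
```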
@@ -207,7 +269,7 @@ def process_single_file(
  f"Warning: No events found for document '{target_document}' in {jsonl_path}",
  file=sys.stderr,
  )
- return False, "", [], None, ""
+ return False, "", [], None, "", (), None

  # Check time information
  time_info = check_time_limit(doc_events, time_limit)
@@ -218,13 +280,29 @@ def process_single_file(
  reconstructed = reconstruct_file_from_events(
  doc_events, verified_template, document_path=target_document
  )
- return True, reconstructed, suspicious_events, time_info, ""
+
+ # Compare with submitted file if provided
+ submitted_comparison = None
+ actual_submitted_file = submitted_file
+
+ # If submitted_dir is provided, find the matching file
+ if submitted_dir and not submitted_file:
+ actual_submitted_file = find_submitted_file(jsonl_path, submitted_dir, target_document)
+ if actual_submitted_file:
+ print(f"Found submitted file: {actual_submitted_file.name}", file=sys.stderr)
+
+ if actual_submitted_file and actual_submitted_file.exists():
+ submitted_comparison = compare_submitted_file(reconstructed, actual_submitted_file)
+ elif actual_submitted_file:
+ print(f"Warning: Submitted file not found: {actual_submitted_file}", file=sys.stderr)
+
+ return True, reconstructed, suspicious_events, time_info, "", doc_events, submitted_comparison
  except ValueError as e:
  # If verification fails but we have events, still try to reconstruct
  print(f"Warning: Verification failed for {jsonl_path}: {e}", file=sys.stderr)
  try:
  if not doc_events:
- return False, "", [], time_info, ""
+ return False, "", [], time_info, "", (), None

  # Compute diff against template and still detect suspicious events
  diff_text = template_diff(template_data, doc_events)
@@ -235,19 +313,35 @@ def process_single_file(
  reconstructed = reconstruct_file_from_events(
  doc_events, initial_state, document_path=target_document
  )
- return False, reconstructed, suspicious_events, time_info, diff_text
+
+ # Compare with submitted file if provided
+ submitted_comparison = None
+ actual_submitted_file = submitted_file
+
+ # If submitted_dir is provided, find the matching file
+ if submitted_dir and not submitted_file:
+ actual_submitted_file = find_submitted_file(jsonl_path, submitted_dir, target_document)
+ if actual_submitted_file:
+ print(f"Found submitted file: {actual_submitted_file.name}", file=sys.stderr)
+
+ if actual_submitted_file and actual_submitted_file.exists():
+ submitted_comparison = compare_submitted_file(reconstructed, actual_submitted_file)
+ elif actual_submitted_file:
+ print(f"Warning: Submitted file not found: {actual_submitted_file}", file=sys.stderr)
+
+ return False, reconstructed, suspicious_events, time_info, diff_text, doc_events, submitted_comparison
  except Exception as reconstruction_error:
  print(
  f"Error reconstructing {jsonl_path}: {type(reconstruction_error).__name__}: {reconstruction_error}",
  file=sys.stderr,
  )
- return False, "", [], time_info, ""
+ return False, "", [], time_info, "", (), None
  except Exception as e:
  print(
  f"Error processing {jsonl_path}: {type(e).__name__}: {e}",
  file=sys.stderr,
  )
- return False, "", [], time_info, ""
+ return False, "", [], time_info, "", (), None


  def write_reconstructed_file(
@@ -274,7 +368,7 @@ def write_reconstructed_file(
  """
  try:
  output_path.parent.mkdir(parents=True, exist_ok=True)
- output_path.write_text(content)
+ output_path.write_text(content + '\n')
  print(f"{file_description} written to: {output_path}", file=sys.stderr)
  return True
  except Exception as e:
@@ -284,7 +378,8 @@ def write_reconstructed_file(

  def handle_playback_mode(
  jsonl_file: Path,
- template_file: Path,
+ json_data: tuple[dict[str, Any], ...],
+ template_base: Path | None,
  template_data: str,
  document_override: str | None,
  speed: float,
@@ -295,9 +390,11 @@ def handle_playback_mode(
  Parameters
  ----------
  jsonl_file : Path
- Path to the recording file
- template_file : Path
- Path to the template file
+ Path to the recording file (for error reporting)
+ json_data : tuple[dict[str, Any], ...]
+ Pre-loaded JSON events from the recording file
+ template_base : Path | None
+ Path to the template file or directory
  template_data : str
  Template file content
  document_override : str | None
@@ -311,9 +408,8 @@ def handle_playback_mode(
  Exit code (0 for success, 1 for error)
  """
  try:
- json_data = load_jsonl(jsonl_file)
  recorded_docs = get_recorded_documents(json_data)
- target_document = resolve_document(recorded_docs, template_file, document_override)
+ target_document = resolve_document(recorded_docs, template_base, document_override)

  if target_document:
  playback_recording(json_data, target_document, template_data, speed)
@@ -322,12 +418,13 @@ def handle_playback_mode(
  print("Error: No documents found in recording", file=sys.stderr)
  return 1
  except Exception as e:
- print(f"Error loading file for playback: {e}", file=sys.stderr)
+ print(f"Error in playback: {e}", file=sys.stderr)
  return 1


  def process_batch(
  jsonl_files: list[Path],
+ json_data_map: dict[Path, tuple[dict[str, Any], ...]],
  template_base: Path | None,
  template_data: str,
  args: argparse.Namespace,
@@ -339,6 +436,8 @@ def process_batch(
  ----------
  jsonl_files : list[Path]
  List of JSONL files to process
+ json_data_map : dict[Path, tuple[dict[str, Any], ...]]
+ Pre-loaded JSON data for each file
  template_base : Path
  Path to template file or directory
  template_data : str
@@ -360,9 +459,16 @@ def process_batch(
  for i, jsonl_file in enumerate(jsonl_files, 1):
  print_batch_header(i, len(jsonl_files), jsonl_file.name)

+ # Get pre-loaded data for this file
+ if jsonl_file not in json_data_map:
+ print(f"Error: No pre-loaded data for {jsonl_file}", file=sys.stderr)
+ all_verified = False
+ continue
+
+ file_data = json_data_map[jsonl_file]
+
  # Determine target document for this file
  try:
- file_data = load_jsonl(jsonl_file)
  recorded_docs = get_recorded_documents(file_data)
  target_document = resolve_document(recorded_docs, template_base, args.document)
  except (FileNotFoundError, ValueError, IOError) as e:
@@ -386,9 +492,9 @@ def process_batch(
  else:
  file_template_data = template_data

- # Process the file
- verified, reconstructed, suspicious_events, time_info, diff_text = process_single_file(
- jsonl_file, file_template_data, target_document, args.time_limit
+ # Process the file with pre-loaded data
+ verified, reconstructed, suspicious_events, time_info, diff_text, doc_events, submitted_comparison = process_single_file(
+ jsonl_file, file_data, file_template_data, target_document, args.time_limit, args.submitted_file, args.submitted_dir
  )

  if not verified:
@@ -398,6 +504,7 @@ def process_batch(
  display_time_info(time_info)
  display_suspicious_events(suspicious_events, args.show_autocomplete_details)
  display_template_diff(diff_text)
+ display_submitted_file_comparison(submitted_comparison)

  # Store results
  results.append({
@@ -408,6 +515,8 @@ def process_batch(
  "suspicious_events": suspicious_events,
  "time_info": time_info,
  "template_diff": diff_text,
+ "doc_events": doc_events,
+ "submitted_comparison": submitted_comparison,
  })

  # Write output file if requested
@@ -421,6 +530,7 @@ def process_batch(

  def process_single(
  jsonl_file: Path,
+ json_data: tuple[dict[str, Any], ...],
  template_base: Path | None,
  template_data: str,
  args: argparse.Namespace,
@@ -432,6 +542,8 @@ def process_single(
  ----------
  jsonl_file : Path
  Path to JSONL file
+ json_data : tuple[dict[str, Any], ...]
+ Pre-loaded JSON data for the file
  template_base : Path
  Path to template file or directory
  template_data : str
@@ -445,8 +557,7 @@ def process_single(
  (results, verified)
  """
  try:
- file_data = load_jsonl(jsonl_file)
- recorded_docs = get_recorded_documents(file_data)
+ recorded_docs = get_recorded_documents(json_data)
  target_document = resolve_document(recorded_docs, template_base, args.document)
  except (FileNotFoundError, ValueError, IOError) as e:
  print(f"Error determining document: {e}", file=sys.stderr)
@@ -470,14 +581,15 @@ def process_single(

  print(f"Processing: {target_document or template_base}", file=sys.stderr)

- verified, reconstructed, suspicious_events, time_info, diff_text = process_single_file(
- jsonl_file, file_template_data, target_document, args.time_limit
+ verified, reconstructed, suspicious_events, time_info, diff_text, doc_events, submitted_comparison = process_single_file(
+ jsonl_file, json_data, file_template_data, target_document, args.time_limit, args.submitted_file, args.submitted_dir
  )

  # Display results
  display_time_info(time_info)
  display_suspicious_events(suspicious_events, args.show_autocomplete_details)
  display_template_diff(diff_text)
+ display_submitted_file_comparison(submitted_comparison)

  # Write output file if requested
  if reconstructed and args.output_file:
@@ -492,6 +604,8 @@ def process_single(
  "suspicious_events": suspicious_events,
  "time_info": time_info,
  "template_diff": diff_text,
+ "doc_events": doc_events,
+ "submitted_comparison": submitted_comparison,
  }]

  return results, verified
@@ -526,6 +640,11 @@ def main() -> int:
  parser.print_help()
  return 1

+ # Validate that both --submitted-file and --submitted-dir are not provided simultaneously
+ if args.submitted_file and args.submitted_dir:
+ print("Error: Cannot specify both --submitted-file and --submitted-dir", file=sys.stderr)
+ return 1
+
  # Expand file patterns and validate
  try:
  jsonl_files = expand_file_patterns(jsonl_patterns)
@@ -540,10 +659,23 @@ def main() -> int:
  # Determine template source (use template_dir if provided, otherwise template_file)
  template_path = args.template_dir if args.template_dir else template_file

+ # Load all files once - fail fast if any fail
+ json_data_map: dict[Path, tuple[dict[str, Any], ...]] = {}
+ for jsonl_file in jsonl_files:
+ try:
+ json_data_map[jsonl_file] = load_jsonl(jsonl_file)
+ except ValueError as e:
+ print(f"Error parsing {jsonl_file}: {e}", file=sys.stderr)
+ print("Tampering is likely - aborting processing.", file=sys.stderr)
+ return 1
+ except (FileNotFoundError, ValueError, IOError) as e:
+ print(f"Error loading {jsonl_file}: {e}", file=sys.stderr)
+ return 1
+
  # Handle playback mode (single file only)
  if not batch_mode and args.playback:
  try:
- json_data = load_jsonl(jsonl_files[0])
+ json_data = json_data_map[jsonl_files[0]]
  recorded_docs = get_recorded_documents(json_data)
  target_document = resolve_document(recorded_docs, template_path, args.document)

@@ -554,14 +686,11 @@ def main() -> int:
  target_document
  )

- if target_document:
- playback_recording(json_data, target_document, template_data, args.playback_speed)
- return 0
- else:
- print("Error: No documents found in recording", file=sys.stderr)
- return 1
+ return handle_playback_mode(
+ jsonl_files[0], json_data, template_path, template_data, args.document, args.playback_speed
+ )
  except Exception as e:
- print(f"Error loading file for playback: {e}", file=sys.stderr)
+ print(f"Error in playback: {e}", file=sys.stderr)
  return 1

  # Get template data
@@ -580,14 +709,14 @@ def main() -> int:
  print(f"Error: {e}", file=sys.stderr)
  return 1

- # Process files
+ # Process files with pre-loaded data
  if batch_mode:
  results, all_verified = process_batch(
- jsonl_files, template_path, template_data, args
+ jsonl_files, json_data_map, template_path, template_data, args
  )
  else:
  results, all_verified = process_single(
- jsonl_files[0], template_path, template_data, args
+ jsonl_files[0], json_data_map[jsonl_files[0]], template_path, template_data, args
  )

  if not results:
@@ -600,10 +729,10 @@ def main() -> int:
  print_batch_summary(len(results), verified_count, failed_files)

  # Display combined time report
- time_infos = [r["time_info"] for r in results]
+ all_events = [r["doc_events"] for r in results]
  combined_time = None
- if any(time_infos):
- combined_time = combine_time_info(time_infos, args.time_limit)
+ if any(all_events):
+ combined_time = combine_time_info(all_events, args.time_limit)
  display_time_info(combined_time, is_combined=True)

  # Write JSON output
@@ -176,6 +176,39 @@ def display_template_diff(diff_text: str) -> None:
  print(diff_text, file=sys.stderr)


+ def display_submitted_file_comparison(comparison: dict[str, Any] | None) -> None:
+ """
+ Display comparison results between reconstructed code and submitted file.
+
+ Parameters
+ ----------
+ comparison : dict[str, Any] | None
+ Comparison results from compare_submitted_file, or None if no comparison
+ """
+ if not comparison:
+ return
+
+ print("\nSubmitted file comparison:", file=sys.stderr)
+ print(f" Submitted file: {comparison['submitted_file']}", file=sys.stderr)
+
+ if "error" in comparison:
+ print(f" Error: {comparison['error']}", file=sys.stderr)
+ return
+
+ if comparison["matches"]:
+ print(" ✓ Reconstructed code matches submitted file exactly", file=sys.stderr)
+ elif comparison.get("whitespace_only", False):
+ print(" ⚠ Reconstructed code differs only in whitespace from submitted file", file=sys.stderr)
+ else:
+ print(" ✗ Reconstructed code differs from submitted file", file=sys.stderr)
+ if comparison.get("diff"):
+ print("\n Diff (reconstructed → submitted):", file=sys.stderr)
+ # Indent each line of the diff
+ for line in comparison["diff"].split("\n"):
+ if line:
+ print(f" {line}", file=sys.stderr)
+
+
  def print_separator() -> None:
  """Print a separator line."""
  print(f"{'='*80}", file=sys.stderr)