cr-proc 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -169,6 +169,9 @@ def reconstruct_file_from_events(
169
169
  from .load import is_edit_event
170
170
  events = tuple(e for e in events if is_edit_event(e))
171
171
 
172
+ # Skip no-op events at offset 0 (oldFragment == newFragment, typically file-open markers)
173
+ events = tuple(e for e in events if not (e.get("oldFragment") == e.get("newFragment") and e.get("offset") == 0))
174
+
172
175
  # Read template content
173
176
  if normalize_newlines:
174
177
  template = _normalize_newlines(template)
@@ -197,6 +200,39 @@ def reconstruct_file_from_events(
197
200
  # No events for target_doc; return template unchanged
198
201
  return template
199
202
 
203
+ # Handle case where first event is a file-open/load event at offset 0
204
+ # (IDE captures the file content as seen when opened)
205
+ if evs and evs[0].get("offset") == 0:
206
+ first_old = evs[0].get("oldFragment", "")
207
+ first_new = evs[0].get("newFragment", "")
208
+
209
+ if first_old and not template.startswith(first_old):
210
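+ # Check if this looks like a file-open event (NOTE(review): no-op events at offset 0 are filtered out earlier in this function; if evs is derived from that filtered list this check can never succeed - confirm):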
211
+ # - First event is at offset 0
212
+ # - oldFragment and newFragment are identical and longer than 50 chars (file was loaded)
213
+ # - Template is shorter than oldFragment (stub/placeholder)
214
+ is_likely_file_open = (
215
+ first_old == first_new and # no-op replacement (just file load)
216
+ len(first_old) > 50 and # substantial content
217
+ len(template) < len(first_old) # template is smaller stub
218
+ )
219
+
220
+ if is_likely_file_open:
221
+ # Use first event's oldFragment as the template (actual file state when opened)
222
+ template = first_old
223
+ else:
224
+ # Template genuinely doesn't match
225
+ raise ValueError(
226
+ f"Template content does not match recording's initial state.\n"
227
+ f"First event expects to replace {len(first_old)} chars starting at offset 0,\n"
228
+ f"but template only has {len(template)} chars and starts with:\n"
229
+ f"{template[:min(100, len(template))]!r}\n\n"
230
+ f"Expected to start with:\n"
231
+ f"{first_old[:min(100, len(first_old))]!r}\n\n"
232
+ f"Recording was likely made on a different version of the file.\n"
233
+ f"Document path in recording: {target_doc}"
234
+ )
235
+
200
236
  if utf16_mode:
201
237
  # Work in UTF-16-LE byte space
202
238
  doc_bytes = template.encode("utf-16-le")
@@ -1,10 +1,38 @@
1
1
  """Document resolution and filtering utilities."""
2
2
  import difflib
3
3
  import sys
4
- from pathlib import Path
4
+ from pathlib import Path, PureWindowsPath, PurePosixPath
5
5
  from typing import Any
6
6
 
7
7
 
8
+ def _normalize_document_path(doc_path: str) -> tuple[str, str]:
9
+ """
10
+ Normalize a document path to extract filename and stem.
11
+
12
+ Handles both Windows-style (backslash) and Unix-style (forward slash) paths
13
+ regardless of the current platform.
14
+
15
+ Parameters
16
+ ----------
17
+ doc_path : str
18
+ Document path string (may use Windows or Unix separators)
19
+
20
+ Returns
21
+ -------
22
+ tuple[str, str]
23
+ (filename, stem) extracted from the path
24
+ """
25
+ # Try to detect if this is a Windows path (contains backslashes)
26
+ if "\\" in doc_path:
27
+ # Windows-style path
28
+ path_obj = PureWindowsPath(doc_path)
29
+ else:
30
+ # Unix-style path (or just a filename)
31
+ path_obj = PurePosixPath(doc_path)
32
+
33
+ return path_obj.name, path_obj.stem
34
+
35
+
8
36
  def find_matching_template(
9
37
  template_dir: Path, document_path: str
10
38
  ) -> Path | None:
@@ -31,8 +59,7 @@ def find_matching_template(
31
59
  if not template_dir.is_dir():
32
60
  return None
33
61
 
34
- doc_name = Path(document_path).name
35
- doc_stem = Path(document_path).stem
62
+ doc_name, doc_stem = _normalize_document_path(document_path)
36
63
 
37
64
  # First, try exact filename match
38
65
  exact_match = template_dir / doc_name
@@ -81,19 +108,25 @@ def get_normalized_document_key(doc_path: str) -> tuple[str, str]:
81
108
  Get a normalized key for a document based on filename and extension.
82
109
 
83
110
  This helps identify documents that are the same but with different paths.
111
+ Handles both Windows and Unix style paths correctly.
84
112
 
85
113
  Parameters
86
114
  ----------
87
115
  doc_path : str
88
- Document path
116
+ Document path (may use Windows or Unix separators)
89
117
 
90
118
  Returns
91
119
  -------
92
120
  tuple[str, str]
93
121
  (filename_with_extension, extension) for grouping similar documents
94
122
  """
95
- path_obj = Path(doc_path)
96
- return (path_obj.name, path_obj.suffix)
123
+ filename, _ = _normalize_document_path(doc_path)
124
+ # Get extension from filename
125
+ if '.' in filename:
126
+ extension = '.' + filename.rsplit('.', 1)[1]
127
+ else:
128
+ extension = ''
129
+ return (filename, extension)
97
130
 
98
131
 
99
132
  def group_documents_by_name(docs: list[str]) -> dict[tuple[str, str], list[str]]:
@@ -205,7 +238,8 @@ def resolve_document(
205
238
 
206
239
  if override:
207
240
  matches = [
208
- d for d in unique_docs if d.endswith(override) or Path(d).name == override
241
+ d for d in unique_docs
242
+ if d.endswith(override) or _normalize_document_path(d)[0] == override
209
243
  ]
210
244
  if not matches:
211
245
  raise ValueError(
@@ -220,7 +254,10 @@ def resolve_document(
220
254
  # If template_path is provided and is a file (not directory), use its extension for matching
221
255
  if template_path and template_path.is_file():
222
256
  template_ext = template_path.suffix
223
- ext_matches = [d for d in unique_docs if Path(d).suffix == template_ext]
257
+ ext_matches = [
258
+ d for d in unique_docs
259
+ if _normalize_document_path(d)[0].endswith(template_ext)
260
+ ]
224
261
  if len(ext_matches) == 1:
225
262
  return ext_matches[0]
226
263
  if len(ext_matches) > 1:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cr_proc
3
- Version: 0.1.9
3
+ Version: 0.1.11
4
4
  Summary: A tool for processing BYU CS code recording files.
5
5
  Author: Ethan Dye
6
6
  Author-email: mrtops03@gmail.com
@@ -28,7 +28,8 @@ poetry install
28
28
 
29
29
  ## Usage
30
30
 
31
- The processor can be run using the `cr_proc` command with recording file(s) and a template:
31
+ The processor can be run using the `cr_proc` command with recording file(s) and
32
+ a template:
32
33
 
33
34
  ```bash
34
35
  poetry run cr_proc <path-to-jsonl-file> <path-to-template-file>
@@ -36,7 +37,8 @@ poetry run cr_proc <path-to-jsonl-file> <path-to-template-file>
36
37
 
37
38
  ### Batch Processing
38
39
 
39
- You can process multiple recording files at once (e.g., for different students' submissions):
40
+ You can process multiple recording files at once (e.g., for different students'
41
+ submissions):
40
42
 
41
43
  ```bash
42
44
  # Process multiple files
@@ -47,9 +49,11 @@ poetry run cr_proc recordings/*.jsonl.gz template.py
47
49
  ```
48
50
 
49
51
  When processing multiple files:
52
+
50
53
  - Each recording is processed independently (for different students/documents)
51
54
  - Time calculations and verification are done separately for each file
52
- - A combined time report is shown at the end summarizing total editing time across all recordings
55
+ - A combined time report is shown at the end summarizing total editing time
56
+ across all recordings
53
57
  - Results can be output to individual files using `--output-dir`
54
58
 
55
59
  ### Arguments
@@ -61,24 +65,26 @@ When processing multiple files:
61
65
 
62
66
  ### Options
63
67
 
64
- - `-t, --time-limit MINUTES`: (Optional) Maximum allowed time in minutes between the
65
- first and last edit in the recording. Applied individually to each recording file and
66
- also to the combined total in batch mode. If the elapsed time exceeds this limit, the
67
- recording is flagged as suspicious.
68
- - `-d, --document DOCUMENT`: (Optional) Document path or filename to process from the
69
- recording. Defaults to the document whose extension matches the template file.
70
- - `-o, --output-json OUTPUT_JSON`: (Optional) Path to output JSON file with verification
71
- results (time info and suspicious events). In batch mode, creates a single JSON file
72
- containing all recordings plus the combined time report.
73
- - `-f, --output-file OUTPUT_FILE`: (Optional) Write reconstructed code to specified file
74
- instead of stdout. For single files only.
75
- - `--output-dir OUTPUT_DIR`: (Optional) Directory to write reconstructed code files in
76
- batch mode. Files are named based on input recording filenames.
77
- - `-s, --show-autocomplete-details`: (Optional) Show individual auto-complete events in
78
- addition to aggregate statistics.
79
- - `-p, --playback`: (Optional) Play back the recording in real-time, showing code evolution.
80
- - `--playback-speed SPEED`: (Optional) Playback speed multiplier (1.0 = real-time, 2.0 = 2x
81
- speed, 0.5 = half speed).
68
+ - `-t, --time-limit MINUTES`: (Optional) Maximum allowed time in minutes between
69
+ the first and last edit in the recording. Applied individually to each
70
+ recording file and also to the combined total in batch mode. If the elapsed
71
+ time exceeds this limit, the recording is flagged as suspicious.
72
+ - `-d, --document DOCUMENT`: (Optional) Document path or filename to process
73
+ from the recording. Defaults to the document whose extension matches the
74
+ template file.
75
+ - `-o, --output-json OUTPUT_JSON`: (Optional) Path to output JSON file with
76
+ verification results (time info and suspicious events). In batch mode, creates
77
+ a single JSON file containing all recordings plus the combined time report.
78
+ - `-f, --output-file OUTPUT_FILE`: (Optional) Write reconstructed code to
79
+ specified file instead of stdout. For single files only.
80
+ - `--output-dir OUTPUT_DIR`: (Optional) Directory to write reconstructed code
81
+ files in batch mode. Files are named based on input recording filenames.
82
+ - `-s, --show-autocomplete-details`: (Optional) Show individual auto-complete
83
+ events in addition to aggregate statistics.
84
+ - `-p, --playback`: (Optional) Play back the recording in real-time, showing
85
+ code evolution.
86
+ - `--playback-speed SPEED`: (Optional) Playback speed multiplier (1.0 =
87
+ real-time, 2.0 = 2x speed, 0.5 = half speed).
82
88
 
83
89
  ### Examples
84
90
 
@@ -106,7 +112,8 @@ Save JSON results:
106
112
  poetry run cr_proc student1.jsonl.gz student2.jsonl.gz template.py -o results/
107
113
  ```
108
114
 
109
- This will process each recording independently and flag any that exceed 30 minutes.
115
+ This will process each recording independently and flag any that exceed 30
116
+ minutes.
110
117
 
111
118
  The processor will:
112
119
 
@@ -118,8 +125,9 @@ The processor will:
118
125
 
119
126
  ### Output
120
127
 
121
- Reconstructed code files are written to disk using `-f/--output-file` (single file)
122
- or `--output-dir` (batch mode). The processor does not output reconstructed code to stdout.
128
+ Reconstructed code files are written to disk using `-f/--output-file` (single
129
+ file) or `--output-dir` (batch mode). The processor does not output
130
+ reconstructed code to stdout.
123
131
 
124
132
  Verification information, warnings, and errors are printed to stderr, including:
125
133
 
@@ -133,8 +141,8 @@ Verification information, warnings, and errors are printed to stderr, including:
133
141
 
134
142
  ### Suspicious Activity Detection
135
143
 
136
- The processor automatically detects and reports three types of suspicious activity
137
- patterns:
144
+ The processor automatically detects and reports three types of suspicious
145
+ activity patterns:
138
146
 
139
147
  #### 1. Time Limit Exceeded
140
148
 
@@ -142,8 +150,8 @@ When the `--time-limit` flag is specified, the processor flags recordings where
142
150
  the elapsed time between the first and last edit exceeds the specified limit.
143
151
  This can indicate unusually long work sessions or potential external assistance.
144
152
 
145
- Each recording file is checked independently against the time limit. In batch mode,
146
- the combined total time is also checked against the limit.
153
+ Each recording file is checked independently against the time limit. In batch
154
+ mode, the combined total time is also checked against the limit.
147
155
 
148
156
  **Example warning (single file):**
149
157
 
@@ -199,12 +207,14 @@ Events #42-#44 (rapid one-line pastes (AI indicator)): 3 lines, 89 chars
199
207
 
200
208
  ### JSON Output Format
201
209
 
202
- The `--output-json` flag generates JSON files with verification results using a consistent format
203
- for both single file and batch modes, making it easier for tooling to consume.
210
+ The `--output-json` flag generates JSON files with verification results using a
211
+ consistent format for both single file and batch modes, making it easier for
212
+ tooling to consume.
204
213
 
205
214
  #### JSON Structure
206
215
 
207
216
  All JSON output follows this unified format:
217
+
208
218
  - `batch_mode`: Boolean indicating if multiple files were processed
209
219
  - `total_files`: Number of files processed
210
220
  - `verified_count`: How many files passed verification
@@ -219,6 +229,7 @@ All JSON output follows this unified format:
219
229
  - `files`: Array of individual results for each recording
220
230
 
221
231
  **Single file example:**
232
+
222
233
  ```json
223
234
  {
224
235
  "batch_mode": false,
@@ -244,6 +255,7 @@ All JSON output follows this unified format:
244
255
  ```
245
256
 
246
257
  **Batch file example:**
258
+
247
259
  ```json
248
260
  {
249
261
  "batch_mode": true,
@@ -1,13 +1,13 @@
1
1
  code_recorder_processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- code_recorder_processor/api/build.py,sha256=tljtuEFH-ZU-hSFYmlAMSY61W-DSptQo_D5-GjAasco,7951
3
- code_recorder_processor/api/document.py,sha256=mBvATBZs8yyCY_nDOX2qhw0Gp1mmwI3PgOAzFgHUiSY,9486
2
+ code_recorder_processor/api/build.py,sha256=XuF8Vx9mDdRqeaxCVgYAdn4NFJzkRt4Q839m15th0Fo,9908
3
+ code_recorder_processor/api/document.py,sha256=DOQ0H1dQJtMs2P9E2qnKgg2iKQT9msgdE9oJXl36SnY,10622
4
4
  code_recorder_processor/api/load.py,sha256=Br-USpFQJ6W8c5hjmCnunM3V0_MURKZp5Yyl1IJdahc,5514
5
5
  code_recorder_processor/api/output.py,sha256=H2SC3pQ0C9V8YyN4yeA_KmvSoWXy_3T3TKWKhywIax4,2161
6
6
  code_recorder_processor/api/verify.py,sha256=9GpeoFQIiTzZd-DNSyN5OUM6YB5iMslO85oAjc0yoSU,34073
7
7
  code_recorder_processor/cli.py,sha256=ardcM3bLNhf6abOQ1Aj746x4hp8gerdklfDwszLlYKc,20504
8
8
  code_recorder_processor/display.py,sha256=IVTNFB3Vjzpc5ZHceAFQI2-o-N6bvjYmotLDaEy0KoU,7368
9
9
  code_recorder_processor/playback.py,sha256=6-OJtQOHKgfutxUNBMunWl-VVSIB0zUDENSl0EsPCh4,4008
10
- cr_proc-0.1.9.dist-info/METADATA,sha256=3yqgqvpe1juNoinP6Xn59UiowZen06mgFTh1eG2ZC8M,8915
11
- cr_proc-0.1.9.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
12
- cr_proc-0.1.9.dist-info/entry_points.txt,sha256=xb5dPAAWN1Z9NUHpvZgNakaslR1MVOERf_IfpG_M04M,77
13
- cr_proc-0.1.9.dist-info/RECORD,,
10
+ cr_proc-0.1.11.dist-info/METADATA,sha256=wZuAW9ghrjT2fCbiI9bJSy5TPLc4YD6OpYb0mTlyOL4,8926
11
+ cr_proc-0.1.11.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
12
+ cr_proc-0.1.11.dist-info/entry_points.txt,sha256=xb5dPAAWN1Z9NUHpvZgNakaslR1MVOERf_IfpG_M04M,77
13
+ cr_proc-0.1.11.dist-info/RECORD,,