cr-proc 0.1.2-py3-none-any.whl → 0.1.5-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -65,12 +65,29 @@ def load_jsonl(file: Path) -> tuple[dict[str, Any], ...]:
 
     if data is None:
         # If gzip stream is broken, attempt a lenient zlib decompress to salvage content.
+        # Handle multiple concatenated gzip streams (common in recordings)
         try:
             raw = file.read_bytes()
-            dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
-            text_bytes = dobj.decompress(raw) + dobj.flush()
-            text = text_bytes.decode("utf-8", errors="replace")
-            data = _load_jsonl(StringIO(text))
+            all_text = ""
+            remaining = raw
+
+            # Decompress all concatenated gzip streams
+            while remaining:
+                dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+                try:
+                    text_bytes = dobj.decompress(remaining) + dobj.flush()
+                    all_text += text_bytes.decode("utf-8", errors="replace")
+                    remaining = dobj.unused_data
+                    if not text_bytes or not remaining:
+                        break
+                except Exception:
+                    # If decompression fails, try to salvage what we have
+                    break
+
+            if all_text:
+                data = _load_jsonl(StringIO(all_text))
+            else:
+                data = None
        except Exception:
            data = None
 
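
For context, the loop above handles gzip files that contain several members back to back, which the comment says is common in recordings (an appending recorder produces exactly this). A minimal standalone sketch of the pattern, with invented payloads:

```python
import gzip
import zlib

# Two gzip members written back to back, as an appending recorder would produce.
blob = gzip.compress(b'{"a": 1}\n') + gzip.compress(b'{"b": 2}\n')

def decompress_members(raw: bytes) -> str:
    """Decompress every concatenated gzip member in raw, salvaging what we can."""
    out, remaining = "", raw
    while remaining:
        # wbits = 16 + MAX_WBITS tells zlib to expect a gzip wrapper
        dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
        chunk = dobj.decompress(remaining) + dobj.flush()
        out += chunk.decode("utf-8", errors="replace")
        remaining = dobj.unused_data  # bytes after this member, i.e. the next stream
        if not chunk or not remaining:
            break
    return out

print(decompress_members(blob))  # recovers both JSON lines
```

A plain `gzip.decompress` would also accept concatenated members, but the `zlib` object gives the lenient, salvage-what-you-can control the hunk needs on truncated streams.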
@@ -113,20 +113,84 @@ def template_diff(template: str, jsonData: tuple[dict[str, Any], ...]) -> str:
     return "".join(diff_iter)
 
 
-def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
+def _build_document_states(jsonData: tuple[dict[str, Any], ...]) -> tuple[list[str], set[str]]:
+    """
+    Build complete document state at each event and a whitelist of all content seen.
+
+    Reconstructs the document after each keystroke/edit to track what content
+    existed in the document at each point in time. This allows detectors to
+    check if pasted/autocompleted content already existed in the document.
+
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data from the JSONL file
+
+    Returns
+    -------
+    tuple[list[str], set[str]]
+        - List of document states (one per event, strings of full document content)
+        - Set of all content fragments ever seen (whitelist for internal copy detection)
+    """
+    document_states = []
+    content_whitelist = set()
+    current_state = ""
+
+    for idx, event in enumerate(jsonData):
+        old_frag = _normalize_newlines(event.get("oldFragment", ""))
+        new_frag = _normalize_newlines(event.get("newFragment", ""))
+        offset = event.get("offset", 0)
+
+        # First event is the initial snapshot (template)
+        if idx == 0:
+            current_state = new_frag
+        elif new_frag != old_frag:
+            # Apply the edit to reconstruct document state
+            current_state = current_state[:offset] + new_frag + current_state[offset + len(old_frag):]
+
+        document_states.append(current_state)
+
+        # Build whitelist of all content fragments seen
+        # Add both old and new fragments to whitelist for comprehensive coverage
+        if len(old_frag) > 10:  # Ignore tiny fragments
+            content_whitelist.add(old_frag)
+        if len(new_frag) > 10:
+            content_whitelist.add(new_frag)
+
+        # Also add the full document state to whitelist
+        if len(current_state) > 10:
+            content_whitelist.add(current_state)
+
+    return document_states, content_whitelist
+
+
+def _detect_multiline_external_pastes(
+    jsonData: tuple[dict[str, Any], ...],
+    document_states: list[str],
+    content_whitelist: set[str]
+) -> list[dict[str, Any]]:
     """
     Detect multi-line copy-paste events from external sources.
 
     Flags newFragments that are significant in length (more than one line)
     and do not appear to be copied from within the document itself.
 
-    Returns a list of suspicious multi-line paste events.
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data
+    document_states : list[str]
+        Full document state at each event
+    content_whitelist : set[str]
+        All content fragments ever seen in the document (for internal copy detection)
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        List of suspicious multi-line paste events.
     """
     suspicious_events = []
 
-    # Build a history of all document content seen so far
-    document_history = set()
-
     for idx, event in enumerate(jsonData):
         old_frag = _normalize_newlines(event.get("oldFragment", ""))
         new_frag = _normalize_newlines(event.get("newFragment", ""))
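
The splice in `_build_document_states` is standard offset-based edit application. A toy run with hypothetical events (the `offset`/`oldFragment`/`newFragment` shapes mirror the fields read above):

```python
# Hypothetical three-event recording: initial snapshot, an insert, a replacement.
events = [
    {"offset": 0, "oldFragment": "x = 1\n", "newFragment": "x = 1\n"},  # snapshot
    {"offset": 6, "oldFragment": "", "newFragment": "y = 2\n"},         # insert at end
    {"offset": 0, "oldFragment": "x = 1", "newFragment": "x = 10"},     # replace first line
]

state = ""
for idx, ev in enumerate(events):
    old, new, off = ev["oldFragment"], ev["newFragment"], ev["offset"]
    if idx == 0:
        state = new  # first event carries the full template
    elif new != old:
        state = state[:off] + new + state[off + len(old):]  # splice the edit in

print(state)  # "x = 10\ny = 2\n"
```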
@@ -140,32 +204,39 @@ def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> l
         if len(new_lines) <= 2:  # Single line or line + empty
             continue
 
-        # Check if the new content appears to be from within the document
+        # Check if the new content already existed in the document at any prior point
         is_internal_copy = False
 
-        # Check if new_frag content was present in any previous fragments
-        for hist_content in document_history:
-            # Ignore tiny fragments; they appear everywhere and cause false positives
-            if len(hist_content) < 20:
-                continue
+        # Check against document state BEFORE this event
+        if idx > 0:
+            prior_state = document_states[idx - 1]
+            if new_frag in prior_state:
+                is_internal_copy = True
+
+        # Also check against whitelist of all content seen
+        if not is_internal_copy:
+            for hist_content in content_whitelist:
+                # Ignore tiny fragments
+                if len(hist_content) < 20:
+                    continue
 
-            # Require substantial overlap in size to count as an internal copy
-            similar_length = (
-                len(hist_content) >= 0.8 * len(new_frag)
-                and len(hist_content) <= 1.25 * len(new_frag)
-            )
+                # Require substantial overlap in size to count as an internal copy
+                similar_length = (
+                    len(hist_content) >= 0.8 * len(new_frag)
+                    and len(hist_content) <= 1.25 * len(new_frag)
+                )
 
-            if new_frag == hist_content:
-                is_internal_copy = True
-                break
+                if new_frag == hist_content:
+                    is_internal_copy = True
+                    break
 
-            if new_frag in hist_content and similar_length:
-                is_internal_copy = True
-                break
+                if new_frag in hist_content and similar_length:
+                    is_internal_copy = True
+                    break
 
-            if hist_content in new_frag and similar_length:
-                is_internal_copy = True
-                break
+                if hist_content in new_frag and similar_length:
+                    is_internal_copy = True
+                    break
 
         # Also check if it's in the old fragment (internal move/copy)
         if not is_internal_copy and old_frag and (new_frag in old_frag or old_frag in new_frag):
@@ -177,15 +248,9 @@ def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> l
                 "line_count": len(new_lines),
                 "char_count": len(new_frag),
                 "reason": "multi-line external paste",
-                "newFragment": new_frag[:100] + ("..." if len(new_frag) > 100 else ""),
+                "newFragment": new_frag
            })
 
-        # Update history after analysis so the current fragment cannot mask itself
-        if len(old_frag) > 1:
-            document_history.add(old_frag)
-        if len(new_frag) > 1:
-            document_history.add(new_frag)
-
     return suspicious_events
 
 
@@ -262,6 +327,119 @@ def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[
     return suspicious_events
 
 
+def _detect_fullline_autocomplete(
+    jsonData: tuple[dict[str, Any], ...],
+    document_states: list[str],
+    content_whitelist: set[str],
+    excluded_indices: set[int]
+) -> list[dict[str, Any]]:
+    """
+    Detect full-line auto-complete events where the IDE/AI completes code.
+
+    At keystroke level, events show:
+    - Normal typing: oldFragment="" (empty), newFragment="X" (1 char)
+    - Auto-complete: oldFragment="" (empty), newFragment="long_text" (10+ chars)
+
+    Auto-complete is detected when:
+    - oldFragment is empty or very short (0-3 chars)
+    - newFragment is substantial (10+ characters)
+    - newFragment contains code structure (assignment, parens, brackets, etc.)
+    - newFragment does NOT already exist in the document state
+    - Event not already flagged as external copy-paste
+
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data
+    document_states : list[str]
+        Full document state at each event
+    content_whitelist : set[str]
+        All content fragments ever seen in the document
+    excluded_indices : set[int]
+        Set of event indices already flagged by other detectors (to avoid double-flagging)
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        List of suspected auto-complete events.
+    """
+    suspicious_events = []
+
+    for idx, event in enumerate(jsonData):
+        # Skip if already flagged by another detector
+        if idx in excluded_indices:
+            continue
+
+        old_frag = _normalize_newlines(event.get("oldFragment", ""))
+        new_frag = _normalize_newlines(event.get("newFragment", ""))
+
+        # Skip first event (template) and no-change events
+        if idx == 0 or new_frag == old_frag:
+            continue
+
+        old_len = len(old_frag)
+        new_len = len(new_frag)
+
+        # At keystroke level, oldFragment is typically empty for insertions
+        # Allow up to 3 chars for prefix-based autocomplete triggers
+        if old_len > 3:
+            continue
+
+        # Skip single-character additions (normal typing)
+        # Auto-complete typically adds 10+ characters at once
+        if new_len < 10:
+            continue
+
+        # Skip large multi-line pastes - those should be caught by multi-line paste detector
+        # Auto-complete is typically 1-2 lines and under 100 chars
+        # Anything larger is likely external copy-paste, not auto-complete
+        new_lines = new_frag.split("\n")
+        if len(new_lines) > 2 or new_len > 100:
+            continue
+
+        # The new fragment should not be just whitespace
+        if not new_frag.strip():
+            continue
+
+        # Check if the new fragment contains code structure indicators
+        # These strongly suggest IDE/AI auto-completion of code
+        code_indicators = [
+            "=",   # Assignment (most common in autocomplete)
+            "(",   # Function call/definition
+            ")",   # Closing paren
+            ":",   # Block statement (if, for, def, etc.)
+            "{",   # Dictionary/block
+            "}",   # Closing brace
+            "[",   # List/index
+            "]",   # Closing bracket
+            "=>",  # Arrow function
+            ";",   # Statement end
+        ]
+
+        has_code_structure = any(indicator in new_frag for indicator in code_indicators)
+
+        # Must have code structure to be considered auto-complete
+        if has_code_structure:
+            # Check if this content already existed in the document state BEFORE this event
+            is_internal_copy = False
+
+            if idx > 0:
+                prior_state = document_states[idx - 1]
+                if new_frag in prior_state:
+                    is_internal_copy = True
+
+            if not is_internal_copy:
+                suspicious_events.append({
+                    "event_index": idx,
+                    "line_count": len(new_lines),
+                    "char_count": new_len,
+                    "reason": "full-line auto-complete",
+                    "newFragment": new_frag,
+                })
+
+    return suspicious_events
+
+
 def detect_external_copypaste(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
     """
     Detect copy-paste events from external sources and AI-assisted coding patterns.
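
Stripped of the document-state and exclusion checks, the size/shape heuristic above can be exercised on its own. A small sketch with made-up events (`looks_like_autocomplete` is a hypothetical helper, not part of the package):

```python
# Made-up keystroke events: a single keypress vs. a completion-sized insert.
events = [
    {"oldFragment": "", "newFragment": "x"},                       # normal typing
    {"oldFragment": "", "newFragment": "result = compute(a, b)"},  # code-shaped, 22 chars
]

CODE_INDICATORS = ("=", "(", ")", ":", "{", "}", "[", "]", "=>", ";")

def looks_like_autocomplete(old: str, new: str) -> bool:
    return (
        len(old) <= 3                  # insertion, or a short prefix being replaced
        and 10 <= len(new) <= 100      # bigger than a keystroke, smaller than a paste
        and len(new.split("\n")) <= 2  # at most two lines
        and bool(new.strip())          # not pure whitespace
        and any(ind in new for ind in CODE_INDICATORS)
    )

for ev in events:
    print(looks_like_autocomplete(ev["oldFragment"], ev["newFragment"]))
# False, then True
```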
@@ -269,16 +447,64 @@ def detect_external_copypaste(jsonData: tuple[dict[str, Any], ...]) -> list[dict
     Combines detection of:
     1. Multi-line external paste events (content not from within document)
     2. Rapid one-line paste sequences (potential AI assistance indicator)
+    3. Full-line auto-complete events (user types, AI completes the line)
+
+    Detection order matters: events flagged by earlier detectors are excluded
+    from later detectors to avoid double-flagging.
 
-    Returns a list of all suspicious events with metadata.
+    Returns a list of all suspicious events with metadata, including aggregate statistics.
     """
     suspicious_events = []
 
-    # Detect multi-line external pastes
-    suspicious_events.extend(_detect_multiline_external_pastes(jsonData))
+    # Build shared document state tracking
+    # This reconstructs the full document at each event and creates a whitelist
+    # of all content that has ever appeared in the document
+    document_states, content_whitelist = _build_document_states(jsonData)
+
+    # Step 1: Detect multi-line external pastes
+    multiline_events = _detect_multiline_external_pastes(jsonData, document_states, content_whitelist)
+    suspicious_events.extend(multiline_events)
+
+    # Step 2: Detect rapid one-line paste sequences (AI indicator)
+    rapid_paste_events = _detect_rapid_paste_sequences(jsonData)
+    suspicious_events.extend(rapid_paste_events)
+
+    # Build set of all event indices already flagged
+    excluded_indices = set()
+    for event in multiline_events:
+        # Handle both single events and clusters
+        if "event_indices" in event:
+            excluded_indices.update(event["event_indices"])
+        else:
+            excluded_indices.add(event["event_index"])
+
+    for event in rapid_paste_events:
+        if "event_indices" in event:
+            excluded_indices.update(event["event_indices"])
+        else:
+            excluded_indices.add(event["event_index"])
+
+    # Step 3: Detect full-line auto-complete events (excluding already-flagged events)
+    autocomplete_events = _detect_fullline_autocomplete(
+        jsonData, document_states, content_whitelist, excluded_indices
+    )
 
-    # Detect rapid one-line paste sequences (AI indicator)
-    suspicious_events.extend(_detect_rapid_paste_sequences(jsonData))
+    # Calculate aggregate statistics for auto-complete/small paste events
+    # Store individual events for optional detailed review, but don't report them by default
+    if autocomplete_events:
+        total_autocomplete_chars = sum(ev["char_count"] for ev in autocomplete_events)
+        total_autocomplete_events = len(autocomplete_events)
+
+        # Always add aggregate summary, never individual events
+        # Store individual events in the aggregate for optional detailed review
+        suspicious_events.append({
+            "event_index": -1,  # Special marker for aggregate
+            "event_count": total_autocomplete_events,
+            "total_chars": total_autocomplete_chars,
+            "reason": "aggregate auto-complete/small paste activity",
+            "newFragment": f"{total_autocomplete_events} auto-complete events ({total_autocomplete_chars} total chars)",
+            "detailed_events": autocomplete_events,  # Store for optional review
+        })
 
     return suspicious_events
 
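
The two index-collection loops above could be factored into one helper; a sketch of the bookkeeping they perform (`flagged_indices` is a hypothetical name, not part of the package):

```python
def flagged_indices(events: list[dict]) -> set[int]:
    """Collect event indices from both single flags and clustered flags."""
    out: set[int] = set()
    for ev in events:
        if "event_indices" in ev:  # cluster, e.g. a rapid paste sequence
            out.update(ev["event_indices"])
        else:
            out.add(ev["event_index"])
    return out

print(flagged_indices([
    {"event_index": 4},
    {"event_index": 7, "event_indices": [7, 8, 9]},
]))
# {4, 7, 8, 9}
```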
@@ -311,12 +537,15 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
     def parse_ts(ts_str: str) -> datetime:
         return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
 
-    # Identify session boundaries: sessions start at indices where oldFragment == newFragment (non-empty)
+    # Identify session boundaries: sessions start at indices where offset == 0
+    # (indicating file reopen/recording restart) and oldFragment == newFragment (initial snapshot)
     session_starts = [0]  # First session always starts at index 0
     for idx in range(1, len(jsonData)):
+        offset = jsonData[idx].get("offset", -1)
         old_frag = jsonData[idx].get("oldFragment", "")
         new_frag = jsonData[idx].get("newFragment", "")
-        if old_frag == new_frag and old_frag.strip() != "":
+        # Session boundary: offset is 0 and it's an initial snapshot (old == new, non-empty)
+        if offset == 0 and old_frag == new_frag and old_frag.strip() != "":
            session_starts.append(idx)
 
     # Add sentinel to mark end of last session
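
A quick illustration of the tightened boundary rule on a hypothetical recording that restarts once (event shapes are invented for the example):

```python
# Event 3 re-snapshots the file at offset 0, marking a second session.
events = [
    {"offset": 0, "oldFragment": "tpl", "newFragment": "tpl"},      # session 1 snapshot
    {"offset": 3, "oldFragment": "", "newFragment": "x"},
    {"offset": 4, "oldFragment": "", "newFragment": "y"},
    {"offset": 0, "oldFragment": "tplxy", "newFragment": "tplxy"},  # session 2 snapshot
    {"offset": 5, "oldFragment": "", "newFragment": "z"},
]

session_starts = [0]
for idx in range(1, len(events)):
    e = events[idx]
    if e["offset"] == 0 and e["oldFragment"] == e["newFragment"] and e["oldFragment"].strip():
        session_starts.append(idx)

print(session_starts)  # [0, 3]
```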
@@ -344,25 +573,26 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
         session_end = session_starts[i + 1]
 
         # Find first and last events with timestamps in this session
-        first_event = None
-        last_event = None
-
-        for event in jsonData[session_start:session_end]:
-            if event.get("timestamp"):
-                if first_event is None:
-                    first_event = event
-                last_event = event
+        first_event_time = None
+        last_event_time = None
+
+        for idx in range(session_start, session_end):
+            event = jsonData[idx]
+            timestamp = event.get("timestamp")
+            if timestamp:
+                try:
+                    event_time = parse_ts(timestamp)
+                    if first_event_time is None:
+                        first_event_time = event_time
+                    last_event_time = event_time
+                except (ValueError, KeyError):
+                    # Skip events with invalid timestamps
+                    continue
 
         # If this session has timestamped events, add its elapsed time
-        if first_event is not None and last_event is not None:
-            try:
-                first_time = parse_ts(first_event["timestamp"])
-                last_time = parse_ts(last_event["timestamp"])
-                session_diff = last_time - first_time
-                total_minutes_elapsed += session_diff.total_seconds() / 60
-            except (ValueError, KeyError):
-                # Timestamp parsing failed for this session, skip it
-                continue
+        if first_event_time is not None and last_event_time is not None:
+            session_diff = last_event_time - first_event_time
+            total_minutes_elapsed += session_diff.total_seconds() / 60
 
     # For time limit check, use the span from first to last timestamp overall
     try:
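
The per-session summation means idle gaps between sessions do not count toward elapsed time. A condensed sketch of the arithmetic with invented timestamps:

```python
from datetime import datetime

def parse_ts(ts: str) -> datetime:
    return datetime.fromisoformat(ts.replace("Z", "+00:00"))

# Two sessions: 10 minutes of editing, a 4-hour break, then 5 more minutes.
sessions = [
    ("2024-01-01T10:00:00Z", "2024-01-01T10:10:00Z"),
    ("2024-01-01T14:00:00Z", "2024-01-01T14:05:00Z"),
]

total_minutes = sum(
    (parse_ts(last) - parse_ts(first)).total_seconds() / 60
    for first, last in sessions
)
print(total_minutes)  # 15.0 — the gap between sessions is not counted
```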
@@ -1,14 +1,280 @@
 import argparse
-import sys
 import json
+import sys
 from datetime import datetime
 from pathlib import Path
-from .api.load import load_jsonl
-from .api.verify import verify, template_diff, check_time_limit
+from typing import Any
+
 from .api.build import reconstruct_file_from_events
+from .api.load import load_jsonl
+from .api.verify import check_time_limit, template_diff, verify
+
+
+def resolve_document(
+    docs: list[str], template_path: Path, override: str | None
+) -> str | None:
+    """
+    Determine which document from the recording to process.
+
+    Parameters
+    ----------
+    docs : list[str]
+        List of document paths found in the recording
+    template_path : Path
+        Path to the template file
+    override : str | None
+        Explicit document name or path override
+
+    Returns
+    -------
+    str | None
+        The resolved document path, or None if no documents exist
+
+    Raises
+    ------
+    ValueError
+        If document resolution is ambiguous or the override doesn't match
+    """
+    if not docs:
+        return None
+
+    if override:
+        matches = [
+            d for d in docs if d.endswith(override) or Path(d).name == override
+        ]
+        if not matches:
+            raise ValueError(
+                f"No document in recording matches '{override}'. Available: {docs}"
+            )
+        if len(matches) > 1:
+            raise ValueError(
+                f"Ambiguous document override '{override}'. Matches: {matches}"
+            )
+        return matches[0]
+
+    template_ext = template_path.suffix
+    ext_matches = [d for d in docs if Path(d).suffix == template_ext]
+    if len(ext_matches) == 1:
+        return ext_matches[0]
+    if len(ext_matches) > 1:
+        raise ValueError(
+            f"Multiple documents share extension '{template_ext}': {ext_matches}. "
+            "Use --document to choose one."
+        )
+
+    if len(docs) == 1:
+        return docs[0]
+
+    raise ValueError(
+        "Could not determine document to process. Use --document to select one. "
+        f"Available documents: {docs}"
+    )
+
+
+def get_recorded_documents(events: tuple[dict[str, Any], ...]) -> list[str]:
+    """
+    Extract unique document paths from recording events.
+
+    Parameters
+    ----------
+    events : tuple[dict[str, Any], ...]
+        Recording events loaded from JSONL
+
+    Returns
+    -------
+    list[str]
+        Sorted list of unique document paths
+    """
+    documents = {
+        e.get("document")
+        for e in events
+        if "document" in e and e.get("document") is not None
+    }
+    return sorted([d for d in documents if d is not None])
+
+
+def filter_events_by_document(
+    events: tuple[dict[str, Any], ...], document: str | None
+) -> tuple[dict[str, Any], ...]:
+    """
+    Filter events to only those for a specific document.
+
+    Parameters
+    ----------
+    events : tuple[dict[str, Any], ...]
+        All recording events
+    document : str | None
+        Document path to filter by, or None to return all events
+
+    Returns
+    -------
+    tuple[dict[str, Any], ...]
+        Filtered events
+    """
+    if document:
+        return tuple(e for e in events if e.get("document") == document)
+    return events
+
+
+def display_time_info(time_info: dict[str, Any] | None) -> None:
+    """
+    Display elapsed time and time limit information.
+
+    Parameters
+    ----------
+    time_info : dict[str, Any] | None
+        Time information from check_time_limit, or None if no time data
+    """
+    if not time_info:
+        return
+
+    print(
+        f"Elapsed editing time: {time_info['minutes_elapsed']} minutes",
+        file=sys.stderr,
+    )
+
+    first_ts = datetime.fromisoformat(
+        time_info["first_timestamp"].replace("Z", "+00:00")
+    )
+    last_ts = datetime.fromisoformat(
+        time_info["last_timestamp"].replace("Z", "+00:00")
+    )
+    time_span = (last_ts - first_ts).total_seconds() / 60
+
+    print(f"Time span (first to last edit): {time_span:.2f} minutes", file=sys.stderr)
+
+    if time_info["exceeds_limit"]:
+        print("\nTime limit exceeded!", file=sys.stderr)
+        print(f"  Limit: {time_info['time_limit_minutes']} minutes", file=sys.stderr)
+        print(f"  First edit: {time_info['first_timestamp']}", file=sys.stderr)
+        print(f"  Last edit: {time_info['last_timestamp']}", file=sys.stderr)
+
+
+def display_suspicious_event(event: dict[str, Any], show_details: bool) -> None:
+    """
+    Display a single suspicious event.
+
+    Parameters
+    ----------
+    event : dict[str, Any]
+        Suspicious event data
+    show_details : bool
+        Whether to show detailed autocomplete events
+    """
+    reason = event.get("reason", "unknown")
+
+    # Handle aggregate auto-complete events
+    if event.get("event_index") == -1 and "detailed_events" in event:
+        event_count = event["event_count"]
+        total_chars = event["total_chars"]
+        print(
+            f"  Aggregate: {event_count} auto-complete/small paste events "
+            f"({total_chars} total chars)",
+            file=sys.stderr,
+        )
+
+        if show_details:
+            print("  Detailed events:", file=sys.stderr)
+            for detail in event["detailed_events"]:
+                detail_idx = detail["event_index"]
+                detail_lines = detail["line_count"]
+                detail_chars = detail["char_count"]
+                detail_frag = detail["newFragment"]
+                print(
+                    f"    Event #{detail_idx}: {detail_lines} lines, "
+                    f"{detail_chars} chars",
+                    file=sys.stderr,
+                )
+                print("    ```", file=sys.stderr)
+                for line in detail_frag.split("\n"):
+                    print(f"    {line}", file=sys.stderr)
+                print("    ```", file=sys.stderr)
+
+    elif "event_indices" in event:
+        indices = event.get("event_indices", [event["event_index"]])
+        print(
+            f"  Events #{indices[0]}-#{indices[-1]} ({reason}): "
+            f"{event['line_count']} lines, {event['char_count']} chars",
+            file=sys.stderr,
+        )
+
+    else:
+        new_fragment = event["newFragment"].replace("\n", "\n  ")
+        print(
+            f"  Event #{event['event_index']} ({reason}): "
+            f"{event['line_count']} lines, {event['char_count']} chars - "
+            f"newFragment:\n  ```\n  {new_fragment}\n  ```",
+            file=sys.stderr,
+        )
+
+
+def display_suspicious_events(
+    suspicious_events: list[dict[str, Any]], show_details: bool
+) -> None:
+    """
+    Display all suspicious events or success message.
+
+    Parameters
+    ----------
+    suspicious_events : list[dict[str, Any]]
+        List of suspicious events detected
+    show_details : bool
+        Whether to show detailed autocomplete events
+    """
+    if suspicious_events:
+        print("\nSuspicious copy-paste events detected:", file=sys.stderr)
+        for event in suspicious_events:
+            display_suspicious_event(event, show_details)
+    else:
+        print("Success! No suspicious events detected.", file=sys.stderr)
 
 
-def main():
+def write_json_output(
+    output_path: Path,
+    document: str,
+    time_info: dict[str, Any] | None,
+    suspicious_events: list[dict[str, Any]],
+) -> None:
+    """
+    Write verification results to JSON file.
+
+    Parameters
+    ----------
+    output_path : Path
+        Path to output JSON file
+    document : str
+        Document that was processed
+    time_info : dict[str, Any] | None
+        Time information from verification
+    suspicious_events : list[dict[str, Any]]
+        List of suspicious events detected
+
+    Raises
+    ------
+    Exception
+        If file writing fails
+    """
+    results = {
+        "document": document,
+        "time_info": time_info,
+        "suspicious_events": suspicious_events,
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"Results written to {output_path}", file=sys.stderr)
+
+
+def create_parser() -> argparse.ArgumentParser:
+    """
+    Create and configure the argument parser.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Configured argument parser
+    """
     parser = argparse.ArgumentParser(
         description="Process and verify code recorder JSONL files"
     )
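
The resolution order in `resolve_document` is: explicit override first, then a unique template-extension match, then the sole document, with a `ValueError` otherwise. A toy walk-through of the extension rule (paths are invented):

```python
from pathlib import Path

docs = ["src/main.py", "notes/README.md"]  # documents seen in a recording
template_ext = Path("template.py").suffix  # ".py"

ext_matches = [d for d in docs if Path(d).suffix == template_ext]
print(ext_matches)  # ['src/main.py'] — unique match, so it is chosen automatically
```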
@@ -23,30 +289,54 @@ def main():
         help="Path to the initial template file that was recorded",
     )
     parser.add_argument(
+        "-t",
         "--time-limit",
         type=int,
         default=None,
-        help="Maximum allowed time in minutes between first and last edit. If exceeded, recording is flagged.",
+        help="Maximum allowed time in minutes between first and last edit. "
+        "If exceeded, recording is flagged.",
     )
     parser.add_argument(
+        "-d",
         "--document",
         type=str,
         default=None,
-        help=("Document path or filename to process from the recording. "
-              "Defaults to the document whose extension matches the template file."),
+        help="Document path or filename to process from the recording. "
+        "Defaults to the document whose extension matches the template file.",
     )
     parser.add_argument(
+        "-o",
         "--output-json",
         type=Path,
         default=None,
-        help="Path to output JSON file with verification results (time info and suspicious events).",
+        help="Path to output JSON file with verification results "
+        "(time info and suspicious events).",
     )
+    parser.add_argument(
+        "-s",
+        "--show-autocomplete-details",
+        action="store_true",
+        help="Show individual auto-complete events in addition to "
+        "aggregate statistics",
+    )
+    return parser
+
 
+def main() -> int:
+    """
+    Main entry point for the CLI application.
+
+    Returns
+    -------
+    int
+        Exit code (0 for success, 1 for errors)
+    """
+    parser = create_parser()
     args = parser.parse_args()
 
-    # Load JSONL file first to get document path
+    # Load JSONL file
     try:
-        jsonData = load_jsonl(args.jsonl_file)
+        json_data = load_jsonl(args.jsonl_file)
     except FileNotFoundError as e:
         print(f"Error: {e}", file=sys.stderr)
         return 1
@@ -54,123 +344,72 @@ def main():
         print(f"Error loading JSONL file: {e}", file=sys.stderr)
         return 1
 
-    # Decide which recorded document to process
-    documents = {e.get("document") for e in jsonData if "document" in e and e.get("document") is not None}
-    recorded_docs = sorted([d for d in documents if d is not None])
-
-    def resolve_document(docs: list[str], template_path: Path, override: str | None) -> str | None:
-        if not docs:
-            return None
-
-        if override:
-            matches = [d for d in docs if d.endswith(override) or Path(d).name == override]
-            if not matches:
-                raise ValueError(
-                    f"No document in recording matches '{override}'. Available: {docs}"
-                )
-            if len(matches) > 1:
-                raise ValueError(
-                    f"Ambiguous document override '{override}'. Matches: {matches}"
-                )
-            return matches[0]
-
-        template_ext = template_path.suffix
-        ext_matches = [d for d in docs if Path(d).suffix == template_ext]
-        if len(ext_matches) == 1:
-            return ext_matches[0]
-        if len(ext_matches) > 1:
-            raise ValueError(
-                f"Multiple documents share extension '{template_ext}': {ext_matches}. "
-                "Use --document to choose one."
-            )
-
-        if len(docs) == 1:
-            return docs[0]
-
-        raise ValueError(
-            "Could not determine document to process. Use --document to select one. "
-            f"Available documents: {docs}"
-        )
-
+    # Resolve which document to process
+    recorded_docs = get_recorded_documents(json_data)
     try:
-        target_document = resolve_document(recorded_docs, args.template_file, args.document)
+        target_document = resolve_document(
+            recorded_docs, args.template_file, args.document
+        )
     except ValueError as e:
         print(f"Error determining document: {e}", file=sys.stderr)
         return 1
 
-    if target_document:
-        doc_events = tuple(e for e in jsonData if e.get("document") == target_document)
-        if not doc_events:
-            print(f"Error: No events found for document '{target_document}'", file=sys.stderr)
-            return 1
-    else:
-        doc_events = jsonData
+    # Filter events for target document
+    doc_events = filter_events_by_document(json_data, target_document)
+    if target_document and not doc_events:
+        print(
+            f"Error: No events found for document '{target_document}'",
+            file=sys.stderr,
+        )
+        return 1
 
     print(f"Processing: {target_document or args.template_file}", file=sys.stderr)
 
     # Read template file
     try:
-        templateData = args.template_file.read_text()
+        template_data = args.template_file.read_text()
     except FileNotFoundError:
-        print(f"Error: Template file not found: {args.template_file}", file=sys.stderr)
+        print(
+            f"Error: Template file not found: {args.template_file}", file=sys.stderr
+        )
         return 1
     except Exception as e:
         print(f"Error reading template file: {e}", file=sys.stderr)
         return 1
 
-    # Check time limit and display elapsed time
+    # Check and display time information
     time_info = check_time_limit(doc_events, args.time_limit)
-    if time_info:
-        print(f"Elapsed editing time: {time_info['minutes_elapsed']} minutes", file=sys.stderr)
-        print(f"Time span (first to last edit): {(datetime.fromisoformat(time_info['last_timestamp'].replace('Z', '+00:00')) - datetime.fromisoformat(time_info['first_timestamp'].replace('Z', '+00:00'))).total_seconds() / 60:.2f} minutes", file=sys.stderr)
-        if time_info['exceeds_limit']:
-            print(f"\nTime limit exceeded!", file=sys.stderr)
-            print(f"  Limit: {time_info['time_limit_minutes']} minutes", file=sys.stderr)
-            print(f"  First edit: {time_info['first_timestamp']}", file=sys.stderr)
-            print(f"  Last edit: {time_info['last_timestamp']}", file=sys.stderr)
-
-    # Verify and process
+    display_time_info(time_info)
+
+    # Verify and process the recording
     try:
-        templateData, suspicious_events = verify(templateData, doc_events)
-        print(reconstruct_file_from_events(doc_events, templateData, document_path=target_document))
-
-        # Prepare results for JSON output
-        results = {
-            "document": target_document or str(args.template_file),
-            "time_info": time_info,
-            "suspicious_events": suspicious_events,
-        }
-
-        if suspicious_events:
-            print("\nSuspicious copy-paste events detected:", file=sys.stderr)
-            for ev in suspicious_events:
-                reason = ev.get('reason', 'unknown')
-                indices = ev.get('event_indices', [ev['event_index']])
-                if len(indices) > 1:
-                    print(f"  Events #{indices[0]}-#{indices[-1]} ({reason}): "
-                          f"{ev['line_count']} lines, {ev['char_count']} chars", file=sys.stderr)
-                else:
-                    print(f"  Event #{ev['event_index']} ({reason}): "
-                          f"{ev['line_count']} lines, {ev['char_count']} chars - "
-                          f"newFragment:\n```\n{ev['newFragment']}\n```", file=sys.stderr)
-        else:
-            print("Success! No suspicious events detected.", file=sys.stderr)
+        template_data, suspicious_events = verify(template_data, doc_events)
+        reconstructed = reconstruct_file_from_events(
+            doc_events, template_data, document_path=target_document
+        )
+        print(reconstructed)
+
+        # Display suspicious events
+        display_suspicious_events(suspicious_events, args.show_autocomplete_details)
 
         # Write JSON output if requested
         if args.output_json:
             try:
-                args.output_json.parent.mkdir(parents=True, exist_ok=True)
-                with open(args.output_json, 'w') as f:
-                    json.dump(results, f, indent=2)
-                print(f"Results written to {args.output_json}", file=sys.stderr)
+                write_json_output(
+                    args.output_json,
+                    target_document or str(args.template_file),
+                    time_info,
+                    suspicious_events,
+                )
             except Exception as e:
                 print(f"Error writing JSON output: {e}", file=sys.stderr)
                 return 1
+
     except ValueError as e:
         print("File failed verification from template!", file=sys.stderr)
         print(str(e), file=sys.stderr)
         try:
-            print(template_diff(templateData, doc_events), file=sys.stderr)
+            print(template_diff(template_data, doc_events), file=sys.stderr)
         except Exception:
             pass
         return 1
@@ -178,6 +417,8 @@ def main():
         print(f"Error processing file: {type(e).__name__}: {e}", file=sys.stderr)
         return 1
 
+    return 0
+
 
 if __name__ == "__main__":
     sys.exit(main())
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: cr_proc
-Version: 0.1.2
-Summary: A tool for processing BYU CS code recording files
+Version: 0.1.5
+Summary: A tool for processing BYU CS code recording files.
 Author: Ethan Dye
 Author-email: mrtops03@gmail.com
 Requires-Python: >=3.14
@@ -0,0 +1,9 @@
+code_recorder_processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+code_recorder_processor/api/build.py,sha256=-EMg0w-llblJ_N_vs_B1kOsAOwiV-TYetAXiOY6PcWs,7643
+code_recorder_processor/api/load.py,sha256=ZKoheLsEoGJ3fpAtPauoeEyNUhGLhUYSwjRsqt1m-TI,3947
+code_recorder_processor/api/verify.py,sha256=2XLWr39g3jqjzQhpx82R_lx7FCYrdQjj8VRd9TTRM_8,23266
+code_recorder_processor/cli.py,sha256=OcoKaJ5SV2iY8bExpiagagQMPtIlFYMUcL8nMtjG13g,12530
+cr_proc-0.1.5.dist-info/METADATA,sha256=wyQbPvVGSkLrzERm3j3Xy_WrhJhEGpWeLCn382kBT4g,4070
+cr_proc-0.1.5.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
+cr_proc-0.1.5.dist-info/entry_points.txt,sha256=xb5dPAAWN1Z9NUHpvZgNakaslR1MVOERf_IfpG_M04M,77
+cr_proc-0.1.5.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-code_recorder_processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-code_recorder_processor/api/build.py,sha256=-EMg0w-llblJ_N_vs_B1kOsAOwiV-TYetAXiOY6PcWs,7643
-code_recorder_processor/api/load.py,sha256=lkEPnQi3Q_91GOTImk4H380F-uKJPszeX3FJJWM4CIA,3272
-code_recorder_processor/api/verify.py,sha256=byW4fyW_gLkFq4rLvWut2Cvj_ds5Cj_MUFrhlhOrucY,14327
-code_recorder_processor/cli.py,sha256=sKm9f06NEZ3psw-HEShlHt4grVZvRmNEG33yvxhIIQQ,7154
-cr_proc-0.1.2.dist-info/METADATA,sha256=A60JkKqmku5ZO-hPjhSDlPqkqI55gpQj_2UZNs3ZlXg,4069
-cr_proc-0.1.2.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
-cr_proc-0.1.2.dist-info/entry_points.txt,sha256=xb5dPAAWN1Z9NUHpvZgNakaslR1MVOERf_IfpG_M04M,77
-cr_proc-0.1.2.dist-info/RECORD,,