cr-proc 0.1.3__py3-none-any.whl → 0.1.6__py3-none-any.whl

--- a/code_recorder_processor/api/load.py
+++ b/code_recorder_processor/api/load.py
@@ -65,12 +65,29 @@ def load_jsonl(file: Path) -> tuple[dict[str, Any], ...]:

     if data is None:
         # If gzip stream is broken, attempt a lenient zlib decompress to salvage content.
+        # Handle multiple concatenated gzip streams (common in recordings)
         try:
             raw = file.read_bytes()
-            dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
-            text_bytes = dobj.decompress(raw) + dobj.flush()
-            text = text_bytes.decode("utf-8", errors="replace")
-            data = _load_jsonl(StringIO(text))
+            all_text = ""
+            remaining = raw
+
+            # Decompress all concatenated gzip streams
+            while remaining:
+                dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+                try:
+                    text_bytes = dobj.decompress(remaining) + dobj.flush()
+                    all_text += text_bytes.decode("utf-8", errors="replace")
+                    remaining = dobj.unused_data
+                    if not text_bytes or not remaining:
+                        break
+                except Exception:
+                    # If decompression fails, try to salvage what we have
+                    break
+
+            if all_text:
+                data = _load_jsonl(StringIO(all_text))
+            else:
+                data = None
         except Exception:
             data = None

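The decoding loop above relies on a `zlib` idiom: `16 + zlib.MAX_WBITS` tells the decompressor to expect a gzip header, and `unused_data` holds whatever bytes follow the end of the current stream, which is what lets the loop walk concatenated gzip members. A minimal standalone sketch of the same idea (the sample payload is hypothetical, not from the package):

```python
import gzip
import zlib

# Two independently gzipped records concatenated into one byte string,
# as a recorder that reopens its log in append mode might produce.
blob = gzip.compress(b'{"a": 1}\n') + gzip.compress(b'{"b": 2}\n')

chunks = []
remaining = blob
while remaining:
    dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)  # expect a gzip header
    chunks.append(dobj.decompress(remaining) + dobj.flush())
    remaining = dobj.unused_data  # bytes left over after this member ends
    if not chunks[-1] and remaining:
        break  # no forward progress: stop instead of looping forever

print(b"".join(chunks).decode())  # both JSON lines are recovered
```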
--- a/code_recorder_processor/api/verify.py
+++ b/code_recorder_processor/api/verify.py
@@ -113,20 +113,84 @@ def template_diff(template: str, jsonData: tuple[dict[str, Any], ...]) -> str:
     return "".join(diff_iter)


-def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
+def _build_document_states(jsonData: tuple[dict[str, Any], ...]) -> tuple[list[str], set[str]]:
+    """
+    Build complete document state at each event and a whitelist of all content seen.
+
+    Reconstructs the document after each keystroke/edit to track what content
+    existed in the document at each point in time. This allows detectors to
+    check if pasted/autocompleted content already existed in the document.
+
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data from the JSONL file
+
+    Returns
+    -------
+    tuple[list[str], set[str]]
+        - List of document states (one per event, strings of full document content)
+        - Set of all content fragments ever seen (whitelist for internal copy detection)
+    """
+    document_states = []
+    content_whitelist = set()
+    current_state = ""
+
+    for idx, event in enumerate(jsonData):
+        old_frag = _normalize_newlines(event.get("oldFragment", ""))
+        new_frag = _normalize_newlines(event.get("newFragment", ""))
+        offset = event.get("offset", 0)
+
+        # First event is the initial snapshot (template)
+        if idx == 0:
+            current_state = new_frag
+        elif new_frag != old_frag:
+            # Apply the edit to reconstruct document state
+            current_state = current_state[:offset] + new_frag + current_state[offset + len(old_frag):]
+
+        document_states.append(current_state)
+
+        # Build whitelist of all content fragments seen
+        # Add both old and new fragments to whitelist for comprehensive coverage
+        if len(old_frag) > 10:  # Ignore tiny fragments
+            content_whitelist.add(old_frag)
+        if len(new_frag) > 10:
+            content_whitelist.add(new_frag)
+
+        # Also add the full document state to whitelist
+        if len(current_state) > 10:
+            content_whitelist.add(current_state)
+
+    return document_states, content_whitelist
+
+
+def _detect_multiline_external_pastes(
+    jsonData: tuple[dict[str, Any], ...],
+    document_states: list[str],
+    content_whitelist: set[str]
+) -> list[dict[str, Any]]:
     """
     Detect multi-line copy-paste events from external sources.

     Flags newFragments that are significant in length (more than one line)
     and do not appear to be copied from within the document itself.

-    Returns a list of suspicious multi-line paste events.
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data
+    document_states : list[str]
+        Full document state at each event
+    content_whitelist : set[str]
+        All content fragments ever seen in the document (for internal copy detection)
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        List of suspicious multi-line paste events.
     """
     suspicious_events = []

-    # Build a history of all document content seen so far
-    document_history = set()
-
     for idx, event in enumerate(jsonData):
         old_frag = _normalize_newlines(event.get("oldFragment", ""))
         new_frag = _normalize_newlines(event.get("newFragment", ""))
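The heart of `_build_document_states` is the splice `current_state[:offset] + new_frag + current_state[offset + len(old_frag):]`, which replaces `oldFragment` with `newFragment` at `offset`. A tiny worked example with hypothetical events shows how the per-event states accumulate:

```python
events = (
    {"offset": 0, "oldFragment": "ab", "newFragment": "ab"},  # initial snapshot
    {"offset": 2, "oldFragment": "", "newFragment": "cd"},    # insert at end
    {"offset": 0, "oldFragment": "ab", "newFragment": "X"},   # replace the prefix
)

state = ""
states = []
for idx, ev in enumerate(events):
    old, new, off = ev["oldFragment"], ev["newFragment"], ev["offset"]
    if idx == 0:
        state = new  # first event seeds the document
    elif new != old:
        # Same splice as _build_document_states: drop old, insert new at offset
        state = state[:off] + new + state[off + len(old):]
    states.append(state)

assert states == ["ab", "abcd", "Xcd"]
```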
@@ -140,32 +204,39 @@ def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> l
         if len(new_lines) <= 2:  # Single line or line + empty
             continue

-        # Check if the new content appears to be from within the document
+        # Check if the new content already existed in the document at any prior point
         is_internal_copy = False

-        # Check if new_frag content was present in any previous fragments
-        for hist_content in document_history:
-            # Ignore tiny fragments; they appear everywhere and cause false positives
-            if len(hist_content) < 20:
-                continue
+        # Check against document state BEFORE this event
+        if idx > 0:
+            prior_state = document_states[idx - 1]
+            if new_frag in prior_state:
+                is_internal_copy = True

-            # Require substantial overlap in size to count as an internal copy
-            similar_length = (
-                len(hist_content) >= 0.8 * len(new_frag)
-                and len(hist_content) <= 1.25 * len(new_frag)
-            )
+        # Also check against whitelist of all content seen
+        if not is_internal_copy:
+            for hist_content in content_whitelist:
+                # Ignore tiny fragments
+                if len(hist_content) < 20:
+                    continue

-            if new_frag == hist_content:
-                is_internal_copy = True
-                break
+                # Require substantial overlap in size to count as an internal copy
+                similar_length = (
+                    len(hist_content) >= 0.8 * len(new_frag)
+                    and len(hist_content) <= 1.25 * len(new_frag)
+                )

-            if new_frag in hist_content and similar_length:
-                is_internal_copy = True
-                break
+                if new_frag == hist_content:
+                    is_internal_copy = True
+                    break

-            if hist_content in new_frag and similar_length:
-                is_internal_copy = True
-                break
+                if new_frag in hist_content and similar_length:
+                    is_internal_copy = True
+                    break
+
+                if hist_content in new_frag and similar_length:
+                    is_internal_copy = True
+                    break

         # Also check if it's in the old fragment (internal move/copy)
         if not is_internal_copy and old_frag and (new_frag in old_frag or old_frag in new_frag):
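Note that the containment checks against the whitelist are gated by a size-overlap test, so a multi-line paste is not excused merely because it shares one line with some historical fragment. The gate in isolation (the helper name here is ours, not the package's):

```python
def looks_like_internal_copy(new_frag: str, hist_content: str) -> bool:
    if len(hist_content) < 20:  # tiny fragments match everywhere
        return False
    if new_frag == hist_content:
        return True
    # Containment only counts when the sizes are comparable (within 0.8x-1.25x)
    similar_length = 0.8 * len(new_frag) <= len(hist_content) <= 1.25 * len(new_frag)
    return similar_length and (new_frag in hist_content or hist_content in new_frag)

assert looks_like_internal_copy("x" * 40, "x" * 40)       # exact match
assert not looks_like_internal_copy("x" * 40, "x" * 400)  # substring, wrong size
```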
@@ -180,12 +251,6 @@ def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> l
                 "newFragment": new_frag
             })

-        # Update history after analysis so the current fragment cannot mask itself
-        if len(old_frag) > 1:
-            document_history.add(old_frag)
-        if len(new_frag) > 1:
-            document_history.add(new_frag)
-
     return suspicious_events

@@ -248,6 +313,19 @@ def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[
         # If we found 3+ one-line pastes within 1 second, flag it
         if len(cluster) >= 3:
             event_indices = [p["event_index"] for p in cluster]
+
+            # Build detailed events list for optional detailed review
+            detailed_events = []
+            for paste in cluster:
+                idx = paste["event_index"]
+                content = paste["content"]
+                detailed_events.append({
+                    "event_index": idx,
+                    "line_count": 1,
+                    "char_count": len(content),
+                    "newFragment": content,
+                })
+
             suspicious_events.append({
                 "event_index": event_indices[0],
                 "event_indices": event_indices,
@@ -255,6 +333,7 @@ def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[
                 "char_count": sum(len(p["content"]) for p in cluster),
                 "reason": "rapid one-line pastes (AI indicator)",
                 "newFragment": f"{len(cluster)} one-line pastes in 1 second",
+                "detailed_events": detailed_events,
             })

         i = j if j > i + 1 else i + 1
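The clustering itself happens in the unchanged part of `_detect_rapid_paste_sequences`: one-line insertions whose timestamps fall inside a one-second window are grouped, and groups of three or more are flagged. A hedged sketch of that idea, assuming an event shape like the one used elsewhere in this file:

```python
from datetime import datetime, timedelta

def parse_ts(ts: str) -> datetime:
    return datetime.fromisoformat(ts.replace("Z", "+00:00"))

# Hypothetical one-line paste candidates already extracted from the events
pastes = [
    {"event_index": 5, "ts": "2025-01-01T00:00:00Z", "content": "x = 1"},
    {"event_index": 6, "ts": "2025-01-01T00:00:00.400Z", "content": "y = 2"},
    {"event_index": 7, "ts": "2025-01-01T00:00:00.900Z", "content": "z = 3"},
]

window = timedelta(seconds=1)
start = parse_ts(pastes[0]["ts"])
cluster = [p for p in pastes if parse_ts(p["ts"]) - start <= window]

if len(cluster) >= 3:  # same threshold the detector uses
    print(f"{len(cluster)} one-line pastes in 1 second")
```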
@@ -262,6 +341,119 @@ def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[
     return suspicious_events


+def _detect_fullline_autocomplete(
+    jsonData: tuple[dict[str, Any], ...],
+    document_states: list[str],
+    content_whitelist: set[str],
+    excluded_indices: set[int]
+) -> list[dict[str, Any]]:
+    """
+    Detect full-line auto-complete events where the IDE/AI completes code.
+
+    At keystroke level, events show:
+    - Normal typing: oldFragment="" (empty), newFragment="X" (1 char)
+    - Auto-complete: oldFragment="" (empty), newFragment="long_text" (10+ chars)
+
+    Auto-complete is detected when:
+    - oldFragment is empty or very short (0-3 chars)
+    - newFragment is substantial (10+ characters)
+    - newFragment contains code structure (assignment, parens, brackets, etc.)
+    - newFragment does NOT already exist in the document state
+    - Event not already flagged as external copy-paste
+
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data
+    document_states : list[str]
+        Full document state at each event
+    content_whitelist : set[str]
+        All content fragments ever seen in the document
+    excluded_indices : set[int]
+        Set of event indices already flagged by other detectors (to avoid double-flagging)
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        List of suspected auto-complete events.
+    """
+    suspicious_events = []
+
+    for idx, event in enumerate(jsonData):
+        # Skip if already flagged by another detector
+        if idx in excluded_indices:
+            continue
+
+        old_frag = _normalize_newlines(event.get("oldFragment", ""))
+        new_frag = _normalize_newlines(event.get("newFragment", ""))
+
+        # Skip first event (template) and no-change events
+        if idx == 0 or new_frag == old_frag:
+            continue
+
+        old_len = len(old_frag)
+        new_len = len(new_frag)
+
+        # At keystroke level, oldFragment is typically empty for insertions
+        # Allow up to 3 chars for prefix-based autocomplete triggers
+        if old_len > 3:
+            continue
+
+        # Skip single-character additions (normal typing)
+        # Auto-complete typically adds 10+ characters at once
+        if new_len < 10:
+            continue
+
+        # Skip large multi-line pastes - those should be caught by multi-line paste detector
+        # Auto-complete is typically 1-2 lines and under 100 chars
+        # Anything larger is likely external copy-paste, not auto-complete
+        new_lines = new_frag.split("\n")
+        if len(new_lines) > 2 or new_len > 100:
+            continue
+
+        # The new fragment should not be just whitespace
+        if not new_frag.strip():
+            continue
+
+        # Check if the new fragment contains code structure indicators
+        # These strongly suggest IDE/AI auto-completion of code
+        code_indicators = [
+            "=",   # Assignment (most common in autocomplete)
+            "(",   # Function call/definition
+            ")",   # Closing paren
+            ":",   # Block statement (if, for, def, etc.)
+            "{",   # Dictionary/block
+            "}",   # Closing brace
+            "[",   # List/index
+            "]",   # Closing bracket
+            "=>",  # Arrow function
+            ";",   # Statement end
+        ]
+
+        has_code_structure = any(indicator in new_frag for indicator in code_indicators)
+
+        # Must have code structure to be considered auto-complete
+        if has_code_structure:
+            # Check if this content already existed in the document state BEFORE this event
+            is_internal_copy = False
+
+            if idx > 0:
+                prior_state = document_states[idx - 1]
+                if new_frag in prior_state:
+                    is_internal_copy = True
+
+            if not is_internal_copy:
+                suspicious_events.append({
+                    "event_index": idx,
+                    "line_count": len(new_lines),
+                    "char_count": new_len,
+                    "reason": "full-line auto-complete",
+                    "newFragment": new_frag,
+                })
+
+    return suspicious_events
+
+
 def detect_external_copypaste(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
     """
     Detect copy-paste events from external sources and AI-assisted coding patterns.
@@ -269,16 +461,64 @@ def detect_external_copypaste(jsonData: tuple[dict[str, Any], ...]) -> list[dict
     Combines detection of:
     1. Multi-line external paste events (content not from within document)
     2. Rapid one-line paste sequences (potential AI assistance indicator)
+    3. Full-line auto-complete events (user types, AI completes the line)
+
+    Detection order matters: events flagged by earlier detectors are excluded
+    from later detectors to avoid double-flagging.

-    Returns a list of all suspicious events with metadata.
+    Returns a list of all suspicious events with metadata, including aggregate statistics.
     """
     suspicious_events = []

-    # Detect multi-line external pastes
-    suspicious_events.extend(_detect_multiline_external_pastes(jsonData))
+    # Build shared document state tracking
+    # This reconstructs the full document at each event and creates a whitelist
+    # of all content that has ever appeared in the document
+    document_states, content_whitelist = _build_document_states(jsonData)
+
+    # Step 1: Detect multi-line external pastes
+    multiline_events = _detect_multiline_external_pastes(jsonData, document_states, content_whitelist)
+    suspicious_events.extend(multiline_events)
+
+    # Step 2: Detect rapid one-line paste sequences (AI indicator)
+    rapid_paste_events = _detect_rapid_paste_sequences(jsonData)
+    suspicious_events.extend(rapid_paste_events)
+
+    # Build set of all event indices already flagged
+    excluded_indices = set()
+    for event in multiline_events:
+        # Handle both single events and clusters
+        if "event_indices" in event:
+            excluded_indices.update(event["event_indices"])
+        else:
+            excluded_indices.add(event["event_index"])
+
+    for event in rapid_paste_events:
+        if "event_indices" in event:
+            excluded_indices.update(event["event_indices"])
+        else:
+            excluded_indices.add(event["event_index"])
+
+    # Step 3: Detect full-line auto-complete events (excluding already-flagged events)
+    autocomplete_events = _detect_fullline_autocomplete(
+        jsonData, document_states, content_whitelist, excluded_indices
+    )

-    # Detect rapid one-line paste sequences (AI indicator)
-    suspicious_events.extend(_detect_rapid_paste_sequences(jsonData))
+    # Calculate aggregate statistics for auto-complete/small paste events
+    # Store individual events for optional detailed review, but don't report them by default
+    if autocomplete_events:
+        total_autocomplete_chars = sum(ev["char_count"] for ev in autocomplete_events)
+        total_autocomplete_events = len(autocomplete_events)
+
+        # Always add aggregate summary, never individual events
+        # Store individual events in the aggregate for optional detailed review
+        suspicious_events.append({
+            "event_index": -1,  # Special marker for aggregate
+            "event_count": total_autocomplete_events,
+            "total_chars": total_autocomplete_chars,
+            "reason": "aggregate auto-complete/small paste activity",
+            "newFragment": f"{total_autocomplete_events} auto-complete events ({total_autocomplete_chars} total chars)",
+            "detailed_events": autocomplete_events,  # Store for optional review
+        })

     return suspicious_events

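The exclusion bookkeeping is what makes the detector ordering meaningful: a keystroke event flagged as a paste can never be re-reported as an auto-completion. A compact illustration with hypothetical detector output:

```python
multiline_events = [{"event_index": 12}]                                   # single event
rapid_paste_events = [{"event_index": 30, "event_indices": [30, 31, 32]}]  # cluster

excluded = set()
for ev in multiline_events + rapid_paste_events:
    # Clusters carry "event_indices"; single events only "event_index"
    excluded.update(ev.get("event_indices", [ev["event_index"]]))

assert excluded == {12, 30, 31, 32}
# _detect_fullline_autocomplete skips these indices entirely
```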
@@ -311,12 +551,15 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
     def parse_ts(ts_str: str) -> datetime:
         return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))

-    # Identify session boundaries: sessions start at indices where oldFragment == newFragment (non-empty)
+    # Identify session boundaries: sessions start at indices where offset == 0
+    # (indicating file reopen/recording restart) and oldFragment == newFragment (initial snapshot)
     session_starts = [0]  # First session always starts at index 0
     for idx in range(1, len(jsonData)):
+        offset = jsonData[idx].get("offset", -1)
         old_frag = jsonData[idx].get("oldFragment", "")
         new_frag = jsonData[idx].get("newFragment", "")
-        if old_frag == new_frag and old_frag.strip() != "":
+        # Session boundary: offset is 0 and it's an initial snapshot (old == new, non-empty)
+        if offset == 0 and old_frag == new_frag and old_frag.strip() != "":
             session_starts.append(idx)

     # Add sentinel to mark end of last session
@@ -344,25 +587,26 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
         session_end = session_starts[i + 1]

         # Find first and last events with timestamps in this session
-        first_event = None
-        last_event = None
-
-        for event in jsonData[session_start:session_end]:
-            if event.get("timestamp"):
-                if first_event is None:
-                    first_event = event
-                last_event = event
+        first_event_time = None
+        last_event_time = None
+
+        for idx in range(session_start, session_end):
+            event = jsonData[idx]
+            timestamp = event.get("timestamp")
+            if timestamp:
+                try:
+                    event_time = parse_ts(timestamp)
+                    if first_event_time is None:
+                        first_event_time = event_time
+                    last_event_time = event_time
+                except (ValueError, KeyError):
+                    # Skip events with invalid timestamps
+                    continue

         # If this session has timestamped events, add its elapsed time
-        if first_event is not None and last_event is not None:
-            try:
-                first_time = parse_ts(first_event["timestamp"])
-                last_time = parse_ts(last_event["timestamp"])
-                session_diff = last_time - first_time
-                total_minutes_elapsed += session_diff.total_seconds() / 60
-            except (ValueError, KeyError):
-                # Timestamp parsing failed for this session, skip it
-                continue
+        if first_event_time is not None and last_event_time is not None:
+            session_diff = last_event_time - first_event_time
+            total_minutes_elapsed += session_diff.total_seconds() / 60

     # For time limit check, use the span from first to last timestamp overall
     try:
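The practical effect of per-session accumulation is that only time spent inside a session counts; the gap between two sittings contributes nothing. A sketch using the same `Z` to `+00:00` normalization as `parse_ts` (session timestamps are hypothetical):

```python
from datetime import datetime

def parse_ts(ts: str) -> datetime:
    # fromisoformat() in older Python versions rejects a trailing "Z"
    return datetime.fromisoformat(ts.replace("Z", "+00:00"))

sessions = [
    ("2025-01-01T10:00:00Z", "2025-01-01T10:30:00Z"),  # 30-minute sitting
    ("2025-01-02T09:00:00Z", "2025-01-02T09:15:00Z"),  # 15-minute sitting
]
total = sum(
    (parse_ts(end) - parse_ts(start)).total_seconds() / 60
    for start, end in sessions
)
assert total == 45.0  # the overnight gap adds nothing
```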
--- a/code_recorder_processor/cli.py
+++ b/code_recorder_processor/cli.py
@@ -1,14 +1,314 @@
 import argparse
-import sys
 import json
+import sys
 from datetime import datetime
 from pathlib import Path
-from .api.load import load_jsonl
-from .api.verify import verify, template_diff, check_time_limit
+from typing import Any
+
 from .api.build import reconstruct_file_from_events
+from .api.load import load_jsonl
+from .api.verify import check_time_limit, template_diff, verify
+
+
+def resolve_document(
+    docs: list[str], template_path: Path, override: str | None
+) -> str | None:
+    """
+    Determine which document from the recording to process.
+
+    Parameters
+    ----------
+    docs : list[str]
+        List of document paths found in the recording
+    template_path : Path
+        Path to the template file
+    override : str | None
+        Explicit document name or path override
+
+    Returns
+    -------
+    str | None
+        The resolved document path, or None if no documents exist
+
+    Raises
+    ------
+    ValueError
+        If document resolution is ambiguous or the override doesn't match
+    """
+    if not docs:
+        return None
+
+    if override:
+        matches = [
+            d for d in docs if d.endswith(override) or Path(d).name == override
+        ]
+        if not matches:
+            raise ValueError(
+                f"No document in recording matches '{override}'. Available: {docs}"
+            )
+        if len(matches) > 1:
+            raise ValueError(
+                f"Ambiguous document override '{override}'. Matches: {matches}"
+            )
+        return matches[0]
+
+    template_ext = template_path.suffix
+    ext_matches = [d for d in docs if Path(d).suffix == template_ext]
+    if len(ext_matches) == 1:
+        return ext_matches[0]
+    if len(ext_matches) > 1:
+        raise ValueError(
+            f"Multiple documents share extension '{template_ext}': {ext_matches}. "
+            "Use --document to choose one."
+        )
+
+    if len(docs) == 1:
+        return docs[0]
+
+    raise ValueError(
+        "Could not determine document to process. Use --document to select one. "
+        f"Available documents: {docs}"
+    )
+
+
+def get_recorded_documents(events: tuple[dict[str, Any], ...]) -> list[str]:
+    """
+    Extract unique document paths from recording events.
+
+    Parameters
+    ----------
+    events : tuple[dict[str, Any], ...]
+        Recording events loaded from JSONL

+    Returns
+    -------
+    list[str]
+        Sorted list of unique document paths
+    """
+    documents = {
+        e.get("document")
+        for e in events
+        if "document" in e and e.get("document") is not None
+    }
+    return sorted([d for d in documents if d is not None])

-def main():
+
+def filter_events_by_document(
+    events: tuple[dict[str, Any], ...], document: str | None
+) -> tuple[dict[str, Any], ...]:
+    """
+    Filter events to only those for a specific document.
+
+    Parameters
+    ----------
+    events : tuple[dict[str, Any], ...]
+        All recording events
+    document : str | None
+        Document path to filter by, or None to return all events
+
+    Returns
+    -------
+    tuple[dict[str, Any], ...]
+        Filtered events
+    """
+    if document:
+        return tuple(e for e in events if e.get("document") == document)
+    return events
+
+
+def display_time_info(time_info: dict[str, Any] | None) -> None:
+    """
+    Display elapsed time and time limit information.
+
+    Parameters
+    ----------
+    time_info : dict[str, Any] | None
+        Time information from check_time_limit, or None if no time data
+    """
+    if not time_info:
+        return
+
+    print(
+        f"Elapsed editing time: {time_info['minutes_elapsed']} minutes",
+        file=sys.stderr,
+    )
+
+    first_ts = datetime.fromisoformat(
+        time_info["first_timestamp"].replace("Z", "+00:00")
+    )
+    last_ts = datetime.fromisoformat(
+        time_info["last_timestamp"].replace("Z", "+00:00")
+    )
+    time_span = (last_ts - first_ts).total_seconds() / 60
+
+    print(f"Time span (first to last edit): {time_span:.2f} minutes", file=sys.stderr)
+
+    if time_info["exceeds_limit"]:
+        print("\nTime limit exceeded!", file=sys.stderr)
+        print(f"  Limit: {time_info['time_limit_minutes']} minutes", file=sys.stderr)
+        print(f"  First edit: {time_info['first_timestamp']}", file=sys.stderr)
+        print(f"  Last edit: {time_info['last_timestamp']}", file=sys.stderr)
+
+
+def display_suspicious_event(event: dict[str, Any], show_details: bool) -> None:
+    """
+    Display a single suspicious event.
+
+    Parameters
+    ----------
+    event : dict[str, Any]
+        Suspicious event data
+    show_details : bool
+        Whether to show detailed autocomplete events
+    """
+    reason = event.get("reason", "unknown")
+
+    # Handle aggregate auto-complete events
+    if event.get("event_index") == -1 and "detailed_events" in event:
+        event_count = event["event_count"]
+        total_chars = event["total_chars"]
+        print(
+            f"  Aggregate: {event_count} auto-complete/small paste events "
+            f"({total_chars} total chars)",
+            file=sys.stderr,
+        )
+
+        if show_details:
+            print("  Detailed events:", file=sys.stderr)
+            for detail in event["detailed_events"]:
+                detail_idx = detail["event_index"]
+                detail_lines = detail["line_count"]
+                detail_chars = detail["char_count"]
+                detail_frag = detail["newFragment"]
+                print(
+                    f"    Event #{detail_idx}: {detail_lines} lines, "
+                    f"{detail_chars} chars",
+                    file=sys.stderr,
+                )
+                print("    ```", file=sys.stderr)
+                for line in detail_frag.split("\n"):
+                    print(f"    {line}", file=sys.stderr)
+                print("    ```", file=sys.stderr)
+
+    elif "event_indices" in event and reason == "rapid one-line pastes (AI indicator)":
+        # Rapid paste sequences (AI indicator) - show aggregate style
+        indices = event["event_indices"]
+        print(
+            f"  AI Rapid Paste: Events #{indices[0]}-#{indices[-1]} "
+            f"({event['line_count']} lines, {event['char_count']} chars, "
+            f"{len(indices)} events in < 1 second)",
+            file=sys.stderr,
+        )
+
+        if show_details and "detailed_events" in event:
+            # Combine all detailed events into one block
+            combined_content = "".join(
+                detail["newFragment"] for detail in event["detailed_events"]
+            )
+            print("  Combined output:", file=sys.stderr)
+            print("  ```", file=sys.stderr)
+            for line in combined_content.split("\n"):
+                print(f"  {line}", file=sys.stderr)
+            print("  ```", file=sys.stderr)
+
+    elif "event_indices" in event:
+        # Other multi-event clusters
+        indices = event.get("event_indices", [event["event_index"]])
+        print(
+            f"  Events #{indices[0]}-#{indices[-1]} ({reason}): "
+            f"{event['line_count']} lines, {event['char_count']} chars",
+            file=sys.stderr,
+        )
+
+    else:
+        new_fragment = event["newFragment"].replace("\n", "\n  ")
+        print(
+            f"  Event #{event['event_index']} ({reason}): "
+            f"{event['line_count']} lines, {event['char_count']} chars - "
+            f"newFragment:\n  ```\n  {new_fragment}\n  ```",
+            file=sys.stderr,
+        )
+
+
+def display_suspicious_events(
+    suspicious_events: list[dict[str, Any]], show_details: bool
+) -> None:
+    """
+    Display all suspicious events or success message.
+
+    Parameters
+    ----------
+    suspicious_events : list[dict[str, Any]]
+        List of suspicious events detected
+    show_details : bool
+        Whether to show detailed autocomplete events
+    """
+    if suspicious_events:
+        print("\nSuspicious events detected:", file=sys.stderr)
+
+        # Sort events by their index for chronological display
+        def get_sort_key(event: dict[str, Any]) -> int | float:
+            if "event_indices" in event and event["event_indices"]:
+                return event["event_indices"][0]
+            if "detailed_events" in event and event["detailed_events"]:
+                return event["detailed_events"][0].get("event_index", float("inf"))
+            event_idx = event.get("event_index", -1)
+            return event_idx if event_idx >= 0 else float("inf")
+
+        sorted_events = sorted(suspicious_events, key=get_sort_key)
+
+        for event in sorted_events:
+            display_suspicious_event(event, show_details)
+    else:
+        print("Success! No suspicious events detected.", file=sys.stderr)
+
+
+def write_json_output(
+    output_path: Path,
+    document: str,
+    time_info: dict[str, Any] | None,
+    suspicious_events: list[dict[str, Any]],
+) -> None:
+    """
+    Write verification results to JSON file.
+
+    Parameters
+    ----------
+    output_path : Path
+        Path to output JSON file
+    document : str
+        Document that was processed
+    time_info : dict[str, Any] | None
+        Time information from verification
+    suspicious_events : list[dict[str, Any]]
+        List of suspicious events detected
+
+    Raises
+    ------
+    Exception
+        If file writing fails
+    """
+    results = {
+        "document": document,
+        "time_info": time_info,
+        "suspicious_events": suspicious_events,
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"Results written to {output_path}", file=sys.stderr)
+
+
+def create_parser() -> argparse.ArgumentParser:
+    """
+    Create and configure the argument parser.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Configured argument parser
+    """
     parser = argparse.ArgumentParser(
         description="Process and verify code recorder JSONL files"
     )
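As a usage note, the resolution rules in `resolve_document` can be exercised directly; the recording paths below are hypothetical and the function is assumed importable from the package's `cli` module:

```python
from pathlib import Path

from code_recorder_processor.cli import resolve_document

docs = ["src/main.py", "notes/README.md"]

# Extension match: the template is .py and exactly one .py document exists
assert resolve_document(docs, Path("template.py"), None) == "src/main.py"

# An explicit override by filename wins regardless of extension
assert resolve_document(docs, Path("template.py"), "README.md") == "notes/README.md"

# Two .py documents with no override is ambiguous and raises ValueError
try:
    resolve_document(["a.py", "b.py"], Path("template.py"), None)
except ValueError as e:
    print(e)
```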
@@ -23,30 +323,54 @@ def main():
         help="Path to the initial template file that was recorded",
     )
     parser.add_argument(
+        "-t",
         "--time-limit",
         type=int,
         default=None,
-        help="Maximum allowed time in minutes between first and last edit. If exceeded, recording is flagged.",
+        help="Maximum allowed time in minutes between first and last edit. "
+        "If exceeded, recording is flagged.",
     )
     parser.add_argument(
+        "-d",
         "--document",
         type=str,
         default=None,
-        help=("Document path or filename to process from the recording. "
-              "Defaults to the document whose extension matches the template file."),
+        help="Document path or filename to process from the recording. "
+        "Defaults to the document whose extension matches the template file.",
     )
     parser.add_argument(
+        "-o",
         "--output-json",
         type=Path,
         default=None,
-        help="Path to output JSON file with verification results (time info and suspicious events).",
+        help="Path to output JSON file with verification results "
+        "(time info and suspicious events).",
+    )
+    parser.add_argument(
+        "-s",
+        "--show-autocomplete-details",
+        action="store_true",
+        help="Show individual auto-complete events in addition to "
+        "aggregate statistics",
     )
+    return parser
+
+
+def main() -> int:
+    """
+    Main entry point for the CLI application.

+    Returns
+    -------
+    int
+        Exit code (0 for success, 1 for errors)
+    """
+    parser = create_parser()
     args = parser.parse_args()

-    # Load JSONL file first to get document path
+    # Load JSONL file
     try:
-        jsonData = load_jsonl(args.jsonl_file)
+        json_data = load_jsonl(args.jsonl_file)
     except FileNotFoundError as e:
         print(f"Error: {e}", file=sys.stderr)
         return 1
@@ -54,123 +378,72 @@ def main():
         print(f"Error loading JSONL file: {e}", file=sys.stderr)
         return 1

-    # Decide which recorded document to process
-    documents = {e.get("document") for e in jsonData if "document" in e and e.get("document") is not None}
-    recorded_docs = sorted([d for d in documents if d is not None])
-
-    def resolve_document(docs: list[str], template_path: Path, override: str | None) -> str | None:
-        if not docs:
-            return None
-
-        if override:
-            matches = [d for d in docs if d.endswith(override) or Path(d).name == override]
-            if not matches:
-                raise ValueError(
-                    f"No document in recording matches '{override}'. Available: {docs}"
-                )
-            if len(matches) > 1:
-                raise ValueError(
-                    f"Ambiguous document override '{override}'. Matches: {matches}"
-                )
-            return matches[0]
-
-        template_ext = template_path.suffix
-        ext_matches = [d for d in docs if Path(d).suffix == template_ext]
-        if len(ext_matches) == 1:
-            return ext_matches[0]
-        if len(ext_matches) > 1:
-            raise ValueError(
-                f"Multiple documents share extension '{template_ext}': {ext_matches}. "
-                "Use --document to choose one."
-            )
-
-        if len(docs) == 1:
-            return docs[0]
-
-        raise ValueError(
-            "Could not determine document to process. Use --document to select one. "
-            f"Available documents: {docs}"
-        )
-
+    # Resolve which document to process
+    recorded_docs = get_recorded_documents(json_data)
     try:
-        target_document = resolve_document(recorded_docs, args.template_file, args.document)
+        target_document = resolve_document(
+            recorded_docs, args.template_file, args.document
+        )
     except ValueError as e:
         print(f"Error determining document: {e}", file=sys.stderr)
         return 1

-    if target_document:
-        doc_events = tuple(e for e in jsonData if e.get("document") == target_document)
-        if not doc_events:
-            print(f"Error: No events found for document '{target_document}'", file=sys.stderr)
-            return 1
-    else:
-        doc_events = jsonData
+    # Filter events for target document
+    doc_events = filter_events_by_document(json_data, target_document)
+    if target_document and not doc_events:
+        print(
+            f"Error: No events found for document '{target_document}'",
+            file=sys.stderr,
+        )
+        return 1

     print(f"Processing: {target_document or args.template_file}", file=sys.stderr)

     # Read template file
     try:
-        templateData = args.template_file.read_text()
+        template_data = args.template_file.read_text()
     except FileNotFoundError:
-        print(f"Error: Template file not found: {args.template_file}", file=sys.stderr)
+        print(
+            f"Error: Template file not found: {args.template_file}", file=sys.stderr
+        )
         return 1
     except Exception as e:
         print(f"Error reading template file: {e}", file=sys.stderr)
         return 1

-    # Check time limit and display elapsed time
+    # Check and display time information
     time_info = check_time_limit(doc_events, args.time_limit)
-    if time_info:
-        print(f"Elapsed editing time: {time_info['minutes_elapsed']} minutes", file=sys.stderr)
-        print(f"Time span (first to last edit): {(datetime.fromisoformat(time_info['last_timestamp'].replace('Z', '+00:00')) - datetime.fromisoformat(time_info['first_timestamp'].replace('Z', '+00:00'))).total_seconds() / 60:.2f} minutes", file=sys.stderr)
-        if time_info['exceeds_limit']:
-            print(f"\nTime limit exceeded!", file=sys.stderr)
-            print(f"  Limit: {time_info['time_limit_minutes']} minutes", file=sys.stderr)
-            print(f"  First edit: {time_info['first_timestamp']}", file=sys.stderr)
-            print(f"  Last edit: {time_info['last_timestamp']}", file=sys.stderr)
-
-    # Verify and process
+    display_time_info(time_info)
+
+    # Verify and process the recording
     try:
-        templateData, suspicious_events = verify(templateData, doc_events)
-        print(reconstruct_file_from_events(doc_events, templateData, document_path=target_document))
-
-        # Prepare results for JSON output
-        results = {
-            "document": target_document or str(args.template_file),
-            "time_info": time_info,
-            "suspicious_events": suspicious_events,
-        }
-
-        if suspicious_events:
-            print("\nSuspicious copy-paste events detected:", file=sys.stderr)
-            for ev in suspicious_events:
-                reason = ev.get('reason', 'unknown')
-                indices = ev.get('event_indices', [ev['event_index']])
-                if len(indices) > 1:
-                    print(f"  Events #{indices[0]}-#{indices[-1]} ({reason}): "
-                          f"{ev['line_count']} lines, {ev['char_count']} chars", file=sys.stderr)
-                else:
-                    print(f"  Event #{ev['event_index']} ({reason}): "
-                          f"{ev['line_count']} lines, {ev['char_count']} chars - "
-                          f"newFragment:\n```\n{ev['newFragment']}\n```", file=sys.stderr)
-        else:
-            print("Success! No suspicious events detected.", file=sys.stderr)
+        template_data, suspicious_events = verify(template_data, doc_events)
+        reconstructed = reconstruct_file_from_events(
+            doc_events, template_data, document_path=target_document
+        )
+        print(reconstructed)
+
+        # Display suspicious events
+        display_suspicious_events(suspicious_events, args.show_autocomplete_details)

         # Write JSON output if requested
         if args.output_json:
             try:
-                args.output_json.parent.mkdir(parents=True, exist_ok=True)
-                with open(args.output_json, 'w') as f:
-                    json.dump(results, f, indent=2)
-                print(f"Results written to {args.output_json}", file=sys.stderr)
+                write_json_output(
+                    args.output_json,
+                    target_document or str(args.template_file),
+                    time_info,
+                    suspicious_events,
+                )
             except Exception as e:
                 print(f"Error writing JSON output: {e}", file=sys.stderr)
                 return 1
+
     except ValueError as e:
         print("File failed verification from template!", file=sys.stderr)
         print(str(e), file=sys.stderr)
         try:
-            print(template_diff(templateData, doc_events), file=sys.stderr)
+            print(template_diff(template_data, doc_events), file=sys.stderr)
         except Exception:
             pass
         return 1
@@ -178,6 +451,8 @@ def main():
         print(f"Error processing file: {type(e).__name__}: {e}", file=sys.stderr)
         return 1

+    return 0
+

 if __name__ == "__main__":
     sys.exit(main())
--- a/cr_proc-0.1.3.dist-info/METADATA
+++ b/cr_proc-0.1.6.dist-info/METADATA
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: cr_proc
-Version: 0.1.3
-Summary: A tool for processing BYU CS code recording files
+Version: 0.1.6
+Summary: A tool for processing BYU CS code recording files.
 Author: Ethan Dye
 Author-email: mrtops03@gmail.com
 Requires-Python: >=3.14
--- /dev/null
+++ b/cr_proc-0.1.6.dist-info/RECORD
@@ -0,0 +1,9 @@
+code_recorder_processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+code_recorder_processor/api/build.py,sha256=-EMg0w-llblJ_N_vs_B1kOsAOwiV-TYetAXiOY6PcWs,7643
+code_recorder_processor/api/load.py,sha256=ZKoheLsEoGJ3fpAtPauoeEyNUhGLhUYSwjRsqt1m-TI,3947
+code_recorder_processor/api/verify.py,sha256=B7nFiLl_lRciUkQpcptX1t2-nO3YTh2gkPnSLk77Y2I,23772
+code_recorder_processor/cli.py,sha256=B4vz_V5ZCxa8eKyj17dxopWu9_z_6-iC_vgQwNpgXoc,14109
+cr_proc-0.1.6.dist-info/METADATA,sha256=VlrsimxGOWZOOziPunW7rec_YKY4kARH4X_VhcGz5mE,4070
+cr_proc-0.1.6.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
+cr_proc-0.1.6.dist-info/entry_points.txt,sha256=xb5dPAAWN1Z9NUHpvZgNakaslR1MVOERf_IfpG_M04M,77
+cr_proc-0.1.6.dist-info/RECORD,,
--- a/cr_proc-0.1.3.dist-info/RECORD
+++ /dev/null
@@ -1,9 +0,0 @@
-code_recorder_processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-code_recorder_processor/api/build.py,sha256=-EMg0w-llblJ_N_vs_B1kOsAOwiV-TYetAXiOY6PcWs,7643
-code_recorder_processor/api/load.py,sha256=lkEPnQi3Q_91GOTImk4H380F-uKJPszeX3FJJWM4CIA,3272
-code_recorder_processor/api/verify.py,sha256=pc465KVk9TfRndMqS9GQH2Bcv5GmwX_GZz4pwqFdaUA,14279
-code_recorder_processor/cli.py,sha256=sKm9f06NEZ3psw-HEShlHt4grVZvRmNEG33yvxhIIQQ,7154
-cr_proc-0.1.3.dist-info/METADATA,sha256=85_nqsqQnI91zQnylj4ikVKXxNdyTrSmeu9LZ6YpasI,4069
-cr_proc-0.1.3.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
-cr_proc-0.1.3.dist-info/entry_points.txt,sha256=xb5dPAAWN1Z9NUHpvZgNakaslR1MVOERf_IfpG_M04M,77
-cr_proc-0.1.3.dist-info/RECORD,,