PyPI - cr-proc - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

cr-proc 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

code_recorder_processor/api/build.py +6 -0
code_recorder_processor/api/document.py +300 -0
code_recorder_processor/api/load.py +58 -0
code_recorder_processor/api/output.py +70 -0
code_recorder_processor/api/verify.py +293 -83
code_recorder_processor/cli.py +523 -349
code_recorder_processor/display.py +201 -0
code_recorder_processor/playback.py +116 -0
cr_proc-0.1.9.dist-info/METADATA +280 -0
cr_proc-0.1.9.dist-info/RECORD +13 -0
cr_proc-0.1.7.dist-info/METADATA +0 -142
cr_proc-0.1.7.dist-info/RECORD +0 -9
{cr_proc-0.1.7.dist-info → cr_proc-0.1.9.dist-info}/WHEEL +0 -0
{cr_proc-0.1.7.dist-info → cr_proc-0.1.9.dist-info}/entry_points.txt +0 -0

code_recorder_processor/api/verify.py CHANGED Viewed

@@ -2,6 +2,13 @@ from typing import Any
 from datetime import datetime
 import difflib
+# ============================================================================
+# Constants for detection thresholds
+# ============================================================================
+MIN_WHITELIST_SIZE = 10  # Minimum fragment size to add to whitelist
+MIN_MULTILINE_SIZE = 20  # Minimum size for multiline external paste detection
+MIN_AUTOCOMPLETE_SIZE = 10  # Minimum size for autocomplete detection
+MIN_RAPID_PASTE_CHARS = 5  # Minimum chars for a "paste" in rapid detection
 def _normalize_newlines(text: str) -> str:
     """Normalize CRLF to LF to avoid offset and diff noise."""
@@ -121,22 +128,29 @@ def _build_document_states(jsonData: tuple[dict[str, Any], ...]) -> tuple[list[s
     existed in the document at each point in time. This allows detectors to
     check if pasted/autocompleted content already existed in the document.
+    Only processes edit events (type="edit" or no type field for backwards compatibility).
     Parameters
     ----------
     jsonData : tuple[dict[str, Any], ...]
-        The event data from the JSONL file
+        The event data from the JSONL file (all event types)
     Returns
     -------
     tuple[list[str], set[str]]
-        - List of document states (one per event, strings of full document content)
+        - List of document states (one per edit event, strings of full document content)
         - Set of all content fragments ever seen (whitelist for internal copy detection)
     """
+    from .load import is_edit_event
+    # Filter to only edit events
+    edit_events = [e for e in jsonData if is_edit_event(e)]
     document_states = []
     content_whitelist = set()
     current_state = ""
-    for idx, event in enumerate(jsonData):
+    for idx, event in enumerate(edit_events):
         old_frag = _normalize_newlines(event.get("oldFragment", ""))
         new_frag = _normalize_newlines(event.get("newFragment", ""))
         offset = event.get("offset", 0)
@@ -152,13 +166,13 @@ def _build_document_states(jsonData: tuple[dict[str, Any], ...]) -> tuple[list[s
         # Build whitelist of all content fragments seen
         # Add both old and new fragments to whitelist for comprehensive coverage
-        if len(old_frag) > 10:  # Ignore tiny fragments
+        if len(old_frag) > MIN_WHITELIST_SIZE:
             content_whitelist.add(old_frag)
-        if len(new_frag) > 10:
+        if len(new_frag) > MIN_WHITELIST_SIZE:
             content_whitelist.add(new_frag)
         # Also add the full document state to whitelist
-        if len(current_state) > 10:
+        if len(current_state) > MIN_WHITELIST_SIZE:
             content_whitelist.add(current_state)
     return document_states, content_whitelist
@@ -175,12 +189,14 @@ def _detect_multiline_external_pastes(
     Flags newFragments that are significant in length (more than one line)
     and do not appear to be copied from within the document itself.
+    Only processes edit events (type="edit" or no type field for backwards compatibility).
     Parameters
     ----------
     jsonData : tuple[dict[str, Any], ...]
-        The event data
+        The event data (all event types)
     document_states : list[str]
-        Full document state at each event
+        Full document state at each edit event
     content_whitelist : set[str]
         All content fragments ever seen in the document (for internal copy detection)
@@ -189,67 +205,81 @@ def _detect_multiline_external_pastes(
     list[dict[str, Any]]
         List of suspicious multi-line paste events.
     """
+    from .load import is_edit_event
+    # Filter to only edit events
+    edit_events = [e for e in jsonData if is_edit_event(e)]
     suspicious_events = []
-    for idx, event in enumerate(jsonData):
+    # Build whitelist incrementally to only include content from BEFORE each event
+    past_whitelist = set()
+    for idx, event in enumerate(edit_events):
         old_frag = _normalize_newlines(event.get("oldFragment", ""))
         new_frag = _normalize_newlines(event.get("newFragment", ""))
         # Skip if no actual change
         if new_frag == old_frag or new_frag.strip() == "":
-            continue
+            pass  # Still add to whitelist below
         # Only check multi-line content (more than 2 lines means at least 2 actual lines)
-        new_lines = new_frag.split("\n")
-        if len(new_lines) <= 2:  # Single line or line + empty
-            continue
-        # Check if the new content already existed in the document at any prior point
-        is_internal_copy = False
-        # Check against document state BEFORE this event
-        if idx > 0:
-            prior_state = document_states[idx - 1]
-            if new_frag in prior_state:
-                is_internal_copy = True
-        # Also check against whitelist of all content seen
-        if not is_internal_copy:
-            for hist_content in content_whitelist:
-                # Ignore tiny fragments
-                if len(hist_content) < 20:
-                    continue
-                # Require substantial overlap in size to count as an internal copy
-                similar_length = (
-                    len(hist_content) >= 0.8 * len(new_frag)
-                    and len(hist_content) <= 1.25 * len(new_frag)
-                )
+        elif len(new_frag.split("\n")) > 2:
+            new_lines = new_frag.split("\n")
-                if new_frag == hist_content:
-                    is_internal_copy = True
-                    break
+            # Check if the new content already existed in the document at any prior point
+            is_internal_copy = False
-                if new_frag in hist_content and similar_length:
+            # Check against document state BEFORE this event
+            if idx > 0:
+                prior_state = document_states[idx - 1]
+                if new_frag in prior_state:
                     is_internal_copy = True
-                    break
-                if hist_content in new_frag and similar_length:
-                    is_internal_copy = True
-                    break
+            # Also check against whitelist of content from BEFORE this event
+            if not is_internal_copy:
+                for hist_content in past_whitelist:
+                    # Ignore tiny fragments - multiline external pastes should be significant
+                    if len(hist_content) < MIN_MULTILINE_SIZE:
+                        continue
+                    # Require substantial overlap in size to count as an internal copy
+                    similar_length = (
+                        len(hist_content) >= 0.8 * len(new_frag)
+                        and len(hist_content) <= 1.25 * len(new_frag)
+                    )
+                    if new_frag == hist_content:
+                        is_internal_copy = True
+                        break
+                    if new_frag in hist_content and similar_length:
+                        is_internal_copy = True
+                        break
+                    if hist_content in new_frag and similar_length:
+                        is_internal_copy = True
+                        break
+            # Also check if it's in the old fragment (internal move/copy)
+            if not is_internal_copy and old_frag and (new_frag in old_frag or old_frag in new_frag):
+                is_internal_copy = True
-        # Also check if it's in the old fragment (internal move/copy)
-        if not is_internal_copy and old_frag and (new_frag in old_frag or old_frag in new_frag):
-            is_internal_copy = True
+            if not is_internal_copy:
+                suspicious_events.append({
+                    "event_index": idx,
+                    "line_count": len(new_lines),
+                    "char_count": len(new_frag),
+                    "reason": "multi-line external paste",
+                    "newFragment": new_frag
+                })
-        if not is_internal_copy:
-            suspicious_events.append({
-                "event_index": idx,
-                "line_count": len(new_lines),
-                "char_count": len(new_frag),
-                "reason": "multi-line external paste",
-                "newFragment": new_frag
-            })
+        # Add current event's content to whitelist for future events
+        if len(old_frag) > MIN_MULTILINE_SIZE:
+            past_whitelist.add(old_frag)
+        if len(new_frag) > MIN_MULTILINE_SIZE:
+            past_whitelist.add(new_frag)
+        if idx > 0 and len(document_states[idx - 1]) > MIN_MULTILINE_SIZE:
+            past_whitelist.add(document_states[idx - 1])
     return suspicious_events
@@ -261,14 +291,21 @@ def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[
     Identifies clusters of 3+ one-line paste events occurring within 1 second,
     which may indicate AI-assisted code generation.
+    Only processes edit events (type="edit" or no type field for backwards compatibility).
     Returns a list of suspicious rapid-paste events.
     """
+    from .load import is_edit_event
+    # Filter to only edit events
+    edit_events = [e for e in jsonData if is_edit_event(e)]
     suspicious_events = []
     # Track one-line paste events for rapid-paste detection
     one_line_pastes = []
-    for idx, event in enumerate(jsonData):
+    for idx, event in enumerate(edit_events):
         new_frag = _normalize_newlines(event.get("newFragment", ""))
         old_frag = _normalize_newlines(event.get("oldFragment", ""))
         timestamp = event.get("timestamp")
@@ -281,7 +318,7 @@ def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[
         new_lines = new_frag.split("\n")
         if len(new_lines) == 2:
             # Heuristic: if it's more than a few characters, it might be pasted
-            if len(new_frag.strip()) > 5:
+            if len(new_frag.strip()) > MIN_RAPID_PASTE_CHARS:
                 one_line_pastes.append({
                     "event_index": idx,
                     "timestamp": timestamp,
@@ -367,12 +404,14 @@ def _detect_fullline_autocomplete(
     - newFragment does NOT already exist in the document state
     - Event not already flagged as external copy-paste
+    Only processes edit events (type="edit" or no type field for backwards compatibility).
     Parameters
     ----------
     jsonData : tuple[dict[str, Any], ...]
-        The event data
+        The event data (all event types)
     document_states : list[str]
-        Full document state at each event
+        Full document state at each edit event
     content_whitelist : set[str]
         All content fragments ever seen in the document
     excluded_indices : set[int]
@@ -383,11 +422,20 @@ def _detect_fullline_autocomplete(
     list[dict[str, Any]]
         List of suspected multi-line auto-complete events.
     """
+    from .load import is_edit_event
+    # Filter to only edit events
+    edit_events = [e for e in jsonData if is_edit_event(e)]
     suspicious_events = []
-    for idx, event in enumerate(jsonData):
+    # Build whitelist incrementally to only include content from BEFORE each event
+    past_whitelist = set()
+    for idx, event in enumerate(edit_events):
         # Skip if already flagged by another detector
         if idx in excluded_indices:
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         old_frag = _normalize_newlines(event.get("oldFragment", ""))
@@ -395,6 +443,7 @@ def _detect_fullline_autocomplete(
         # Skip first event (template) and no-change events
         if idx == 0 or new_frag == old_frag:
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         old_len = len(old_frag)
@@ -403,6 +452,7 @@ def _detect_fullline_autocomplete(
         # At keystroke level, oldFragment is typically empty for insertions
         # Allow up to 3 chars for prefix-based triggers (e.g., "de" -> "def")
         if old_len > 3:
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # Check line count - we care about complete statements
@@ -417,10 +467,12 @@ def _detect_fullline_autocomplete(
         if not (is_single_line or is_multi_line):
             # Shouldn't happen, but skip if malformed
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # The new fragment should not be just whitespace
         if not new_frag.strip():
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # Check if the new fragment contains code structure indicators
@@ -443,21 +495,25 @@ def _detect_fullline_autocomplete(
         if not has_complete_statement:
             # No complete statement - skip basic identifier completion
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # Minimum size for meaningful completion
-        if new_len < 10:
+        if new_len < MIN_AUTOCOMPLETE_SIZE:
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # For multi-line: maximum size to distinguish from external pastes
         # External pastes are typically much larger (100+ chars)
         # Multi-line completions are usually 20-300 chars for a small function/block
         if is_multi_line and new_len > 300:
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # For single-line: could be larger due to chained methods or long statements
         # but cap at 200 chars to avoid flagging user-typed long lines
         if is_single_line and new_len > 200:
+            past_whitelist_update(idx, event, document_states, past_whitelist)
             continue
         # Check if this content already existed in the document state BEFORE this event
@@ -468,6 +524,28 @@ def _detect_fullline_autocomplete(
             if new_frag in prior_state:
                 is_internal_copy = True
+        # Also check against whitelist of content from BEFORE this event
+        if not is_internal_copy:
+            for hist_content in past_whitelist:
+                # Ignore tiny fragments
+                if len(hist_content) < MIN_AUTOCOMPLETE_SIZE:
+                    continue
+                # Check for exact match or significant overlap
+                if new_frag == hist_content:
+                    is_internal_copy = True
+                    break
+                # Check for substring matches with similar length
+                similar_length = (
+                    len(hist_content) >= 0.8 * len(new_frag)
+                    and len(hist_content) <= 1.25 * len(new_frag)
+                )
+                if (new_frag in hist_content or hist_content in new_frag) and similar_length:
+                    is_internal_copy = True
+                    break
         if not is_internal_copy:
             line_desc = "line" if is_single_line else "lines"
             suspicious_events.append({
@@ -478,9 +556,30 @@ def _detect_fullline_autocomplete(
                 "newFragment": new_frag,
             })
+        # Add current event's content to whitelist for future events
+        past_whitelist_update(idx, event, document_states, past_whitelist)
     return suspicious_events
+def past_whitelist_update(
+    idx: int,
+    event: dict[str, Any],
+    document_states: list[str],
+    past_whitelist: set[str]
+) -> None:
+    """Helper to update the past_whitelist with content from current event."""
+    old_frag = _normalize_newlines(event.get("oldFragment", ""))
+    new_frag = _normalize_newlines(event.get("newFragment", ""))
+    if len(old_frag) > MIN_AUTOCOMPLETE_SIZE:
+        past_whitelist.add(old_frag)
+    if len(new_frag) > MIN_AUTOCOMPLETE_SIZE:
+        past_whitelist.add(new_frag)
+    if idx < len(document_states) and len(document_states[idx]) > MIN_AUTOCOMPLETE_SIZE:
+        past_whitelist.add(document_states[idx])
 def detect_external_copypaste(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
     """
     Detect copy-paste events from external sources and AI-assisted coding patterns.
@@ -555,13 +654,20 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
     Check if the time between first and last edit exceeds the specified time limit.
     Tracks elapsed editing time across sessions by summing actual editing time within
-    each session (excluding gaps between sessions). For the time limit check, compares
-    the span from the first timestamp to the last timestamp overall.
+    each session (excluding gaps between sessions). Focus events (type="focusStatus")
+    are used to pause time tracking when the window loses focus for extended periods.
+    Time tracking behavior:
+    - Tracks actual editing time by looking at timestamps between edit events
+    - When a focusStatus event with focused=false is encountered, time tracking pauses
+    - Time tracking resumes when a focusStatus event with focused=true is encountered
+    - Gaps > 5 minutes while unfocused are excluded from time tracking
+    - Gaps <= 5 minutes are counted even when unfocused (student thinking/reviewing)
     Parameters
     ----------
     jsonData : tuple[dict[str, Any], ...]
-        The event data from the JSONL file
+        The event data from the JSONL file (all event types)
     time_limit_minutes : int | None
         Maximum allowed time in minutes between first and last overall edit.
         If None, no time limit is enforced.
@@ -578,25 +684,34 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
     def parse_ts(ts_str: str) -> datetime:
         return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+    # Separate edit events from focus events
+    from .load import is_edit_event
+    edit_events = [e for e in jsonData if is_edit_event(e)]
+    focus_events = [e for e in jsonData if e.get("type") == "focusStatus"]
+    if not edit_events:
+        return None
     # Identify session boundaries: sessions start at indices where offset == 0
     # (indicating file reopen/recording restart) and oldFragment == newFragment (initial snapshot)
     session_starts = [0]  # First session always starts at index 0
-    for idx in range(1, len(jsonData)):
-        offset = jsonData[idx].get("offset", -1)
-        old_frag = jsonData[idx].get("oldFragment", "")
-        new_frag = jsonData[idx].get("newFragment", "")
+    for idx in range(1, len(edit_events)):
+        offset = edit_events[idx].get("offset", -1)
+        old_frag = edit_events[idx].get("oldFragment", "")
+        new_frag = edit_events[idx].get("newFragment", "")
         # Session boundary: offset is 0 and it's an initial snapshot (old == new, non-empty)
         if offset == 0 and old_frag == new_frag and old_frag.strip() != "":
             session_starts.append(idx)
     # Add sentinel to mark end of last session
-    session_starts.append(len(jsonData))
+    session_starts.append(len(edit_events))
     # Find first and last timestamps overall
     first_timestamp_overall = None
     last_timestamp_overall = None
-    for event in jsonData:
+    for event in edit_events:
         if event.get("timestamp"):
             if first_timestamp_overall is None:
                 first_timestamp_overall = event["timestamp"]
@@ -606,34 +721,72 @@ def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: i
         # Not enough events with timestamps
         return None
+    # Build a focus status timeline from focus events
+    # Map timestamp -> focused (True/False)
+    focus_timeline: list[tuple[datetime, bool]] = []
+    for focus_event in focus_events:
+        if "timestamp" in focus_event and "focused" in focus_event:
+            try:
+                ts = parse_ts(focus_event["timestamp"])
+                focused = focus_event["focused"]
+                focus_timeline.append((ts, focused))
+            except (ValueError, KeyError):
+                continue
+    # Sort by timestamp
+    focus_timeline.sort(key=lambda x: x[0])
+    def is_focused_at(timestamp: datetime) -> bool:
+        """Check if the window was focused at the given timestamp."""
+        # Walk backwards through focus events to find the most recent state
+        for ts, focused in reversed(focus_timeline):
+            if ts <= timestamp:
+                return focused
+        # Default to focused if no prior focus event found
+        return True
     # Calculate elapsed time by summing editing time within each session
+    # with focus-aware gap handling
     total_minutes_elapsed = 0.0
+    UNFOCUSED_GAP_THRESHOLD_MINUTES = 5.0  # Don't count gaps > 5 min when unfocused
     for i in range(len(session_starts) - 1):
         session_start = session_starts[i]
         session_end = session_starts[i + 1]
-        # Find first and last events with timestamps in this session
-        first_event_time = None
-        last_event_time = None
+        # Collect all timestamped events in this session
+        session_events: list[tuple[datetime, int]] = []
         for idx in range(session_start, session_end):
-            event = jsonData[idx]
+            event = edit_events[idx]
             timestamp = event.get("timestamp")
             if timestamp:
                 try:
                     event_time = parse_ts(timestamp)
-                    if first_event_time is None:
-                        first_event_time = event_time
-                    last_event_time = event_time
+                    session_events.append((event_time, idx))
                 except (ValueError, KeyError):
-                    # Skip events with invalid timestamps
                     continue
-        # If this session has timestamped events, add its elapsed time
-        if first_event_time is not None and last_event_time is not None:
-            session_diff = last_event_time - first_event_time
-            total_minutes_elapsed += session_diff.total_seconds() / 60
+        if not session_events:
+            continue
+        # Sort by timestamp
+        session_events.sort(key=lambda x: x[0])
+        # Calculate time by summing gaps between consecutive events
+        for j in range(len(session_events) - 1):
+            current_time, _ = session_events[j]
+            next_time, _ = session_events[j + 1]
+            gap_seconds = (next_time - current_time).total_seconds()
+            gap_minutes = gap_seconds / 60
+            # Check focus status at the end of this gap (next_time)
+            # If unfocused and gap is large, don't count it
+            if not is_focused_at(next_time) and gap_minutes > UNFOCUSED_GAP_THRESHOLD_MINUTES:
+                # Skip this gap - student was away from editor
+                continue
+            total_minutes_elapsed += gap_minutes
     # For time limit check, use the span from first to last timestamp overall
     try:
@@ -681,3 +834,60 @@ def verify(template: str, jsonData: tuple[dict[str, Any], ...]) -> tuple[str, li
     suspicious_events = detect_external_copypaste(jsonData)
     return verified_template, suspicious_events
+def combine_time_info(
+    time_infos: list[dict[str, Any] | None], time_limit_minutes: int | None
+) -> dict[str, Any] | None:
+    """
+    Combine time information from multiple recording files.
+    Parameters
+    ----------
+    time_infos : list[dict[str, Any] | None]
+        List of time information dictionaries from multiple files
+    time_limit_minutes : int | None
+        Time limit to check against
+    Returns
+    -------
+    dict[str, Any] | None
+        Combined time information, or None if no valid data
+    """
+    valid_infos = [info for info in time_infos if info is not None]
+    if not valid_infos:
+        return None
+    # Sum elapsed times across all sessions
+    total_elapsed = sum(info["minutes_elapsed"] for info in valid_infos)
+    # Find overall first and last timestamps
+    all_timestamps = []
+    for info in valid_infos:
+        all_timestamps.append(
+            datetime.fromisoformat(info["first_timestamp"].replace("Z", "+00:00"))
+        )
+        all_timestamps.append(
+            datetime.fromisoformat(info["last_timestamp"].replace("Z", "+00:00"))
+        )
+    first_ts = min(all_timestamps)
+    last_ts = max(all_timestamps)
+    overall_span = (last_ts - first_ts).total_seconds() / 60
+    result = {
+        "time_limit_minutes": time_limit_minutes,
+        "minutes_elapsed": round(total_elapsed, 2),
+        "first_timestamp": first_ts.isoformat().replace("+00:00", "Z"),
+        "last_timestamp": last_ts.isoformat().replace("+00:00", "Z"),
+        "file_count": len(valid_infos),
+        "overall_span_minutes": round(overall_span, 2),
+    }
+    # For time limit check in combined mode, use the sum of elapsed times
+    if time_limit_minutes is not None:
+        result["exceeds_limit"] = total_elapsed > time_limit_minutes
+    else:
+        result["exceeds_limit"] = False
+    return result

cr-proc 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

cr-proc 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl