cr_proc 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cr_proc/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Code Recorder Processor - A tool for processing BYU CS code recording files."""
2
+ from importlib.metadata import version, PackageNotFoundError
3
+
4
# Start from the placeholder and overwrite it only when the installed
# distribution metadata for "cr_proc" can be found.
__version__ = "unknown"
try:
    __version__ = version("cr_proc")
except PackageNotFoundError:
    # No installed metadata for this package; keep the placeholder.
    pass
cr_proc/api/build.py ADDED
@@ -0,0 +1,184 @@
1
+ """Replay edit events to reconstruct document state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+ from ..timeutil import parse_timestamp
9
+ from .document import filter_events_by_document_with_rename_handling
10
+ from .load import filter_edit_events
11
+
12
+
13
+ def _normalize_newlines(text: str) -> str:
14
+ """Normalize CRLF to LF for stable replay and diff behavior."""
15
+ return text.replace("\r\n", "\n")
16
+
17
+
18
+ def _ordered_edit_events(events: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
19
+ decorated: list[tuple[int, object, dict[str, Any]]] = []
20
+ for index, event in enumerate(events):
21
+ timestamp = event.get("timestamp")
22
+ if timestamp:
23
+ try:
24
+ decorated.append((0, parse_timestamp(str(timestamp)), event))
25
+ continue
26
+ except ValueError:
27
+ pass
28
+ decorated.append((1, index, event))
29
+
30
+ decorated.sort(key=lambda item: (item[0], item[1]))
31
+ return [event for _, _, event in decorated]
32
+
33
+
34
+ def _utf16_units_to_index(text: str, units: int) -> int:
35
+ if units <= 0:
36
+ return 0
37
+
38
+ consumed = 0
39
+ index = 0
40
+ for char in text:
41
+ if consumed >= units:
42
+ break
43
+ consumed += 2 if ord(char) > 0xFFFF else 1
44
+ index += 1
45
+ return index
46
+
47
+
48
+ def _resolve_offset(document: str, old_fragment: str, offset: int, window: int) -> int:
49
+ if old_fragment == "":
50
+ return max(0, min(offset, len(document)))
51
+
52
+ if 0 <= offset <= len(document) and document[offset : offset + len(old_fragment)] == old_fragment:
53
+ return offset
54
+
55
+ start = max(0, offset - window)
56
+ end = min(len(document), offset + window + len(old_fragment))
57
+
58
+ best_match: tuple[int, int] | None = None
59
+ search_at = start
60
+ while True:
61
+ found = document.find(old_fragment, search_at, end)
62
+ if found == -1:
63
+ break
64
+ distance = abs(found - offset)
65
+ candidate = (distance, found)
66
+ if best_match is None or candidate < best_match:
67
+ best_match = candidate
68
+ search_at = found + 1
69
+
70
+ if best_match is None:
71
+ raise ValueError(
72
+ f"Old fragment not found near offset {offset}.\n"
73
+ f"old={old_fragment!r}\nnew fragment length={len(old_fragment)}"
74
+ )
75
+
76
+ return best_match[1]
77
+
78
+
79
def _apply_edit(
    document: str,
    *,
    old_fragment: str,
    new_fragment: str,
    offset: int,
    window: int,
    utf16_mode: bool,
) -> str:
    """Return ``document`` with one occurrence of ``old_fragment`` replaced.

    ``offset`` is first converted from UTF-16 code units to a string index
    when ``utf16_mode`` is set, then passed to ``_resolve_offset`` to find the
    replacement position; errors raised there propagate to the caller.
    """
    if utf16_mode:
        char_offset = _utf16_units_to_index(document, offset)
    else:
        char_offset = offset

    start = _resolve_offset(document, old_fragment, char_offset, window)
    stop = start + len(old_fragment)
    return "".join((document[:start], new_fragment, document[stop:]))
95
+
96
+
97
def reconstruct_file_from_events(
    events: tuple[dict[str, Any], ...],
    template: str,
    document_path: str | None = None,
    *,
    utf16_mode: bool = False,
    window: int = 200,
    normalize_newlines: bool = True,
    skip_unreplayable: bool = True,
) -> str:
    """Replay edit events on top of ``template`` and return the final text.

    Args:
        events: Raw recorder events; non-edit events are filtered out.
        template: Starting document text.
        document_path: Document whose edits are replayed. When omitted, the
            edit events must reference exactly one document.
        utf16_mode: Interpret event offsets as UTF-16 code units rather than
            string indices.
        window: How far around the recorded offset to search for the old
            fragment when it is not found at the exact position.
        normalize_newlines: Convert CRLF to LF in the template and in every
            fragment before replaying.
        skip_unreplayable: Warn on stderr and continue when an edit cannot be
            located, instead of raising.

    Returns:
        The reconstructed document text. The (optionally normalized) template
        is returned unchanged when there is nothing to replay.

    Raises:
        ValueError: If ``document_path`` is omitted and the edit events do not
            name exactly one document, or if an edit cannot be replayed while
            ``skip_unreplayable`` is false.
    """
    baseline = _normalize_newlines(template) if normalize_newlines else template

    edit_events = filter_edit_events(events)
    if not edit_events:
        return baseline

    target_document = document_path
    if target_document is None:
        recorded_docs = {
            str(event["document"])
            for event in edit_events
            if event.get("document") is not None
        }
        if len(recorded_docs) != 1:
            raise ValueError(
                "Ambiguous target document: provide document_path explicitly."
            )
        target_document = next(iter(recorded_docs))

    doc_events = tuple(
        filter_events_by_document_with_rename_handling(edit_events, target_document)
    )
    ordered_events = _ordered_edit_events(doc_events)
    if not ordered_events:
        return baseline

    document = baseline
    skipped = 0

    for event_index, event in enumerate(ordered_events):
        # A missing or null fragment means "no text". Converting None with
        # str() would splice the literal word "None" into the document.
        raw_old = event.get("oldFragment")
        raw_new = event.get("newFragment")
        old_fragment = "" if raw_old is None else str(raw_old)
        new_fragment = "" if raw_new is None else str(raw_new)
        if normalize_newlines:
            old_fragment = _normalize_newlines(old_fragment)
            new_fragment = _normalize_newlines(new_fragment)

        try:
            offset = int(event.get("offset", 0) or 0)
        except (TypeError, ValueError):
            offset = 0

        if old_fragment == new_fragment:
            # Identical fragments edit nothing. At offset 0 with non-empty
            # text the whole document is replaced by that text.
            # NOTE(review): presumably a full-document snapshot event;
            # confirm against the recorder format.
            if offset == 0 and old_fragment:
                document = old_fragment
            continue

        try:
            document = _apply_edit(
                document,
                old_fragment=old_fragment,
                new_fragment=new_fragment,
                offset=offset,
                window=window,
                utf16_mode=utf16_mode,
            )
        except ValueError as exc:
            if not skip_unreplayable:
                raise

            skipped += 1
            print(
                "Warning: "
                f"Skipping event #{event_index} "
                f"(timestamp: {event.get('timestamp', 'unknown')}): "
                f"{exc} - document offset may have drifted",
                file=sys.stderr,
            )

    if skipped:
        print(
            f"Warning: Skipped {skipped} event(s) due to offset drift",
            file=sys.stderr,
        )

    return document
cr_proc/api/document.py ADDED
@@ -0,0 +1,249 @@
1
+ """Document selection and template matching utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import difflib
6
+ from pathlib import Path, PurePosixPath, PureWindowsPath
7
+ from typing import Any
8
+
9
+
10
def normalize_path_string(path_str: str) -> str:
    """Return ``path_str`` rewritten with forward slashes.

    A string containing a backslash is parsed as a Windows path; anything
    else is parsed as a POSIX path.
    """
    flavour = PureWindowsPath if "\\" in path_str else PurePosixPath
    return flavour(path_str).as_posix()
15
+
16
+
17
def _path_parts(path_str: str) -> PurePosixPath:
    """Parse ``path_str`` (Windows or POSIX style) into a ``PurePosixPath``."""
    posix_text = normalize_path_string(path_str)
    return PurePosixPath(posix_text)
19
+
20
+
21
+ def _stem_candidates(stem: str) -> list[str]:
22
+ candidates = [stem]
23
+ current = stem
24
+ while True:
25
+ last_dash = current.rfind("-")
26
+ last_underscore = current.rfind("_")
27
+ split_index = max(last_dash, last_underscore)
28
+ if split_index <= 0:
29
+ break
30
+ current = current[:split_index]
31
+ if current not in candidates:
32
+ candidates.append(current)
33
+ return candidates
34
+
35
+
36
+ def _is_recording_artifact(path: Path) -> bool:
37
+ name = path.name.casefold()
38
+ return name.endswith(".recording.jsonl.gz") or name.endswith(".jsonl.gz") or name.endswith(".jsonl")
39
+
40
+
41
def get_normalized_document_key(doc_path: str) -> tuple[str, str]:
    """Return ``(file name, file suffix)`` for a Windows or POSIX style path."""
    normalized = _path_parts(doc_path)
    filename, extension = normalized.name, normalized.suffix
    return filename, extension
45
+
46
+
47
def group_documents_by_name(docs: list[str]) -> dict[tuple[str, str], list[str]]:
    """Bucket document paths by their ``(file name, suffix)`` key.

    Paths keep their input order inside each bucket.
    """
    grouped: dict[tuple[str, str], list[str]] = {}
    for doc in docs:
        key = get_normalized_document_key(doc)
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(doc)
    return grouped
53
+
54
+
55
def get_recorded_documents(events: tuple[dict[str, Any], ...]) -> list[str]:
    """Collect the distinct ``document`` values of ``events``, sorted.

    Events without a ``document`` key, or with a ``None`` value, are ignored;
    every other value is converted with ``str``.
    """
    unique_paths: set[str] = set()
    for event in events:
        if event.get("document") is None:
            continue
        unique_paths.add(str(event["document"]))
    return sorted(unique_paths)
64
+
65
+
66
def _matching_documents(recorded_docs: list[str], requested: str) -> list[str]:
    """Return the recorded documents that best match ``requested``.

    Matching tiers are tried in order and the first non-empty tier wins:

    1. identical normalized path (case-sensitive);
    2. same file name, ignoring case;
    3. same stem, ignoring case (and same suffix when ``requested`` has one);
    4. when ``requested`` has a suffix: the only recorded document with that
       suffix, or else the same-suffix document(s) whose file name is the
       closest fuzzy match.

    An empty list means nothing matched.
    """
    target = _path_parts(requested)

    # Tier 1: exact normalized path.
    target_posix = target.as_posix()
    tier = [doc for doc in recorded_docs if normalize_path_string(doc) == target_posix]
    if tier:
        return tier

    parsed = [(doc, _path_parts(doc)) for doc in recorded_docs]

    # Tier 2: file name only.
    wanted_name = target.name.casefold()
    tier = [doc for doc, parts in parsed if parts.name.casefold() == wanted_name]
    if tier:
        return tier

    # Tier 3: stem, plus suffix when one was requested.
    wanted_stem = target.stem.casefold()
    wanted_suffix = target.suffix.casefold()
    tier = [
        doc
        for doc, parts in parsed
        if parts.stem.casefold() == wanted_stem
        and (not wanted_suffix or parts.suffix.casefold() == wanted_suffix)
    ]
    if tier:
        return tier

    # Tier 4 is only attempted when the request carries a suffix.
    if not wanted_suffix:
        return []

    same_suffix = [
        (doc, parts) for doc, parts in parsed if parts.suffix.casefold() == wanted_suffix
    ]
    if len(same_suffix) == 1:
        return [same_suffix[0][0]]
    if same_suffix:
        names = [parts.name for _, parts in same_suffix]
        closest = difflib.get_close_matches(target.name, names, n=1, cutoff=0.4)
        if closest:
            return [doc for doc, parts in same_suffix if parts.name == closest[0]]

    return []
105
+
106
+
107
def filter_events_by_document(
    events: tuple[dict[str, Any], ...],
    document: str | None,
) -> tuple[dict[str, Any], ...]:
    """Keep the events whose normalized ``document`` path equals ``document``.

    ``events`` is returned as-is when ``document`` is ``None``.
    """
    if document is None:
        return events

    wanted = normalize_path_string(document)
    kept: list[dict[str, Any]] = []
    for event in events:
        recorded = str(event.get("document", ""))
        if normalize_path_string(recorded) == wanted:
            kept.append(event)
    return tuple(kept)
121
+
122
+
123
def filter_events_by_document_with_rename_handling(
    events: tuple[dict[str, Any], ...],
    document: str | None,
) -> tuple[dict[str, Any], ...]:
    """Keep the events belonging to the recorded document(s) matching ``document``.

    The recorded paths are resolved with ``_matching_documents``, so a
    differently located or differently named path can still be selected.
    ``events`` is returned as-is when ``document`` is ``None``; an empty
    tuple is returned when no recorded document matches.
    """
    if document is None:
        return events

    candidates = _matching_documents(get_recorded_documents(events), document)
    if not candidates:
        return ()

    accepted_paths = {normalize_path_string(doc) for doc in candidates}
    selected: list[dict[str, Any]] = []
    for event in events:
        recorded = normalize_path_string(str(event.get("document", "")))
        if recorded in accepted_paths:
            selected.append(event)
    return tuple(selected)
142
+
143
+
144
def find_matching_template(template_dir: Path, document_path: str) -> Path | None:
    """Pick the file in ``template_dir`` that best matches ``document_path``.

    Recorder output files are never candidates. Matching order:

    1. a file with exactly the document's file name;
    2. among files sharing the document's suffix (all files when it has
       none), one whose stem equals the document stem or one of its
       shortened forms from ``_stem_candidates``;
    3. the closest fuzzy match on file name, then on stem.

    Returns ``None`` when ``template_dir`` is not a directory, holds no
    candidate files, has no file with the document's suffix, or nothing is
    similar enough.
    """
    if not template_dir.is_dir():
        return None

    templates = sorted(
        entry
        for entry in template_dir.iterdir()
        if entry.is_file() and not _is_recording_artifact(entry)
    )
    if not templates:
        return None

    document = _path_parts(document_path)

    same_name = template_dir / document.name
    if same_name.is_file() and not _is_recording_artifact(same_name):
        return same_name

    if document.suffix:
        wanted_suffix = document.suffix.casefold()
        pool = [entry for entry in templates if entry.suffix.casefold() == wanted_suffix]
        if not pool:
            # A suffix was requested and no template shares it.
            return None
    else:
        pool = templates

    for stem in _stem_candidates(document.stem):
        for entry in pool:
            if entry.stem == stem:
                return entry

    closest_name = difflib.get_close_matches(
        document.name, [entry.name for entry in pool], n=1, cutoff=0.4
    )
    if closest_name:
        return template_dir / closest_name[0]

    closest_stem = difflib.get_close_matches(
        document.stem, [entry.stem for entry in pool], n=1, cutoff=0.4
    )
    if closest_stem:
        return next((entry for entry in pool if entry.stem == closest_stem[0]), None)

    return None
194
+
195
+
196
def resolve_document(
    recorded_docs: list[str],
    template_base: Path | None,
    document_override: str | None,
) -> str | None:
    """Decide which recorded document should be processed.

    Order of preference:

    1. ``None`` when nothing was recorded;
    2. the best recorded match for ``document_override``, or the override
       itself when nothing matches;
    3. the only recorded document;
    4. the best recorded match for ``template_base`` when it is a file;
    5. the only recorded document sharing ``template_base``'s suffix;
    6. the only recorded ``.py`` document;
    7. the first recorded document.
    """
    if not recorded_docs:
        return None

    if document_override:
        overridden = _matching_documents(recorded_docs, document_override)
        if overridden:
            return overridden[0]
        return document_override

    first_recorded = recorded_docs[0]
    if len(recorded_docs) == 1:
        return first_recorded

    if template_base:
        if template_base.is_file():
            by_template = _matching_documents(recorded_docs, str(template_base))
            if by_template:
                return by_template[0]

        if template_base.suffix:
            sharing_suffix = [
                doc
                for doc in recorded_docs
                if _path_parts(doc).suffix == template_base.suffix
            ]
            if len(sharing_suffix) == 1:
                return sharing_suffix[0]

    python_only = [doc for doc in recorded_docs if _path_parts(doc).suffix == ".py"]
    if len(python_only) == 1:
        return python_only[0]

    return first_recorded
229
+
230
+
231
def resolve_template_file(
    template_path: Path | None,
    template_dir: Path | None,
    document_path: str | None,
) -> tuple[str, Path | None]:
    """Load the template text for a document from a file or a template directory.

    ``template_dir`` takes precedence over ``template_path`` when both are
    given.

    Args:
        template_path: Explicit template file, used when no directory is given.
        template_dir: Directory searched with ``find_matching_template``.
        document_path: Recorded document the template should correspond to.

    Returns:
        ``(text, path)`` for the template that was read, or ``("", None)``
        when no template is configured or none matches.

    Raises:
        FileNotFoundError: If ``template_path`` is used and does not exist.
    """
    if template_dir is not None:
        if document_path is None:
            return "", None
        match = find_matching_template(template_dir, document_path)
        if match is None:
            return "", None
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, so the same template reads identically everywhere.
        return match.read_text(encoding="utf-8"), match

    if template_path is None:
        return "", None
    if not template_path.exists():
        raise FileNotFoundError(f"Template file not found: {template_path}")
    return template_path.read_text(encoding="utf-8"), template_path
cr_proc/api/load.py ADDED
@@ -0,0 +1,165 @@
1
+ """Utilities for loading and classifying recorder event streams."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import gzip
6
+ import json
7
+ import zlib
8
+ from gzip import BadGzipFile
9
+ from io import StringIO
10
+ from pathlib import Path
11
+ from typing import Any, TextIO
12
+
13
+
14
+ Event = dict[str, Any]
15
+ _GZIP_MAGIC = b"\x1f\x8b"
16
+
17
+
18
+ def _parse_jsonl(stream: TextIO) -> tuple[Event, ...]:
19
+ events: list[Event] = []
20
+ for line_number, raw_line in enumerate(stream, start=1):
21
+ line = raw_line.strip()
22
+ if not line:
23
+ continue
24
+ try:
25
+ payload = json.loads(line)
26
+ except json.JSONDecodeError as exc:
27
+ raise ValueError(f"Invalid JSON on line {line_number}: {exc.msg}") from exc
28
+ if not isinstance(payload, dict):
29
+ raise ValueError(f"Line {line_number} must decode to a JSON object")
30
+ events.append(payload)
31
+
32
+ if not events:
33
+ raise ValueError("JSONL file is empty")
34
+
35
+ return tuple(events)
36
+
37
+
38
+ def _salvage_gzip_text(raw: bytes) -> str | None:
39
+ """Best-effort recovery for truncated or partially corrupted gzip data."""
40
+ chunks: list[bytes] = []
41
+ remaining = raw
42
+
43
+ while remaining:
44
+ decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
45
+ try:
46
+ chunk = decompressor.decompress(remaining) + decompressor.flush()
47
+ except zlib.error:
48
+ break
49
+
50
+ if chunk:
51
+ chunks.append(chunk)
52
+
53
+ if not decompressor.unused_data:
54
+ break
55
+ if decompressor.unused_data == remaining:
56
+ break
57
+
58
+ remaining = decompressor.unused_data
59
+
60
+ if not chunks:
61
+ return None
62
+
63
+ return b"".join(chunks).decode("utf-8", errors="replace")
64
+
65
+
66
def _decode_jsonl_bytes(raw: bytes, *, prefer_gzip: bool) -> tuple[Event, ...]:
    """Decode raw recording bytes into events, trying gzip and then plain text.

    With ``prefer_gzip`` set, the bytes are first decompressed as gzip. If
    that fails, a best-effort salvage of damaged gzip data is attempted. In
    all remaining cases the bytes are parsed as plain UTF-8 JSONL.

    Raises:
        IOError: If no decoding strategy yields a valid event stream. The
            message names the strategies that were tried.
    """
    gzip_error: Exception | None = None
    text_error: Exception | None = None

    if prefer_gzip:
        try:
            text = gzip.decompress(raw).decode("utf-8")
            return _parse_jsonl(StringIO(text))
        except (
            BadGzipFile,
            EOFError,
            OSError,
            UnicodeDecodeError,
            ValueError,
            # A corrupt deflate stream surfaces as zlib.error, which is not a
            # subclass of any exception above. Without it the salvage and
            # plain-text fallbacks below would never run.
            zlib.error,
        ) as exc:
            gzip_error = exc

        salvaged_text = _salvage_gzip_text(raw)
        if salvaged_text is not None:
            try:
                return _parse_jsonl(StringIO(salvaged_text))
            except ValueError as exc:
                gzip_error = exc

    try:
        text = raw.decode("utf-8")
        return _parse_jsonl(StringIO(text))
    except (UnicodeDecodeError, ValueError) as exc:
        text_error = exc

    if gzip_error is not None and text_error is not None:
        raise IOError(
            f"Unable to decode recording as gzip ({gzip_error}) "
            f"or plain text ({text_error})"
        ) from gzip_error
    if gzip_error is not None:
        raise IOError(f"Unable to decode recording as gzip: {gzip_error}") from gzip_error

    assert text_error is not None
    raise IOError(f"Unable to decode recording as plain text: {text_error}") from text_error
106
+
107
+
108
def load_jsonl(file: Path) -> tuple[Event, ...]:
    """Read a recorder file that is either plain or gzip-compressed JSONL.

    Gzip decoding is attempted first when the content starts with the gzip
    magic bytes or the file name ends in ``.gz``.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
        ValueError: If ``file`` is not a regular file or is empty.
        IOError: If the file cannot be read or decoded.
    """
    if not file.exists():
        raise FileNotFoundError(f"File not found: {file}")
    if not file.is_file():
        raise ValueError(f"Path is not a file: {file}")

    try:
        payload = file.read_bytes()
    except OSError as exc:
        raise IOError(f"Error reading file: {exc}") from exc

    if len(payload) == 0:
        raise ValueError("JSONL file is empty")

    looks_gzipped = file.suffix == ".gz" or payload.startswith(_GZIP_MAGIC)
    return _decode_jsonl_bytes(payload, prefer_gzip=looks_gzipped)
125
+
126
+
127
def is_edit_event(event: Event) -> bool:
    """Tell whether ``event`` is an edit.

    Events typed ``"edit"`` qualify, and so do events with no ``type`` at
    all (missing key or ``None``).
    """
    kind = event.get("type")
    return kind is None or kind == "edit"
130
+
131
+
132
def filter_edit_events(events: tuple[Event, ...]) -> tuple[Event, ...]:
    """Select the edit events from a recording.

    When both typed (``"edit"``) and untyped edit events are present and a
    typed one appears first, only the typed events are returned. In every
    other case all edit events are returned in their original order.
    """
    edit_events = tuple(filter(is_edit_event, events))
    if not edit_events:
        return ()

    # After filtering, every kind is either "edit" or None.
    kinds = [event.get("type") for event in edit_events]
    typed_leads = (
        "edit" in kinds
        and None in kinds
        and kinds.index("edit") < kinds.index(None)
    )
    if typed_leads:
        return tuple(
            event for event, kind in zip(edit_events, kinds) if kind == "edit"
        )

    return edit_events
155
+
156
+
157
def get_focus_events(events: tuple[Event, ...]) -> tuple[Event, ...]:
    """Select the events whose ``type`` is ``"focusStatus"``, in input order."""
    focus = [event for event in events if event.get("type") == "focusStatus"]
    return tuple(focus)
160
+
161
+
162
def get_editors(events: tuple[Event, ...]) -> set[str]:
    """Collect the distinct truthy ``editor`` values found in ``events``.

    Falls back to ``{"jetbrains"}`` when no event names an editor.
    """
    seen: set[str] = set()
    for event in events:
        if event.get("editor"):
            seen.add(str(event["editor"]))
    if seen:
        return seen
    return {"jetbrains"}
cr_proc/api/output.py ADDED
@@ -0,0 +1,75 @@
1
+ """JSON output helpers for processed recordings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from .document import normalize_path_string
11
+
12
+
13
+ def _sanitize_time_info(time_info: dict[str, Any] | None) -> dict[str, Any] | None:
14
+ if time_info is None:
15
+ return None
16
+ return {key: value for key, value in time_info.items() if key != "active_intervals"}
17
+
18
+
19
def write_batch_json_output(
    output_path: Path,
    results: list[dict[str, Any]],
    combined_time_info: dict[str, Any] | None,
    all_verified: bool,
    batch_mode: bool = False,
    submitted_all_passed: bool | None = None,
    version: str | None = None,
) -> None:
    """Serialize processing results to ``output_path`` as indented JSON.

    One entry is written per result under ``files``, plus summary fields.
    ``active_intervals`` is stripped from every time-info dict. Optional
    keys (``editors``, ``warnings``, ``submitted_comparison``,
    ``combined_time_info``, ``submitted_all_passed``) are written only when
    they carry a value. Missing parent directories are created, and a
    confirmation line is printed to stderr.
    """
    all_editors: set[str] = set()
    files_data: list[dict[str, Any]] = []
    verified_count = 0

    for result in results:
        editors = list(result.get("editors", []))
        all_editors.update(editors)

        entry: dict[str, Any] = {
            "jsonl_file": normalize_path_string(str(result["jsonl_file"])),
            "document": result.get("target_document"),
            "verified": result.get("verified", False),
            "time_info": _sanitize_time_info(result.get("time_info")),
            "suspicious_events": result.get("suspicious_events", []),
            "template_diff": result.get("template_diff", ""),
            "reconstructed_code": result.get("reconstructed", ""),
        }
        if editors:
            entry["editors"] = editors
        if result.get("warnings"):
            entry["warnings"] = result["warnings"]
        if result.get("submitted_comparison") is not None:
            entry["submitted_comparison"] = result["submitted_comparison"]
        files_data.append(entry)

        if result.get("verified"):
            verified_count += 1

    payload: dict[str, Any] = {
        "version": version,
        "batch_mode": batch_mode,
        "total_files": len(results),
        "verified_count": verified_count,
        "all_verified": all_verified,
        "files": files_data,
    }
    if combined_time_info is not None:
        payload["combined_time_info"] = _sanitize_time_info(combined_time_info)
    if all_editors:
        payload["editors"] = sorted(all_editors)
    if submitted_all_passed is not None:
        payload["submitted_all_passed"] = submitted_all_passed

    output_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2)
    output_path.write_text(serialized)

    message = (
        f"Batch results written to {output_path}"
        if batch_mode
        else f"Results written to {output_path}"
    )
    print(message, file=sys.stderr)