cr_proc 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cr_proc/__init__.py +7 -0
- cr_proc/api/build.py +184 -0
- cr_proc/api/document.py +249 -0
- cr_proc/api/load.py +165 -0
- cr_proc/api/output.py +75 -0
- cr_proc/api/verify.py +672 -0
- cr_proc/cli.py +556 -0
- cr_proc/display.py +157 -0
- cr_proc/playback.py +559 -0
- cr_proc/timeutil.py +31 -0
- cr_proc-0.2.0.dist-info/METADATA +247 -0
- cr_proc-0.2.0.dist-info/RECORD +14 -0
- cr_proc-0.2.0.dist-info/WHEEL +4 -0
- cr_proc-0.2.0.dist-info/entry_points.txt +3 -0
cr_proc/__init__.py
ADDED
cr_proc/api/build.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Replay edit events to reconstruct document state."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ..timeutil import parse_timestamp
|
|
9
|
+
from .document import filter_events_by_document_with_rename_handling
|
|
10
|
+
from .load import filter_edit_events
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize_newlines(text: str) -> str:
|
|
14
|
+
"""Normalize CRLF to LF for stable replay and diff behavior."""
|
|
15
|
+
return text.replace("\r\n", "\n")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _ordered_edit_events(events: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
|
|
19
|
+
decorated: list[tuple[int, object, dict[str, Any]]] = []
|
|
20
|
+
for index, event in enumerate(events):
|
|
21
|
+
timestamp = event.get("timestamp")
|
|
22
|
+
if timestamp:
|
|
23
|
+
try:
|
|
24
|
+
decorated.append((0, parse_timestamp(str(timestamp)), event))
|
|
25
|
+
continue
|
|
26
|
+
except ValueError:
|
|
27
|
+
pass
|
|
28
|
+
decorated.append((1, index, event))
|
|
29
|
+
|
|
30
|
+
decorated.sort(key=lambda item: (item[0], item[1]))
|
|
31
|
+
return [event for _, _, event in decorated]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _utf16_units_to_index(text: str, units: int) -> int:
|
|
35
|
+
if units <= 0:
|
|
36
|
+
return 0
|
|
37
|
+
|
|
38
|
+
consumed = 0
|
|
39
|
+
index = 0
|
|
40
|
+
for char in text:
|
|
41
|
+
if consumed >= units:
|
|
42
|
+
break
|
|
43
|
+
consumed += 2 if ord(char) > 0xFFFF else 1
|
|
44
|
+
index += 1
|
|
45
|
+
return index
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _resolve_offset(document: str, old_fragment: str, offset: int, window: int) -> int:
|
|
49
|
+
if old_fragment == "":
|
|
50
|
+
return max(0, min(offset, len(document)))
|
|
51
|
+
|
|
52
|
+
if 0 <= offset <= len(document) and document[offset : offset + len(old_fragment)] == old_fragment:
|
|
53
|
+
return offset
|
|
54
|
+
|
|
55
|
+
start = max(0, offset - window)
|
|
56
|
+
end = min(len(document), offset + window + len(old_fragment))
|
|
57
|
+
|
|
58
|
+
best_match: tuple[int, int] | None = None
|
|
59
|
+
search_at = start
|
|
60
|
+
while True:
|
|
61
|
+
found = document.find(old_fragment, search_at, end)
|
|
62
|
+
if found == -1:
|
|
63
|
+
break
|
|
64
|
+
distance = abs(found - offset)
|
|
65
|
+
candidate = (distance, found)
|
|
66
|
+
if best_match is None or candidate < best_match:
|
|
67
|
+
best_match = candidate
|
|
68
|
+
search_at = found + 1
|
|
69
|
+
|
|
70
|
+
if best_match is None:
|
|
71
|
+
raise ValueError(
|
|
72
|
+
f"Old fragment not found near offset {offset}.\n"
|
|
73
|
+
f"old={old_fragment!r}\nnew fragment length={len(old_fragment)}"
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
return best_match[1]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _apply_edit(
    document: str,
    *,
    old_fragment: str,
    new_fragment: str,
    offset: int,
    window: int,
    utf16_mode: bool,
) -> str:
    """Return *document* with one occurrence of *old_fragment* replaced.

    The recorded *offset* is first converted from UTF-16 code units when
    *utf16_mode* is set, then passed to ``_resolve_offset`` together with
    *window* to locate the fragment. A ``ValueError`` raised by that lookup
    propagates to the caller.
    """
    if utf16_mode:
        text_offset = _utf16_units_to_index(document, offset)
    else:
        text_offset = offset

    start = _resolve_offset(document, old_fragment, text_offset, window)
    stop = start + len(old_fragment)
    return "".join((document[:start], new_fragment, document[stop:]))
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def reconstruct_file_from_events(
    events: tuple[dict[str, Any], ...],
    template: str,
    document_path: str | None = None,
    *,
    utf16_mode: bool = False,
    window: int = 200,
    normalize_newlines: bool = True,
    skip_unreplayable: bool = True,
) -> str:
    """Replay edit events on top of *template* and return the resulting text.

    Args:
        events: Recorder events; non-edit events are filtered out first.
        template: Starting text of the document.
        document_path: Document to replay. When omitted, the edit events must
            name exactly one document.
        utf16_mode: Interpret event offsets as UTF-16 code units.
        window: How far around the recorded offset to look for the old
            fragment when it is not at the exact position.
        normalize_newlines: Convert CRLF to LF in the template and in every
            fragment before replaying.
        skip_unreplayable: When true, an edit that cannot be located is
            reported on stderr and skipped; when false, the error is raised.

    Returns:
        The document text after all replayable edits have been applied. The
        (optionally newline-normalized) template is returned unchanged when
        there are no edit events for the target document.

    Raises:
        ValueError: If *document_path* is omitted and the edit events do not
            reference exactly one document, or if an edit cannot be applied
            and *skip_unreplayable* is false.
    """

    def _clean(text: str) -> str:
        return _normalize_newlines(text) if normalize_newlines else text

    def _fragment(event: dict[str, Any], key: str) -> str:
        # A JSON null must count as "no text"; str(None) would inject the
        # literal word "None" into the replayed document.
        value = event.get(key)
        return _clean("" if value is None else str(value))

    edit_events = filter_edit_events(events)
    if not edit_events:
        return _clean(template)

    target_document = document_path
    if target_document is None:
        recorded_docs = {
            str(event["document"])
            for event in edit_events
            if event.get("document") is not None
        }
        if len(recorded_docs) != 1:
            raise ValueError(
                "Ambiguous target document: provide document_path explicitly."
            )
        target_document = next(iter(recorded_docs))

    doc_events = tuple(
        filter_events_by_document_with_rename_handling(edit_events, target_document)
    )
    ordered_events = _ordered_edit_events(doc_events)
    document = _clean(template)
    if not ordered_events:
        return document

    skipped = 0
    for event_index, event in enumerate(ordered_events):
        old_fragment = _fragment(event, "oldFragment")
        new_fragment = _fragment(event, "newFragment")

        try:
            offset = int(event.get("offset", 0) or 0)
        except (TypeError, ValueError):
            offset = 0

        if old_fragment == new_fragment:
            # Identical fragments change nothing. At offset 0 a non-empty
            # one is treated as a whole-document snapshot and replaces the
            # replayed text.
            if offset == 0 and old_fragment:
                document = old_fragment
            continue

        try:
            document = _apply_edit(
                document,
                old_fragment=old_fragment,
                new_fragment=new_fragment,
                offset=offset,
                window=window,
                utf16_mode=utf16_mode,
            )
        except ValueError as exc:
            if not skip_unreplayable:
                raise

            skipped += 1
            print(
                "Warning: "
                f"Skipping event #{event_index} "
                f"(timestamp: {event.get('timestamp', 'unknown')}): "
                f"{exc} - document offset may have drifted",
                file=sys.stderr,
            )

    if skipped:
        print(
            f"Warning: Skipped {skipped} event(s) due to offset drift",
            file=sys.stderr,
        )

    return document
|
cr_proc/api/document.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Document selection and template matching utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import difflib
|
|
6
|
+
from pathlib import Path, PurePosixPath, PureWindowsPath
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def normalize_path_string(path_str: str) -> str:
    """Return *path_str* rendered with forward slashes.

    A string containing a backslash is parsed as a Windows path; anything
    else is parsed as a POSIX path. The result is that path's ``as_posix()``
    form.
    """
    flavour = PureWindowsPath if "\\" in path_str else PurePosixPath
    return flavour(path_str).as_posix()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _path_parts(path_str: str) -> PurePosixPath:
    """Parse *path_str* into a ``PurePosixPath`` after separator normalization."""
    posix_text = normalize_path_string(path_str)
    return PurePosixPath(posix_text)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _stem_candidates(stem: str) -> list[str]:
|
|
22
|
+
candidates = [stem]
|
|
23
|
+
current = stem
|
|
24
|
+
while True:
|
|
25
|
+
last_dash = current.rfind("-")
|
|
26
|
+
last_underscore = current.rfind("_")
|
|
27
|
+
split_index = max(last_dash, last_underscore)
|
|
28
|
+
if split_index <= 0:
|
|
29
|
+
break
|
|
30
|
+
current = current[:split_index]
|
|
31
|
+
if current not in candidates:
|
|
32
|
+
candidates.append(current)
|
|
33
|
+
return candidates
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _is_recording_artifact(path: Path) -> bool:
|
|
37
|
+
name = path.name.casefold()
|
|
38
|
+
return name.endswith(".recording.jsonl.gz") or name.endswith(".jsonl.gz") or name.endswith(".jsonl")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_normalized_document_key(doc_path: str) -> tuple[str, str]:
    """Return ``(file name, suffix)`` for a recorded document path.

    The path is separator-normalized first; no case folding is applied to
    either element.
    """
    parsed = _path_parts(doc_path)
    filename, extension = parsed.name, parsed.suffix
    return filename, extension
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def group_documents_by_name(docs: list[str]) -> dict[tuple[str, str], list[str]]:
    """Bucket document paths by their ``(file name, suffix)`` key.

    Keys come from ``get_normalized_document_key``; within a bucket the
    paths keep the order they had in *docs*.
    """
    grouped: dict[tuple[str, str], list[str]] = {}
    for doc in docs:
        key = get_normalized_document_key(doc)
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(doc)
    return grouped
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_recorded_documents(events: tuple[dict[str, Any], ...]) -> list[str]:
    """Return the distinct ``document`` values in *events*, sorted.

    Events whose ``document`` is missing or ``None`` are ignored; every
    other value is converted with ``str``.
    """
    unique: set[str] = set()
    for event in events:
        doc = event.get("document")
        if doc is not None:
            unique.add(str(doc))
    return sorted(unique)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _matching_documents(recorded_docs: list[str], requested: str) -> list[str]:
    """Return the recorded documents that best correspond to *requested*.

    Matching strategies are tried from strictest to loosest and the first
    one that yields anything wins:

    1. identical normalized path (case-sensitive);
    2. identical file name, ignoring case;
    3. identical stem, ignoring case (and identical suffix when *requested*
       has one);
    4. only when *requested* has a suffix: the single recorded document with
       that suffix, or, if several share it, the ones whose file name is the
       closest ``difflib`` match (cutoff 0.4).

    An empty list means nothing matched.
    """
    target = _path_parts(requested)
    target_posix = target.as_posix()
    target_name = target.name.casefold()
    target_stem = target.stem.casefold()
    target_suffix = target.suffix.casefold()

    exact = [doc for doc in recorded_docs if normalize_path_string(doc) == target_posix]
    if exact:
        return exact

    # Parse each recorded path once for the looser comparisons below.
    parsed = [(doc, _path_parts(doc)) for doc in recorded_docs]

    same_name = [doc for doc, parts in parsed if parts.name.casefold() == target_name]
    if same_name:
        return same_name

    same_stem = []
    for doc, parts in parsed:
        if parts.stem.casefold() != target_stem:
            continue
        if target_suffix and parts.suffix.casefold() != target_suffix:
            continue
        same_stem.append(doc)
    if same_stem:
        return same_stem

    if not target_suffix:
        return []

    same_suffix = [(doc, parts) for doc, parts in parsed if parts.suffix.casefold() == target_suffix]
    if len(same_suffix) == 1:
        return [same_suffix[0][0]]
    if same_suffix:
        names = [parts.name for _, parts in same_suffix]
        closest = difflib.get_close_matches(target.name, names, n=1, cutoff=0.4)
        if closest:
            return [doc for doc, parts in same_suffix if parts.name == closest[0]]

    return []
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def filter_events_by_document(
    events: tuple[dict[str, Any], ...],
    document: str | None,
) -> tuple[dict[str, Any], ...]:
    """Keep only the events recorded for *document*.

    Paths are compared after separator normalization. When *document* is
    ``None`` the input tuple is returned as-is.
    """
    if document is None:
        return events

    wanted = normalize_path_string(document)
    kept: list[dict[str, Any]] = []
    for event in events:
        recorded = str(event.get("document", ""))
        if normalize_path_string(recorded) == wanted:
            kept.append(event)
    return tuple(kept)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def filter_events_by_document_with_rename_handling(
    events: tuple[dict[str, Any], ...],
    document: str | None,
) -> tuple[dict[str, Any], ...]:
    """Keep the events of whichever recorded document(s) best match *document*.

    The recorded document paths are matched against *document* with
    ``_matching_documents``, which also accepts looser matches than an
    identical path. Returns the input unchanged when *document* is ``None``
    and an empty tuple when nothing matches.
    """
    if document is None:
        return events

    candidates = _matching_documents(get_recorded_documents(events), document)
    if not candidates:
        return ()

    accepted = {normalize_path_string(doc) for doc in candidates}
    kept = [
        event
        for event in events
        if normalize_path_string(str(event.get("document", ""))) in accepted
    ]
    return tuple(kept)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def find_matching_template(template_dir: Path, document_path: str) -> Path | None:
    """Pick the file in *template_dir* that best matches *document_path*.

    Only regular files directly inside *template_dir* that are not recording
    artifacts are considered. Strategies, in order:

    1. a file with exactly the document's file name;
    2. among files with the document's suffix (ignoring case; all files when
       the document has no suffix), the first whose stem equals the document
       stem or one of its shortened ``_stem_candidates``;
    3. the closest ``difflib`` match on file name, then on stem (cutoff 0.4).

    Returns ``None`` when *template_dir* is not a directory, holds no
    candidate files, holds no file with the document's suffix, or nothing is
    similar enough.
    """
    if not template_dir.is_dir():
        return None

    pool = sorted(
        entry
        for entry in template_dir.iterdir()
        if entry.is_file() and not _is_recording_artifact(entry)
    )
    if not pool:
        return None

    document = _path_parts(document_path)
    direct = template_dir / document.name
    if direct.is_file() and not _is_recording_artifact(direct):
        return direct

    if document.suffix:
        wanted_suffix = document.suffix.casefold()
        narrowed = [entry for entry in pool if entry.suffix.casefold() == wanted_suffix]
    else:
        narrowed = pool

    # Stem comparison is case-sensitive; longer stem candidates are tried first.
    for stem in _stem_candidates(document.stem):
        for entry in narrowed:
            if entry.stem == stem:
                return entry

    if document.suffix and not narrowed:
        return None
    fuzzy_pool = narrowed or pool

    by_name = difflib.get_close_matches(
        document.name, [entry.name for entry in fuzzy_pool], n=1, cutoff=0.4
    )
    if by_name:
        return template_dir / by_name[0]

    by_stem = difflib.get_close_matches(
        document.stem, [entry.stem for entry in fuzzy_pool], n=1, cutoff=0.4
    )
    if by_stem:
        return next((entry for entry in fuzzy_pool if entry.stem == by_stem[0]), None)

    return None
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def resolve_document(
    recorded_docs: list[str],
    template_base: Path | None,
    document_override: str | None,
) -> str | None:
    """Decide which recorded document should be processed.

    Precedence:

    1. no recorded documents: ``None``;
    2. *document_override*: its first match among the recorded documents,
       or the override itself when nothing matches;
    3. a single recorded document: that document;
    4. *template_base* is an existing file: its first match;
    5. *template_base* has a suffix shared by exactly one recorded document
       (case-sensitive comparison): that document;
    6. exactly one recorded ``.py`` document: that document;
    7. otherwise the first recorded document.
    """
    if not recorded_docs:
        return None

    if document_override:
        candidates = _matching_documents(recorded_docs, document_override)
        if candidates:
            return candidates[0]
        return document_override

    if len(recorded_docs) == 1:
        return recorded_docs[0]

    if template_base:
        if template_base.is_file():
            candidates = _matching_documents(recorded_docs, str(template_base))
            if candidates:
                return candidates[0]

        if template_base.suffix:
            sharing_suffix = [
                doc
                for doc in recorded_docs
                if _path_parts(doc).suffix == template_base.suffix
            ]
            if len(sharing_suffix) == 1:
                return sharing_suffix[0]

    python_only = [doc for doc in recorded_docs if _path_parts(doc).suffix == ".py"]
    if len(python_only) == 1:
        return python_only[0]

    return recorded_docs[0]
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def resolve_template_file(
    template_path: Path | None,
    template_dir: Path | None,
    document_path: str | None,
) -> tuple[str, Path | None]:
    """Load the template text for a document.

    Args:
        template_path: Explicit template file. Used only when *template_dir*
            is ``None``.
        template_dir: Directory to search with ``find_matching_template``.
            Takes precedence over *template_path*.
        document_path: Recorded document the template should correspond to;
            needed for the directory search.

    Returns:
        ``(text, path)`` for the template that was read, or ``("", None)``
        when no template is configured or none matches.

    Raises:
        FileNotFoundError: If *template_path* is given but does not exist.
    """
    # Templates are read as UTF-8 explicitly, the same encoding the loader
    # uses for recordings, instead of the platform's locale default.
    if template_dir is not None:
        if document_path is None:
            return "", None
        match = find_matching_template(template_dir, document_path)
        if match is None:
            return "", None
        return match.read_text(encoding="utf-8"), match

    if template_path is None:
        return "", None
    if not template_path.exists():
        raise FileNotFoundError(f"Template file not found: {template_path}")
    return template_path.read_text(encoding="utf-8"), template_path
|
cr_proc/api/load.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Utilities for loading and classifying recorder event streams."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import gzip
|
|
6
|
+
import json
|
|
7
|
+
import zlib
|
|
8
|
+
from gzip import BadGzipFile
|
|
9
|
+
from io import StringIO
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, TextIO
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
Event = dict[str, Any]
|
|
15
|
+
_GZIP_MAGIC = b"\x1f\x8b"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _parse_jsonl(stream: TextIO) -> tuple[Event, ...]:
|
|
19
|
+
events: list[Event] = []
|
|
20
|
+
for line_number, raw_line in enumerate(stream, start=1):
|
|
21
|
+
line = raw_line.strip()
|
|
22
|
+
if not line:
|
|
23
|
+
continue
|
|
24
|
+
try:
|
|
25
|
+
payload = json.loads(line)
|
|
26
|
+
except json.JSONDecodeError as exc:
|
|
27
|
+
raise ValueError(f"Invalid JSON on line {line_number}: {exc.msg}") from exc
|
|
28
|
+
if not isinstance(payload, dict):
|
|
29
|
+
raise ValueError(f"Line {line_number} must decode to a JSON object")
|
|
30
|
+
events.append(payload)
|
|
31
|
+
|
|
32
|
+
if not events:
|
|
33
|
+
raise ValueError("JSONL file is empty")
|
|
34
|
+
|
|
35
|
+
return tuple(events)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _salvage_gzip_text(raw: bytes) -> str | None:
|
|
39
|
+
"""Best-effort recovery for truncated or partially corrupted gzip data."""
|
|
40
|
+
chunks: list[bytes] = []
|
|
41
|
+
remaining = raw
|
|
42
|
+
|
|
43
|
+
while remaining:
|
|
44
|
+
decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
|
|
45
|
+
try:
|
|
46
|
+
chunk = decompressor.decompress(remaining) + decompressor.flush()
|
|
47
|
+
except zlib.error:
|
|
48
|
+
break
|
|
49
|
+
|
|
50
|
+
if chunk:
|
|
51
|
+
chunks.append(chunk)
|
|
52
|
+
|
|
53
|
+
if not decompressor.unused_data:
|
|
54
|
+
break
|
|
55
|
+
if decompressor.unused_data == remaining:
|
|
56
|
+
break
|
|
57
|
+
|
|
58
|
+
remaining = decompressor.unused_data
|
|
59
|
+
|
|
60
|
+
if not chunks:
|
|
61
|
+
return None
|
|
62
|
+
|
|
63
|
+
return b"".join(chunks).decode("utf-8", errors="replace")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _decode_jsonl_bytes(raw: bytes, *, prefer_gzip: bool) -> tuple[Event, ...]:
    """Decode raw recording bytes into events, trying several strategies.

    With *prefer_gzip* set, the bytes are first decompressed as a complete
    gzip stream, then (if that fails) passed through the best-effort salvage
    routine. In every case a final attempt treats the bytes as plain UTF-8
    JSONL.

    Args:
        raw: File contents.
        prefer_gzip: Attempt the gzip strategies before plain text.

    Returns:
        The parsed events from the first strategy that succeeds.

    Raises:
        IOError: If every attempted strategy fails. The message names the
            failures and the exception is chained to the most relevant one.
    """
    gzip_error: Exception | None = None

    if prefer_gzip:
        try:
            text = gzip.decompress(raw).decode("utf-8")
            return _parse_jsonl(StringIO(text))
        except (
            BadGzipFile,
            EOFError,
            OSError,
            UnicodeDecodeError,
            ValueError,
            # A corrupt deflate body surfaces as zlib.error, which is not a
            # subclass of any type above; without it the salvage and
            # plain-text fallbacks below would never run.
            zlib.error,
        ) as exc:
            gzip_error = exc

        salvaged_text = _salvage_gzip_text(raw)
        if salvaged_text is not None:
            try:
                return _parse_jsonl(StringIO(salvaged_text))
            except ValueError as exc:
                gzip_error = exc

    try:
        text = raw.decode("utf-8")
        return _parse_jsonl(StringIO(text))
    except (UnicodeDecodeError, ValueError) as exc:
        text_error: Exception = exc

    # Reaching this point means the plain-text attempt failed as well.
    if gzip_error is not None:
        raise IOError(
            f"Unable to decode recording as gzip ({gzip_error}) "
            f"or plain text ({text_error})"
        ) from gzip_error

    raise IOError(f"Unable to decode recording as plain text: {text_error}") from text_error
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def load_jsonl(file: Path) -> tuple[Event, ...]:
    """Read a recorder file, plain or gzipped, and return its events.

    Gzip decoding is tried first when the content starts with the gzip
    magic bytes or the file name ends in ``.gz``.

    Raises:
        FileNotFoundError: If *file* does not exist.
        ValueError: If *file* is not a regular file or is zero bytes long.
        IOError: If the file cannot be read or decoded.
    """
    if not file.exists():
        raise FileNotFoundError(f"File not found: {file}")
    if not file.is_file():
        raise ValueError(f"Path is not a file: {file}")

    try:
        raw = file.read_bytes()
    except OSError as exc:
        raise IOError(f"Error reading file: {exc}") from exc

    if len(raw) == 0:
        raise ValueError("JSONL file is empty")

    gzip_hint = file.suffix == ".gz" or raw.startswith(_GZIP_MAGIC)
    return _decode_jsonl_bytes(raw, prefer_gzip=gzip_hint)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def is_edit_event(event: Event) -> bool:
    """Tell whether *event* is an edit.

    An event counts as an edit when its ``type`` is ``"edit"`` or when it
    has no ``type`` at all (legacy, untyped entries).
    """
    kind = event.get("type")
    return kind is None or kind == "edit"
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def filter_edit_events(events: tuple[Event, ...]) -> tuple[Event, ...]:
    """Select the edit events from a recording.

    When typed (``type == "edit"``) and untyped edit events are both present
    and a typed one appears before the first untyped one, only the typed
    events are returned. In every other case all edit events are returned.
    """
    candidates = tuple(event for event in events if is_edit_event(event))
    if not candidates:
        return ()

    # Positions of the first typed and first untyped edit, if any.
    typed_at: int | None = None
    legacy_at: int | None = None
    for position, event in enumerate(candidates):
        kind = event.get("type")
        if typed_at is None and kind == "edit":
            typed_at = position
        if legacy_at is None and kind is None:
            legacy_at = position

    mixed_typed_first = (
        typed_at is not None and legacy_at is not None and typed_at < legacy_at
    )
    if not mixed_typed_first:
        return candidates

    return tuple(event for event in candidates if event.get("type") == "edit")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_focus_events(events: tuple[Event, ...]) -> tuple[Event, ...]:
    """Return the events whose ``type`` is ``"focusStatus"``, in input order."""
    selected = [event for event in events if event.get("type") == "focusStatus"]
    return tuple(selected)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def get_editors(events: tuple[Event, ...]) -> set[str]:
    """Collect the distinct ``editor`` values seen in *events*.

    Missing or falsy ``editor`` values are ignored. When no event names an
    editor the result is ``{"jetbrains"}``.
    """
    seen: set[str] = set()
    for event in events:
        editor = event.get("editor")
        if editor:
            seen.add(str(editor))

    if seen:
        return seen
    return {"jetbrains"}
|
cr_proc/api/output.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""JSON output helpers for processed recordings."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .document import normalize_path_string
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _sanitize_time_info(time_info: dict[str, Any] | None) -> dict[str, Any] | None:
|
|
14
|
+
if time_info is None:
|
|
15
|
+
return None
|
|
16
|
+
return {key: value for key, value in time_info.items() if key != "active_intervals"}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def write_batch_json_output(
    output_path: Path,
    results: list[dict[str, Any]],
    combined_time_info: dict[str, Any] | None,
    all_verified: bool,
    batch_mode: bool = False,
    submitted_all_passed: bool | None = None,
    version: str | None = None,
) -> None:
    """Serialize processing results to *output_path* as indented JSON.

    One entry is written per result under ``files``, plus summary fields.
    ``active_intervals`` is dropped from every time-info dict. The optional
    keys ``editors``, ``warnings``, ``submitted_comparison``,
    ``combined_time_info`` and ``submitted_all_passed`` appear only when
    they have a value. Missing parent directories are created, and a
    confirmation line is printed to stderr.
    """
    files_data: list[dict[str, Any]] = []
    all_editors: set[str] = set()
    verified_total = 0

    for result in results:
        editors = list(result.get("editors", []))
        all_editors.update(editors)
        if result.get("verified"):
            verified_total += 1

        entry: dict[str, Any] = {
            "jsonl_file": normalize_path_string(str(result["jsonl_file"])),
            "document": result.get("target_document"),
            "verified": result.get("verified", False),
            "time_info": _sanitize_time_info(result.get("time_info")),
            "suspicious_events": result.get("suspicious_events", []),
            "template_diff": result.get("template_diff", ""),
            "reconstructed_code": result.get("reconstructed", ""),
        }
        if editors:
            entry["editors"] = editors
        if result.get("warnings"):
            entry["warnings"] = result["warnings"]
        if result.get("submitted_comparison") is not None:
            entry["submitted_comparison"] = result["submitted_comparison"]

        files_data.append(entry)

    payload: dict[str, Any] = {
        "version": version,
        "batch_mode": batch_mode,
        "total_files": len(results),
        "verified_count": verified_total,
        "all_verified": all_verified,
        "files": files_data,
    }
    if combined_time_info is not None:
        payload["combined_time_info"] = _sanitize_time_info(combined_time_info)
    if all_editors:
        payload["editors"] = sorted(all_editors)
    if submitted_all_passed is not None:
        payload["submitted_all_passed"] = submitted_all_passed

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2))

    message = (
        f"Batch results written to {output_path}"
        if batch_mode
        else f"Results written to {output_path}"
    )
    print(message, file=sys.stderr)
|