cr-proc 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -127,6 +127,8 @@ def reconstruct_file_from_events(
127
127
  ----------
128
128
  events : tuple of dict
129
129
  Each dict should contain:
130
+ - 'type': (optional) event type - only 'edit' events or events without
131
+ type field are processed (backwards compatible)
130
132
  - 'timestamp': ISO 8601 string, e.g., '2026-01-13T22:40:44.137341Z'
131
133
  - 'document': absolute path string of the edited file
132
134
  - 'offset': integer offset (JetBrains Document uses UTF-16 code units)
@@ -163,6 +165,10 @@ def reconstruct_file_from_events(
163
165
  - If the target document cannot be determined.
164
166
  - If an edit cannot be applied (oldFragment not found near offset).
165
167
  """
168
+ # Filter to only edit events (backwards compatible with old format)
169
+ from .load import is_edit_event
170
+ events = tuple(e for e in events if is_edit_event(e))
171
+
166
172
  # Read template content
167
173
  if normalize_newlines:
168
174
  template = _normalize_newlines(template)
@@ -0,0 +1,300 @@
1
+ """Document resolution and filtering utilities."""
2
+ import difflib
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
+ def find_matching_template(
9
+ template_dir: Path, document_path: str
10
+ ) -> Path | None:
11
+ """
12
+ Find a template file that matches or closely matches the document path.
13
+
14
+ Searches for template files in the given directory that either:
15
+ 1. Exactly match the document filename
16
+ 2. Have the same base name (stem) as the document
17
+ 3. Have a fuzzy match with the document name
18
+
19
+ Parameters
20
+ ----------
21
+ template_dir : Path
22
+ Directory containing template files
23
+ document_path : str
24
+ Path of the document to find a template for
25
+
26
+ Returns
27
+ -------
28
+ Path | None
29
+ Path to the matching template file, or None if no good match found
30
+ """
31
+ if not template_dir.is_dir():
32
+ return None
33
+
34
+ doc_name = Path(document_path).name
35
+ doc_stem = Path(document_path).stem
36
+
37
+ # First, try exact filename match
38
+ exact_match = template_dir / doc_name
39
+ if exact_match.exists() and exact_match.is_file():
40
+ return exact_match
41
+
42
+ # Get all template files
43
+ template_files = list(template_dir.glob("*"))
44
+ template_files = [f for f in template_files if f.is_file()]
45
+
46
+ if not template_files:
47
+ return None
48
+
49
+ # Then try matching by stem (name without extension)
50
+ stem_matches = [f for f in template_files if f.stem == doc_stem]
51
+ if stem_matches:
52
+ return stem_matches[0]
53
+
54
+ # Try matching by just the stem component from document (in case doc has extra prefixes/suffixes)
55
+ # For example: "cs111-homework0-CPT.py" -> stem is "cs111-homework0-CPT", but we want to match "cs111-homework0.py"
56
+ # So try matching just the core part before any suffixes like -CPT, -CPY, etc
57
+ for sep in ['-', '_']:
58
+ if sep in doc_stem:
59
+ base_stem = doc_stem.split(sep)[0]
60
+ base_matches = [f for f in template_files if f.stem == base_stem or f.stem.startswith(base_stem + sep)]
61
+ if base_matches:
62
+ return base_matches[0]
63
+
64
+ # Finally, try to find the closest match using sequence matching with lower threshold
65
+ matches = difflib.get_close_matches(doc_name, [f.name for f in template_files], n=1, cutoff=0.4)
66
+ if matches:
67
+ return template_dir / matches[0]
68
+
69
+ # Last resort: try fuzzy matching on stem
70
+ matches = difflib.get_close_matches(doc_stem, [f.stem for f in template_files], n=1, cutoff=0.4)
71
+ if matches:
72
+ stem_match_files = [f for f in template_files if f.stem == matches[0]]
73
+ if stem_match_files:
74
+ return stem_match_files[0]
75
+
76
+ return None
77
+
78
+
79
def get_normalized_document_key(doc_path: str) -> tuple[str, str]:
    """
    Build a grouping key for a document from its filename and extension.

    Documents that share this key but live at different paths are treated
    as the same underlying file.

    Parameters
    ----------
    doc_path : str
        Document path

    Returns
    -------
    tuple[str, str]
        (filename_with_extension, extension) for grouping similar documents
    """
    p = Path(doc_path)
    return p.name, p.suffix
97
+
98
+
99
def group_documents_by_name(docs: list[str]) -> dict[tuple[str, str], list[str]]:
    """
    Group documents by their normalized (filename, extension) key.

    Documents with the same filename and extension but different paths
    are considered the same file (likely renamed), so they land in the
    same bucket.

    Parameters
    ----------
    docs : list[str]
        List of document paths from recording

    Returns
    -------
    dict[tuple[str, str], list[str]]
        Dictionary mapping (filename, extension) to list of paths with that name
    """
    grouped: dict[tuple[str, str], list[str]] = {}
    for path in docs:
        p = Path(path)
        # Key is the same normalization get_normalized_document_key applies.
        grouped.setdefault((p.name, p.suffix), []).append(path)
    return grouped
123
+
124
+
125
def get_recorded_documents(events: tuple[dict[str, Any], ...]) -> list[str]:
    """
    Extract unique document paths from recording events.

    Parameters
    ----------
    events : tuple[dict[str, Any], ...]
        Recording events loaded from JSONL

    Returns
    -------
    list[str]
        Sorted list of unique document paths
    """
    # One comprehension suffices: e.get("document") is not None already
    # implies the key is present, so the previous extra `"document" in e`
    # test and the second None filter over the set were redundant.
    return sorted({e["document"] for e in events if e.get("document") is not None})
145
+
146
+
147
+ def filter_events_by_document(
148
+ events: tuple[dict[str, Any], ...], document: str | None
149
+ ) -> tuple[dict[str, Any], ...]:
150
+ """
151
+ Filter events to only those for a specific document.
152
+
153
+ Parameters
154
+ ----------
155
+ events : tuple[dict[str, Any], ...]
156
+ All recording events
157
+ document : str | None
158
+ Document path to filter by, or None to return all events
159
+
160
+ Returns
161
+ -------
162
+ tuple[dict[str, Any], ...]
163
+ Filtered events
164
+ """
165
+ if document:
166
+ return tuple(e for e in events if e.get("document") == document)
167
+ return events
168
+
169
+
170
+ def resolve_document(
171
+ docs: list[str], template_path: Path | None, override: str | None
172
+ ) -> str | None:
173
+ """
174
+ Determine which document from the recording to process.
175
+
176
+ Handles deduplication of documents with same name/extension (treating them as renames).
177
+
178
+ Parameters
179
+ ----------
180
+ docs : list[str]
181
+ List of document paths found in the recording
182
+ template_path : Path | None
183
+ Path to template file or directory (used for extension matching if it's a file)
184
+ override : str | None
185
+ Explicit document name or path override
186
+
187
+ Returns
188
+ -------
189
+ str | None
190
+ The resolved document path, or None if no documents exist
191
+
192
+ Raises
193
+ ------
194
+ ValueError
195
+ If document resolution is ambiguous or the override doesn't match
196
+ """
197
+ if not docs:
198
+ return None
199
+
200
+ # Group documents by name/extension to handle renames
201
+ doc_groups = group_documents_by_name(docs)
202
+
203
+ # For ambiguity checking, use the groups (deduplicated by name)
204
+ unique_docs = [paths[0] for paths in doc_groups.values()]
205
+
206
+ if override:
207
+ matches = [
208
+ d for d in unique_docs if d.endswith(override) or Path(d).name == override
209
+ ]
210
+ if not matches:
211
+ raise ValueError(
212
+ f"No document in recording matches '{override}'. Available: {unique_docs}"
213
+ )
214
+ if len(matches) > 1:
215
+ raise ValueError(
216
+ f"Ambiguous document override '{override}'. Matches: {matches}"
217
+ )
218
+ return matches[0]
219
+
220
+ # If template_path is provided and is a file (not directory), use its extension for matching
221
+ if template_path and template_path.is_file():
222
+ template_ext = template_path.suffix
223
+ ext_matches = [d for d in unique_docs if Path(d).suffix == template_ext]
224
+ if len(ext_matches) == 1:
225
+ return ext_matches[0]
226
+ if len(ext_matches) > 1:
227
+ raise ValueError(
228
+ f"Multiple documents share extension '{template_ext}': {ext_matches}. "
229
+ "Use --document to choose one."
230
+ )
231
+
232
+ if len(unique_docs) == 1:
233
+ return unique_docs[0]
234
+
235
+ raise ValueError(
236
+ "Could not determine document to process. Use --document to select one. "
237
+ f"Available documents: {unique_docs}"
238
+ )
239
+
240
+
241
+ def resolve_template_file(
242
+ template_path: Path | None,
243
+ template_dir: Path | None,
244
+ document_path: str | None
245
+ ) -> tuple[str, bool]:
246
+ """
247
+ Resolve which template file to use and whether a warning was issued.
248
+
249
+ Supports both a direct template file path or a directory to search for templates.
250
+ If using a directory and no exact match is found, a warning is issued but
251
+ reconstruction will still proceed.
252
+
253
+ Parameters
254
+ ----------
255
+ template_path : Path | None
256
+ Direct path to a template file (if provided)
257
+ template_dir : Path | None
258
+ Directory to search for template files (if provided)
259
+ document_path : str | None
260
+ Path of the document being processed (for matching)
261
+
262
+ Returns
263
+ -------
264
+ tuple[str, bool]
265
+ (template_content, had_warning) where had_warning indicates if template was not found
266
+ but reconstruction is proceeding anyway
267
+
268
+ Raises
269
+ ------
270
+ FileNotFoundError
271
+ If a direct template path is specified but not found
272
+ ValueError
273
+ If neither template_path nor template_dir is provided
274
+ """
275
+ if template_path is not None:
276
+ # Direct template file path provided
277
+ if not template_path.exists():
278
+ raise FileNotFoundError(f"Template file not found: {template_path}")
279
+ return template_path.read_text(), False
280
+
281
+ if template_dir is not None:
282
+ # Search for template in directory
283
+ if not template_dir.is_dir():
284
+ raise ValueError(f"Template directory does not exist: {template_dir}")
285
+
286
+ # Find a matching template
287
+ if document_path:
288
+ matching_template = find_matching_template(template_dir, document_path)
289
+ if matching_template:
290
+ return matching_template.read_text(), False
291
+
292
+ # No matching template found, issue warning
293
+ print(
294
+ f"Warning: No matching template found in {template_dir}. "
295
+ "Reconstruction will proceed, but checks may fail.",
296
+ file=sys.stderr
297
+ )
298
+ return "", True # Empty template, but continue
299
+
300
+ raise ValueError("Either template_path or template_dir must be provided")
@@ -115,3 +115,61 @@ def load_jsonl(file: Path) -> tuple[dict[str, Any], ...]:
115
115
  raise ValueError(f"JSONL file is empty: {file}")
116
116
 
117
117
  return data
118
+
119
+
120
def is_edit_event(event: dict[str, Any]) -> bool:
    """
    Decide whether an event should be processed as an edit.

    An event counts as an edit when its "type" field is "edit", or when
    the field is absent entirely — the latter keeps old recordings that
    predate typed events working unchanged (backwards compatibility).

    Parameters
    ----------
    event : dict[str, Any]
        Event dictionary from JSONL

    Returns
    -------
    bool
        True if the event should be processed as an edit event
    """
    return event.get("type") in (None, "edit")
142
+
143
+
144
def filter_edit_events(events: tuple[dict[str, Any], ...]) -> tuple[dict[str, Any], ...]:
    """
    Keep only the events that qualify as edits.

    Delegates the per-event decision to is_edit_event, so legacy
    recordings without a "type" field pass through untouched.

    Parameters
    ----------
    events : tuple[dict[str, Any], ...]
        All events from JSONL

    Returns
    -------
    tuple[dict[str, Any], ...]
        Only edit events
    """
    kept = [event for event in events if is_edit_event(event)]
    return tuple(kept)
159
+
160
+
161
def get_focus_events(events: tuple[dict[str, Any], ...]) -> tuple[dict[str, Any], ...]:
    """
    Extract focus status events from a recording.

    Parameters
    ----------
    events : tuple[dict[str, Any], ...]
        All events from JSONL

    Returns
    -------
    tuple[dict[str, Any], ...]
        Only focusStatus events with timestamp and focused fields
    """
    focus_events = [event for event in events if event.get("type") == "focusStatus"]
    return tuple(focus_events)
@@ -0,0 +1,70 @@
1
+ """Output formatting utilities for verification results."""
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
+ def write_batch_json_output(
9
+ output_path: Path,
10
+ results: list[dict[str, Any]],
11
+ combined_time_info: dict[str, Any] | None,
12
+ all_verified: bool,
13
+ batch_mode: bool = False,
14
+ ) -> None:
15
+ """
16
+ Write verification results to JSON file (consistent format for single and batch modes).
17
+
18
+ Parameters
19
+ ----------
20
+ output_path : Path
21
+ Path to output JSON file
22
+ results : list[dict[str, Any]]
23
+ List of results from each processed file
24
+ combined_time_info : dict[str, Any] | None
25
+ Combined time information across all files
26
+ all_verified : bool
27
+ Whether all files passed verification
28
+ batch_mode : bool
29
+ Whether this is batch mode (multiple files)
30
+
31
+ Raises
32
+ ------
33
+ Exception
34
+ If file writing fails
35
+ """
36
+ # Convert results to JSON-serializable format
37
+ files_data = []
38
+ for r in results:
39
+ files_data.append({
40
+ "jsonl_file": str(r["jsonl_file"]),
41
+ "document": r["target_document"],
42
+ "verified": r["verified"],
43
+ "time_info": r["time_info"],
44
+ "suspicious_events": r["suspicious_events"],
45
+ "template_diff": r.get("template_diff", ""),
46
+ "reconstructed_code": r["reconstructed"],
47
+ })
48
+
49
+ # Use consistent format for both single and batch modes
50
+ output_data = {
51
+ "batch_mode": batch_mode,
52
+ "total_files": len(results),
53
+ "verified_count": sum(1 for r in results if r["verified"]),
54
+ "all_verified": all_verified,
55
+ }
56
+
57
+ # Only include combined_time_info if present
58
+ if combined_time_info is not None:
59
+ output_data["combined_time_info"] = combined_time_info
60
+
61
+ output_data["files"] = files_data
62
+
63
+ output_path.parent.mkdir(parents=True, exist_ok=True)
64
+ with open(output_path, "w") as f:
65
+ json.dump(output_data, f, indent=2)
66
+
67
+ if batch_mode:
68
+ print(f"Batch results written to {output_path}", file=sys.stderr)
69
+ else:
70
+ print(f"Results written to {output_path}", file=sys.stderr)