cr-proc 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -127,6 +127,8 @@ def reconstruct_file_from_events(
127
127
  ----------
128
128
  events : tuple of dict
129
129
  Each dict should contain:
130
+ - 'type': (optional) event type - only 'edit' events or events without
131
+ type field are processed (backwards compatible)
130
132
  - 'timestamp': ISO 8601 string, e.g., '2026-01-13T22:40:44.137341Z'
131
133
  - 'document': absolute path string of the edited file
132
134
  - 'offset': integer offset (JetBrains Document uses UTF-16 code units)
@@ -163,6 +165,10 @@ def reconstruct_file_from_events(
163
165
  - If the target document cannot be determined.
164
166
  - If an edit cannot be applied (oldFragment not found near offset).
165
167
  """
168
+ # Filter to only edit events (backwards compatible with old format)
169
+ from .load import is_edit_event
170
+ events = tuple(e for e in events if is_edit_event(e))
171
+
166
172
  # Read template content
167
173
  if normalize_newlines:
168
174
  template = _normalize_newlines(template)
@@ -0,0 +1,337 @@
1
+ """Document resolution and filtering utilities."""
2
+ import difflib
3
+ import sys
4
+ from pathlib import Path, PureWindowsPath, PurePosixPath
5
+ from typing import Any
6
+
7
+
8
+ def _normalize_document_path(doc_path: str) -> tuple[str, str]:
9
+ """
10
+ Normalize a document path to extract filename and stem.
11
+
12
+ Handles both Windows-style (backslash) and Unix-style (forward slash) paths
13
+ regardless of the current platform.
14
+
15
+ Parameters
16
+ ----------
17
+ doc_path : str
18
+ Document path string (may use Windows or Unix separators)
19
+
20
+ Returns
21
+ -------
22
+ tuple[str, str]
23
+ (filename, stem) extracted from the path
24
+ """
25
+ # Try to detect if this is a Windows path (contains backslashes)
26
+ if "\\" in doc_path:
27
+ # Windows-style path
28
+ path_obj = PureWindowsPath(doc_path)
29
+ else:
30
+ # Unix-style path (or just a filename)
31
+ path_obj = PurePosixPath(doc_path)
32
+
33
+ return path_obj.name, path_obj.stem
34
+
35
+
36
+ def find_matching_template(
37
+ template_dir: Path, document_path: str
38
+ ) -> Path | None:
39
+ """
40
+ Find a template file that matches or closely matches the document path.
41
+
42
+ Searches for template files in the given directory that either:
43
+ 1. Exactly match the document filename
44
+ 2. Have the same base name (stem) as the document
45
+ 3. Have a fuzzy match with the document name
46
+
47
+ Parameters
48
+ ----------
49
+ template_dir : Path
50
+ Directory containing template files
51
+ document_path : str
52
+ Path of the document to find a template for
53
+
54
+ Returns
55
+ -------
56
+ Path | None
57
+ Path to the matching template file, or None if no good match found
58
+ """
59
+ if not template_dir.is_dir():
60
+ return None
61
+
62
+ doc_name, doc_stem = _normalize_document_path(document_path)
63
+
64
+ # First, try exact filename match
65
+ exact_match = template_dir / doc_name
66
+ if exact_match.exists() and exact_match.is_file():
67
+ return exact_match
68
+
69
+ # Get all template files
70
+ template_files = list(template_dir.glob("*"))
71
+ template_files = [f for f in template_files if f.is_file()]
72
+
73
+ if not template_files:
74
+ return None
75
+
76
+ # Then try matching by stem (name without extension)
77
+ stem_matches = [f for f in template_files if f.stem == doc_stem]
78
+ if stem_matches:
79
+ return stem_matches[0]
80
+
81
+ # Try matching by just the stem component from document (in case doc has extra prefixes/suffixes)
82
+ # For example: "cs111-homework0-CPT.py" -> stem is "cs111-homework0-CPT", but we want to match "cs111-homework0.py"
83
+ # So try matching just the core part before any suffixes like -CPT, -CPY, etc
84
+ for sep in ['-', '_']:
85
+ if sep in doc_stem:
86
+ base_stem = doc_stem.split(sep)[0]
87
+ base_matches = [f for f in template_files if f.stem == base_stem or f.stem.startswith(base_stem + sep)]
88
+ if base_matches:
89
+ return base_matches[0]
90
+
91
+ # Finally, try to find the closest match using sequence matching with lower threshold
92
+ matches = difflib.get_close_matches(doc_name, [f.name for f in template_files], n=1, cutoff=0.4)
93
+ if matches:
94
+ return template_dir / matches[0]
95
+
96
+ # Last resort: try fuzzy matching on stem
97
+ matches = difflib.get_close_matches(doc_stem, [f.stem for f in template_files], n=1, cutoff=0.4)
98
+ if matches:
99
+ stem_match_files = [f for f in template_files if f.stem == matches[0]]
100
+ if stem_match_files:
101
+ return stem_match_files[0]
102
+
103
+ return None
104
+
105
+
106
def get_normalized_document_key(doc_path: str) -> tuple[str, str]:
    """
    Get a normalized key for a document based on filename and extension.

    Documents that share this key but live under different paths are
    treated as the same logical file. Both Windows and Unix separator
    styles are handled.

    Parameters
    ----------
    doc_path : str
        Document path (may use Windows or Unix separators).

    Returns
    -------
    tuple[str, str]
        (filename_with_extension, extension) for grouping similar documents.
    """
    # Backslashes signal a Windows-style path; otherwise parse as POSIX.
    if "\\" in doc_path:
        filename = PureWindowsPath(doc_path).name
    else:
        filename = PurePosixPath(doc_path).name
    # The extension is everything from the last dot onward (empty when
    # the name contains no dot at all).
    _, dot, tail = filename.rpartition(".")
    extension = dot + tail if dot else ""
    return (filename, extension)
130
+
131
+
132
def group_documents_by_name(docs: list[str]) -> dict[tuple[str, str], list[str]]:
    """
    Group documents by their normalized key (name + extension).

    Documents with the same filename and extension but different paths
    are considered the same file (likely renamed).

    Parameters
    ----------
    docs : list[str]
        List of document paths from recording.

    Returns
    -------
    dict[tuple[str, str], list[str]]
        Dictionary mapping (filename, extension) to list of paths with that name.
    """
    groups: dict[tuple[str, str], list[str]] = {}
    for doc_path in docs:
        # setdefault creates the bucket on first sight of a key.
        groups.setdefault(get_normalized_document_key(doc_path), []).append(doc_path)
    return groups
156
+
157
+
158
def get_recorded_documents(events: tuple[dict[str, Any], ...]) -> list[str]:
    """
    Extract unique document paths from recording events.

    Events without a "document" field, or with an explicit None value,
    are ignored.

    Parameters
    ----------
    events : tuple[dict[str, Any], ...]
        Recording events loaded from JSONL.

    Returns
    -------
    list[str]
        Sorted list of unique document paths.
    """
    # A single .get() check covers both a missing key and an explicit
    # None value, so the previous membership test and second None-filter
    # were redundant.
    return sorted({doc for e in events if (doc := e.get("document")) is not None})
178
+
179
+
180
+ def filter_events_by_document(
181
+ events: tuple[dict[str, Any], ...], document: str | None
182
+ ) -> tuple[dict[str, Any], ...]:
183
+ """
184
+ Filter events to only those for a specific document.
185
+
186
+ Parameters
187
+ ----------
188
+ events : tuple[dict[str, Any], ...]
189
+ All recording events
190
+ document : str | None
191
+ Document path to filter by, or None to return all events
192
+
193
+ Returns
194
+ -------
195
+ tuple[dict[str, Any], ...]
196
+ Filtered events
197
+ """
198
+ if document:
199
+ return tuple(e for e in events if e.get("document") == document)
200
+ return events
201
+
202
+
203
def resolve_document(
    docs: list[str], template_path: Path | None, override: str | None
) -> str | None:
    """
    Determine which document from the recording to process.

    Documents sharing a filename/extension are collapsed into one group
    (treated as renames of the same file) before any selection happens.

    Parameters
    ----------
    docs : list[str]
        List of document paths found in the recording.
    template_path : Path | None
        Path to template file or directory (a file's extension is used
        to disambiguate candidates).
    override : str | None
        Explicit document name or path override.

    Returns
    -------
    str | None
        The resolved document path, or None if no documents exist.

    Raises
    ------
    ValueError
        If document resolution is ambiguous or the override doesn't match.
    """
    if not docs:
        return None

    # Collapse same-named documents (renames) to one representative each;
    # ambiguity checks operate on these representatives.
    unique_docs = [group[0] for group in group_documents_by_name(docs).values()]

    if override:
        # The override may be a full-path suffix or a bare filename.
        matches = [
            candidate
            for candidate in unique_docs
            if candidate.endswith(override)
            or _normalize_document_path(candidate)[0] == override
        ]
        if not matches:
            raise ValueError(
                f"No document in recording matches '{override}'. Available: {unique_docs}"
            )
        if len(matches) > 1:
            raise ValueError(
                f"Ambiguous document override '{override}'. Matches: {matches}"
            )
        return matches[0]

    # A template *file* (not a directory) lets its extension disambiguate.
    if template_path and template_path.is_file():
        template_ext = template_path.suffix
        ext_matches = [
            candidate
            for candidate in unique_docs
            if _normalize_document_path(candidate)[0].endswith(template_ext)
        ]
        if len(ext_matches) == 1:
            return ext_matches[0]
        if len(ext_matches) > 1:
            raise ValueError(
                f"Multiple documents share extension '{template_ext}': {ext_matches}. "
                "Use --document to choose one."
            )

    if len(unique_docs) == 1:
        return unique_docs[0]

    raise ValueError(
        "Could not determine document to process. Use --document to select one. "
        f"Available documents: {unique_docs}"
    )
276
+
277
+
278
+ def resolve_template_file(
279
+ template_path: Path | None,
280
+ template_dir: Path | None,
281
+ document_path: str | None
282
+ ) -> tuple[str, bool]:
283
+ """
284
+ Resolve which template file to use and whether a warning was issued.
285
+
286
+ Supports both a direct template file path or a directory to search for templates.
287
+ If using a directory and no exact match is found, a warning is issued but
288
+ reconstruction will still proceed.
289
+
290
+ Parameters
291
+ ----------
292
+ template_path : Path | None
293
+ Direct path to a template file (if provided)
294
+ template_dir : Path | None
295
+ Directory to search for template files (if provided)
296
+ document_path : str | None
297
+ Path of the document being processed (for matching)
298
+
299
+ Returns
300
+ -------
301
+ tuple[str, bool]
302
+ (template_content, had_warning) where had_warning indicates if template was not found
303
+ but reconstruction is proceeding anyway
304
+
305
+ Raises
306
+ ------
307
+ FileNotFoundError
308
+ If a direct template path is specified but not found
309
+ ValueError
310
+ If neither template_path nor template_dir is provided
311
+ """
312
+ if template_path is not None:
313
+ # Direct template file path provided
314
+ if not template_path.exists():
315
+ raise FileNotFoundError(f"Template file not found: {template_path}")
316
+ return template_path.read_text(), False
317
+
318
+ if template_dir is not None:
319
+ # Search for template in directory
320
+ if not template_dir.is_dir():
321
+ raise ValueError(f"Template directory does not exist: {template_dir}")
322
+
323
+ # Find a matching template
324
+ if document_path:
325
+ matching_template = find_matching_template(template_dir, document_path)
326
+ if matching_template:
327
+ return matching_template.read_text(), False
328
+
329
+ # No matching template found, issue warning
330
+ print(
331
+ f"Warning: No matching template found in {template_dir}. "
332
+ "Reconstruction will proceed, but checks may fail.",
333
+ file=sys.stderr
334
+ )
335
+ return "", True # Empty template, but continue
336
+
337
+ raise ValueError("Either template_path or template_dir must be provided")
@@ -115,3 +115,61 @@ def load_jsonl(file: Path) -> tuple[dict[str, Any], ...]:
115
115
  raise ValueError(f"JSONL file is empty: {file}")
116
116
 
117
117
  return data
118
+
119
+
120
def is_edit_event(event: dict[str, Any]) -> bool:
    """
    Check if an event is an edit event (backwards compatible).

    Events qualify when they carry ``type == "edit"`` or have no "type"
    field at all — the latter covers the legacy format, which contained
    only edit events.

    Parameters
    ----------
    event : dict[str, Any]
        Event dictionary from JSONL.

    Returns
    -------
    bool
        True if the event should be processed as an edit event.
    """
    # .get() yields None for a missing key, so a single membership test
    # covers both the legacy format and the explicit "edit" type.
    return event.get("type") in (None, "edit")
142
+
143
+
144
def filter_edit_events(events: tuple[dict[str, Any], ...]) -> tuple[dict[str, Any], ...]:
    """
    Filter events to only include edit events (backwards compatible).

    Keeps events whose "type" is "edit" as well as events with no "type"
    field (legacy recordings, which contained only edits).

    Parameters
    ----------
    events : tuple[dict[str, Any], ...]
        All events from JSONL.

    Returns
    -------
    tuple[dict[str, Any], ...]
        Only edit events.
    """
    return tuple(
        event
        for event in events
        if event.get("type") is None or event.get("type") == "edit"
    )
159
+
160
+
161
def get_focus_events(events: tuple[dict[str, Any], ...]) -> tuple[dict[str, Any], ...]:
    """
    Extract focus status events from recording.

    Parameters
    ----------
    events : tuple[dict[str, Any], ...]
        All events from JSONL.

    Returns
    -------
    tuple[dict[str, Any], ...]
        Only events whose "type" field is "focusStatus".
    """
    focus_events = [event for event in events if event.get("type") == "focusStatus"]
    return tuple(focus_events)
@@ -0,0 +1,70 @@
1
+ """Output formatting utilities for verification results."""
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
+ def write_batch_json_output(
9
+ output_path: Path,
10
+ results: list[dict[str, Any]],
11
+ combined_time_info: dict[str, Any] | None,
12
+ all_verified: bool,
13
+ batch_mode: bool = False,
14
+ ) -> None:
15
+ """
16
+ Write verification results to JSON file (consistent format for single and batch modes).
17
+
18
+ Parameters
19
+ ----------
20
+ output_path : Path
21
+ Path to output JSON file
22
+ results : list[dict[str, Any]]
23
+ List of results from each processed file
24
+ combined_time_info : dict[str, Any] | None
25
+ Combined time information across all files
26
+ all_verified : bool
27
+ Whether all files passed verification
28
+ batch_mode : bool
29
+ Whether this is batch mode (multiple files)
30
+
31
+ Raises
32
+ ------
33
+ Exception
34
+ If file writing fails
35
+ """
36
+ # Convert results to JSON-serializable format
37
+ files_data = []
38
+ for r in results:
39
+ files_data.append({
40
+ "jsonl_file": str(r["jsonl_file"]),
41
+ "document": r["target_document"],
42
+ "verified": r["verified"],
43
+ "time_info": r["time_info"],
44
+ "suspicious_events": r["suspicious_events"],
45
+ "template_diff": r.get("template_diff", ""),
46
+ "reconstructed_code": r["reconstructed"],
47
+ })
48
+
49
+ # Use consistent format for both single and batch modes
50
+ output_data = {
51
+ "batch_mode": batch_mode,
52
+ "total_files": len(results),
53
+ "verified_count": sum(1 for r in results if r["verified"]),
54
+ "all_verified": all_verified,
55
+ }
56
+
57
+ # Only include combined_time_info if present
58
+ if combined_time_info is not None:
59
+ output_data["combined_time_info"] = combined_time_info
60
+
61
+ output_data["files"] = files_data
62
+
63
+ output_path.parent.mkdir(parents=True, exist_ok=True)
64
+ with open(output_path, "w") as f:
65
+ json.dump(output_data, f, indent=2)
66
+
67
+ if batch_mode:
68
+ print(f"Batch results written to {output_path}", file=sys.stderr)
69
+ else:
70
+ print(f"Results written to {output_path}", file=sys.stderr)