cr_proc-0.1.0.tar.gz
- cr_proc-0.1.0/PKG-INFO +142 -0
- cr_proc-0.1.0/README.md +129 -0
- cr_proc-0.1.0/pyproject.toml +29 -0
- cr_proc-0.1.0/src/code_recorder_processor/__init__.py +0 -0
- cr_proc-0.1.0/src/code_recorder_processor/api/build.py +212 -0
- cr_proc-0.1.0/src/code_recorder_processor/api/load.py +100 -0
- cr_proc-0.1.0/src/code_recorder_processor/api/verify.py +412 -0
- cr_proc-0.1.0/src/code_recorder_processor/cli.py +183 -0
cr_proc-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,142 @@
+Metadata-Version: 2.4
+Name: cr_proc
+Version: 0.1.0
+Summary:
+Author: Ethan Dye
+Author-email: mrtops03@gmail.com
+Requires-Python: >=3.14
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.14
+Requires-Dist: py-jsonl (>=1.3.22,<2.0.0)
+Description-Content-Type: text/markdown
+
+# `code_recorder_processor`
+
+[](https://github.com/BYU-CS-Course-Ops/code_recorder_processor/actions/workflows/ci.yml)
+
+This contains code to process and verify the `*.recorder.jsonl.gz` files that
+are produced by the
+[jetbrains-recorder](https://github.com/BYU-CS-Course-Ops/jetbrains-recorder).
+
+## Installation
+
+Install the package and its dependencies using Poetry:
+
+```bash
+poetry install
+```
+
+## Usage
+
+The processor can be run using the `cr_proc` command with two arguments:
+
+```bash
+poetry run cr_proc <path-to-jsonl-file> <path-to-template-file>
+```
+
+### Arguments
+
+- `<path-to-jsonl-file>`: Path to the compressed JSONL file
+  (`*.recorder.jsonl.gz`) produced by the jetbrains-recorder
+- `<path-to-template-file>`: Path to the initial template file that was recorded
+
+### Options
+
+- `--time-limit MINUTES`: (Optional) Maximum allowed time in minutes between the
+  first and last edit in the recording. If the elapsed time exceeds this limit,
+  the recording is flagged as suspicious. Useful for detecting unusually long
+  work sessions or potential external assistance.
+
+### Example
+
+```bash
+poetry run cr_proc homework0.recording.jsonl.gz homework0.py
+```
+
+With the time limit flag:
+
+```bash
+poetry run cr_proc homework0.recording.jsonl.gz homework0.py --time-limit 30
+```
+
+This will flag the recording if more than 30 minutes elapsed between the first and last edit.
+
+The processor will:
+
+1. Load the recorded events from the JSONL file
+2. Verify that the initial event matches the template (allowances for newline
+   differences are made)
+3. Reconstruct the final file state by applying all recorded events
+4. Output the reconstructed file contents to stdout
+
+### Output
+
+The reconstructed file is printed to stdout. Any warnings or errors are printed
+to stderr, including:
+
+- The document path being processed
+- Suspicious copy-paste and AI activity indicators
+
+### Suspicious Activity Detection
+
+The processor automatically detects and reports three types of suspicious activity
+patterns:
+
+#### 1. Time Limit Exceeded
+
+When the `--time-limit` flag is specified, the processor flags recordings where
+the elapsed time between the first and last edit exceeds the specified limit.
+This can indicate unusually long work sessions or potential external assistance.
+
+**Example warning:**
+
+```
+Time limit exceeded!
+  Limit: 30 minutes
+  Elapsed: 45.5 minutes
+  First edit: 2025-01-15T10:00:00+00:00
+  Last edit: 2025-01-15T10:45:30+00:00
+```
+
+#### 2. External Copy-Paste (Multi-line Pastes)
+
+The processor flags multi-line additions (more than one line) that do not appear
+to be copied from within the document itself. These indicate content pasted from
+external sources.
+
+**Example warning:**
+
+```
+Event #15 (multi-line external paste): 5 lines, 156 chars - newFragment: def helper_function():...
+```
+
+#### 3. Rapid One-line Pastes (AI Indicator)
+
+When 3 or more single-line pastes occur within a 1-second window, this is
+flagged as a potential AI activity indicator. Human typing does not typically
+produce this pattern; rapid sequential pastes suggest automated code generation.
+
+**Example warning:**
+
+```
+Events #42-#44 (rapid one-line pastes (AI indicator)): 3 lines, 89 chars
+```
+
+### Error Handling
+
+If verification fails (the recorded initial state doesn't match the template),
+the processor will:
+
+- Print an error message to stderr
+- Display a diff showing the differences
+- Exit with status code 1
+
+If file loading or processing errors occur, the processor will:
+
+- Print a descriptive error message to stderr
+- Exit with status code 1
+
+## Future Ideas
+
+- Check for odd typing behavior
+
cr_proc-0.1.0/README.md
ADDED
@@ -0,0 +1,129 @@
+# `code_recorder_processor`
+
+[](https://github.com/BYU-CS-Course-Ops/code_recorder_processor/actions/workflows/ci.yml)
+
+This contains code to process and verify the `*.recorder.jsonl.gz` files that
+are produced by the
+[jetbrains-recorder](https://github.com/BYU-CS-Course-Ops/jetbrains-recorder).
+
+## Installation
+
+Install the package and its dependencies using Poetry:
+
+```bash
+poetry install
+```
+
+## Usage
+
+The processor can be run using the `cr_proc` command with two arguments:
+
+```bash
+poetry run cr_proc <path-to-jsonl-file> <path-to-template-file>
+```
+
+### Arguments
+
+- `<path-to-jsonl-file>`: Path to the compressed JSONL file
+  (`*.recorder.jsonl.gz`) produced by the jetbrains-recorder
+- `<path-to-template-file>`: Path to the initial template file that was recorded
+
+### Options
+
+- `--time-limit MINUTES`: (Optional) Maximum allowed time in minutes between the
+  first and last edit in the recording. If the elapsed time exceeds this limit,
+  the recording is flagged as suspicious. Useful for detecting unusually long
+  work sessions or potential external assistance.
+
+### Example
+
+```bash
+poetry run cr_proc homework0.recording.jsonl.gz homework0.py
+```
+
+With the time limit flag:
+
+```bash
+poetry run cr_proc homework0.recording.jsonl.gz homework0.py --time-limit 30
+```
+
+This will flag the recording if more than 30 minutes elapsed between the first and last edit.
+
+The processor will:
+
+1. Load the recorded events from the JSONL file
+2. Verify that the initial event matches the template (allowances for newline
+   differences are made)
+3. Reconstruct the final file state by applying all recorded events
+4. Output the reconstructed file contents to stdout
+
+### Output
+
+The reconstructed file is printed to stdout. Any warnings or errors are printed
+to stderr, including:
+
+- The document path being processed
+- Suspicious copy-paste and AI activity indicators
+
+### Suspicious Activity Detection
+
+The processor automatically detects and reports three types of suspicious activity
+patterns:
+
+#### 1. Time Limit Exceeded
+
+When the `--time-limit` flag is specified, the processor flags recordings where
+the elapsed time between the first and last edit exceeds the specified limit.
+This can indicate unusually long work sessions or potential external assistance.
+
+**Example warning:**
+
+```
+Time limit exceeded!
+  Limit: 30 minutes
+  Elapsed: 45.5 minutes
+  First edit: 2025-01-15T10:00:00+00:00
+  Last edit: 2025-01-15T10:45:30+00:00
+```
+
+#### 2. External Copy-Paste (Multi-line Pastes)
+
+The processor flags multi-line additions (more than one line) that do not appear
+to be copied from within the document itself. These indicate content pasted from
+external sources.
+
+**Example warning:**
+
+```
+Event #15 (multi-line external paste): 5 lines, 156 chars - newFragment: def helper_function():...
+```
+
+#### 3. Rapid One-line Pastes (AI Indicator)
+
+When 3 or more single-line pastes occur within a 1-second window, this is
+flagged as a potential AI activity indicator. Human typing does not typically
+produce this pattern; rapid sequential pastes suggest automated code generation.
+
+**Example warning:**
+
+```
+Events #42-#44 (rapid one-line pastes (AI indicator)): 3 lines, 89 chars
+```
+
+### Error Handling
+
+If verification fails (the recorded initial state doesn't match the template),
+the processor will:
+
+- Print an error message to stderr
+- Display a diff showing the differences
+- Exit with status code 1
+
+If file loading or processing errors occur, the processor will:
+
+- Print a descriptive error message to stderr
+- Exit with status code 1
+
+## Future Ideas
+
+- Check for odd typing behavior
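
The README documents only the CLI; the same pipeline is also reachable from Python through the modules under `src/code_recorder_processor/api/` shown below. A minimal sketch, reusing the hypothetical file names from the README example:

```python
from pathlib import Path

from code_recorder_processor.api.load import load_jsonl
from code_recorder_processor.api.verify import verify, check_time_limit
from code_recorder_processor.api.build import reconstruct_file_from_events

events = load_jsonl(Path("homework0.recording.jsonl.gz"))  # tuple of event dicts
template = Path("homework0.py").read_text()

# Raises ValueError if the recorded initial snapshot differs from the
# template by more than whitespace; also returns flagged paste events.
verified, suspicious = verify(template, events)

time_info = check_time_limit(events, time_limit_minutes=30)
final_text = reconstruct_file_from_events(events, verified)
```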
cr_proc-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,29 @@
+[project]
+name = "cr_proc"
+version = "0.1.0"
+description = ""
+authors = [
+    {name = "Ethan Dye",email = "mrtops03@gmail.com"}
+]
+readme = "README.md"
+requires-python = ">=3.14"
+dependencies = [
+    "py-jsonl (>=1.3.22,<2.0.0)"
+]
+
+[tool.poetry]
+packages = [{include = "code_recorder_processor", from = "src"}]
+
+[tool.poetry.scripts]
+cr_proc = "code_recorder_processor.cli:main"
+test = "pytest:main"
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[dependency-groups]
+dev = [
+    "mdformat (>=1.0.0,<2.0.0)",
+    "pytest (>=9.0.2,<10.0.0)"
+]

cr_proc-0.1.0/src/code_recorder_processor/__init__.py
File without changes

cr_proc-0.1.0/src/code_recorder_processor/api/build.py
ADDED
@@ -0,0 +1,212 @@
+from datetime import datetime
+from typing import Any, Optional
+
+
+def _parse_ts(ts: str) -> datetime:
+    """Parse ISO 8601 timestamps that end with 'Z' (UTC)."""
+    # Convert trailing Z to +00:00 for fromisoformat
+    return datetime.fromisoformat(ts.replace("Z", "+00:00"))
+
+
+def _normalize_newlines(text: str) -> str:
+    """Normalize CRLF to LF for consistent offset handling."""
+    return text.replace("\r\n", "\n")
+
+
+def _apply_event_exact_text(doc: str, offset: int, old: str, new: str) -> str:
+    """Apply an event assuming Python string indices (Unicode code points)."""
+    return doc[:offset] + new + doc[offset + len(old):]
+
+
+def _apply_event_fuzzy_text(doc: str, offset: int, old: str, new: str, window: int) -> str:
+    """
+    Apply an event with a fuzzy fallback in Python string space:
+    - If old == '', it's a pure insertion at offset (clamped).
+    - If exact match fails, search for old around the offset within ±window chars and
+      replace the nearest occurrence.
+    """
+    if old == "":
+        off = max(0, min(offset, len(doc)))
+        return doc[:off] + new + doc[off:]
+
+    # Try exact match first
+    if doc[offset:offset + len(old)] == old:
+        return _apply_event_exact_text(doc, offset, old, new)
+
+    # Fuzzy search around the offset
+    start = max(0, offset - window)
+    end = min(len(doc), offset + window)
+    best_pos, best_dist = None, None
+
+    i = start
+    while True:
+        i = doc.find(old, i, end)
+        if i == -1:
+            break
+        dist = abs(i - offset)
+        if best_dist is None or dist < best_dist:
+            best_pos, best_dist = i, dist
+        i += 1
+
+    if best_pos is not None:
+        return _apply_event_exact_text(doc, best_pos, old, new)
+
+    raise ValueError(
+        f"Old fragment not found near offset {offset}. old={old!r}, new={new!r}"
+    )
+
+
+def _apply_event_exact_utf16(doc_bytes: bytes, offset_units: int, old: str, new: str) -> bytes:
+    """
+    Apply an event in UTF-16-LE byte space (JetBrains Document uses UTF-16 code units):
+    offset_units is measured in code units; each unit is 2 bytes in UTF-16-LE.
+    """
+    # Convert offset in code units to byte index
+    bidx = offset_units * 2
+    old_b = old.encode("utf-16-le")
+    new_b = new.encode("utf-16-le")
+    return doc_bytes[:bidx] + new_b + doc_bytes[bidx + len(old_b):]
+
+
+def _apply_event_fuzzy_utf16(doc_bytes: bytes, offset_units: int, old: str, new: str, window_units: int) -> bytes:
+    """
+    Fuzzy apply in UTF-16-LE byte space:
+    - If old == '', pure insertion at offset_units.
+    - Else try exact match, otherwise search for old within ±window_units and replace nearest occurrence.
+    """
+    bidx = offset_units * 2
+    old_b = old.encode("utf-16-le")
+    new_b = new.encode("utf-16-le")
+
+    if old == "":
+        # Insertion: clamp to bounds
+        bidx = max(0, min(bidx, len(doc_bytes)))
+        return doc_bytes[:bidx] + new_b + doc_bytes[bidx:]
+
+    # Exact match
+    if doc_bytes[bidx:bidx + len(old_b)] == old_b:
+        return _apply_event_exact_utf16(doc_bytes, offset_units, old, new)
+
+    # Fuzzy search around offset
+    start_b = max(0, bidx - window_units * 2)
+    end_b = min(len(doc_bytes), bidx + window_units * 2)
+
+    best_pos_b, best_dist_units = None, None
+    i = start_b
+    while True:
+        i = doc_bytes.find(old_b, i, end_b)
+        if i == -1:
+            break
+        dist_units = abs((i // 2) - offset_units)
+        if best_dist_units is None or dist_units < best_dist_units:
+            best_pos_b, best_dist_units = i, dist_units
+        i += 1
+
+    if best_pos_b is not None:
+        return doc_bytes[:best_pos_b] + new_b + doc_bytes[best_pos_b + len(old_b):]
+
+    raise ValueError(
+        f"Old fragment not found near offset {offset_units} (UTF-16 units). "
+        f"old={old!r}, new={new!r}"
+    )
+
+
+def reconstruct_file_from_events(
+    events: tuple[dict[str, Any], ...],
+    template: str,
+    document_path: Optional[str] = None,
+    *,
+    utf16_mode: bool = False,
+    window: int = 200,
+    normalize_newlines: bool = True,
+) -> str:
+    """
+    Reconstruct the final document by replaying PyCharm/IntelliJ edit events.
+
+    Parameters
+    ----------
+    events : tuple of dict
+        Each dict should contain:
+        - 'timestamp': ISO 8601 string, e.g., '2026-01-13T22:40:44.137341Z'
+        - 'document': absolute path string of the edited file
+        - 'offset': integer offset (JetBrains Document uses UTF-16 code units)
+        - 'oldFragment': string being replaced/removed at offset
+        - 'newFragment': string inserted at offset
+    template : str
+        Template file content to use as the starting point.
+    document_path : Optional[str]
+        If provided, only events matching this path will be applied. If None,
+        the function tries to infer:
+        - If any event matches str(template), use those.
+        - If there's only one distinct document in events, use that.
+        - Otherwise raises ValueError.
+    utf16_mode : bool (default: False)
+        If True, treat offsets as UTF-16 code units and apply changes in UTF-16-LE
+        byte space (safer for emoji/astral chars). If False, operate on Python
+        string indices (fine for ASCII/BMP-only source).
+    window : int (default: 200)
+        Fuzzy search window radius around the intended offset. Used when exact
+        match of oldFragment at offset fails; the nearest occurrence within the
+        window is replaced.
+    normalize_newlines : bool (default: True)
+        If True, normalize CRLF to LF in the starting template content. This
+        prevents offset drift if events were recorded using LF.
+
+    Returns
+    -------
+    str
+        The reconstructed final document content.
+
+    Raises
+    ------
+    ValueError
+        - If the target document cannot be determined.
+        - If an edit cannot be applied (oldFragment not found near offset).
+    """
+    # Normalize template content
+    if normalize_newlines:
+        template = _normalize_newlines(template)
+
+    # Decide which document's events to replay
+    docs = {e.get("document") for e in events}
+    target_doc = document_path
+    template_doc_str = str(template)
+
+    if target_doc is None:
+        if template_doc_str in docs:
+            target_doc = template_doc_str
+        elif len(docs) == 1:
+            target_doc = next(iter(docs))
+        else:
+            raise ValueError(
+                "Ambiguous target document: provide document_path explicitly. "
+                f"Found documents: {sorted(d for d in docs if d is not None)}"
+            )
+
+    # Filter events to the target document and sort by timestamp
+    evs = [e for e in events if e.get("document") == target_doc]
+    evs.sort(key=lambda e: _parse_ts(e["timestamp"]))
+
+    if not evs:
+        # No events for target_doc; return template unchanged
+        return template
+
+    if utf16_mode:
+        # Work in UTF-16-LE byte space
+        doc_bytes = template.encode("utf-16-le")
+        for e in evs:
+            offset_units = int(e["offset"])
+            old = e.get("oldFragment", "")
+            new = e.get("newFragment", "")
+            doc_bytes = _apply_event_fuzzy_utf16(doc_bytes, offset_units, old, new, window_units=window)
+        # Decode back to text
+        return doc_bytes.decode("utf-16-le")
+    else:
+        # Work in Python string space
+        doc = template
+        for e in evs:
+            offset = int(e["offset"])
+            old = e.get("oldFragment", "")
+            new = e.get("newFragment", "")
+            doc = _apply_event_fuzzy_text(doc, offset, old, new, window=window)
+        return doc
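
A toy replay of `reconstruct_file_from_events` with synthetic events (the path and fragments here are made up; offsets are plain Python string indices, so `utf16_mode` stays False):

```python
from code_recorder_processor.api.build import reconstruct_file_from_events

# Two synthetic events against an empty template: an insertion, then a
# replacement whose oldFragment is found exactly at the given offset.
events = (
    {"timestamp": "2025-01-15T10:00:00Z", "document": "/work/hello.py",
     "offset": 0, "oldFragment": "", "newFragment": "print('hi')\n"},
    {"timestamp": "2025-01-15T10:00:05Z", "document": "/work/hello.py",
     "offset": 7, "oldFragment": "hi", "newFragment": "hello"},
)

# Only one distinct document appears, so document_path is inferred.
result = reconstruct_file_from_events(events, template="")
assert result == "print('hello')\n"
```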
cr_proc-0.1.0/src/code_recorder_processor/api/load.py
ADDED
@@ -0,0 +1,100 @@
+import jsonl
+import zlib
+from gzip import BadGzipFile, open as gzip_open
+from io import StringIO
+from pathlib import Path
+from typing import Any
+
+
+def load_jsonl(file: Path) -> tuple[dict[str, Any], ...]:
+    """
+    Load JSONL data from a file (gzip compressed or plain text).
+
+    Parameters
+    ----------
+    file : Path
+        Path to the JSONL file (can be .jsonl or .jsonl.gz)
+
+    Returns
+    -------
+    tuple[dict[str, Any], ...]
+        Tuple of dictionaries parsed from the JSONL file
+
+    Raises
+    ------
+    FileNotFoundError
+        If the file does not exist
+    IOError
+        If there's an error reading the file
+    ValueError
+        If the file contains invalid JSONL data
+    """
+    if not file.exists():
+        raise FileNotFoundError(f"File not found: {file}")
+
+    if not file.is_file():
+        raise ValueError(f"Path is not a file: {file}")
+
+    def _load_jsonl(source: Any) -> tuple[dict[str, Any], ...]:
+        return tuple(jsonl.load(source))
+
+    def _read_magic(path: Path) -> bytes:
+        try:
+            with path.open("rb") as fh:
+                return fh.read(2)
+        except FileNotFoundError:
+            raise
+        except OSError:
+            return b""
+
+    try:
+        data = _load_jsonl(file)
+    except BadGzipFile:
+        magic = _read_magic(file)
+        looks_gzip = magic == b"\x1f\x8b"
+
+        # If it looks like gzip, try an explicit gzip open before giving up.
+        if looks_gzip:
+            try:
+                with gzip_open(file, "rt", encoding="utf-8") as gz:
+                    data = _load_jsonl(gz)
+            except (BadGzipFile, OSError):
+                data = None
+        else:
+            data = None
+
+        if data is None:
+            # If gzip stream is broken, attempt a lenient zlib decompress to salvage content.
+            try:
+                raw = file.read_bytes()
+                dobj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+                text_bytes = dobj.decompress(raw) + dobj.flush()
+                text = text_bytes.decode("utf-8", errors="replace")
+                data = _load_jsonl(StringIO(text))
+            except Exception:
+                data = None
+
+        if data is None:
+            # Fall back to plain text even if the header hinted gzip.
+            try:
+                with file.open("r", encoding="utf-8", errors="replace") as plain_file:
+                    data = _load_jsonl(plain_file)
+            except FileNotFoundError as e:
+                raise FileNotFoundError(f"Error reading file {file}: {e}")
+            except ValueError as e:
+                raise ValueError(f"Invalid JSONL format in {file} (plain read fallback): {e}")
+            except Exception as e:
+                raise IOError(
+                    f"Error loading JSONL file {file} without compression (magic={magic!r}): {type(e).__name__}: {e}"
+                )
+    except FileNotFoundError as e:
+        raise FileNotFoundError(f"Error reading file {file}: {e}")
+    except ValueError as e:
+        raise ValueError(f"Invalid JSONL format in {file}: {e}")
+    except Exception as e:
+        raise IOError(f"Error loading JSONL file {file}: {type(e).__name__}: {e}")
+
+    if not data:
+        raise ValueError(f"JSONL file is empty: {file}")
+
+    return data
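
A sketch of the loader's fallback behavior, assuming `py-jsonl` dispatches on the `.gz` suffix as the `BadGzipFile` handler above anticipates (both file names are hypothetical scratch files):

```python
import gzip
from pathlib import Path

from code_recorder_processor.api.load import load_jsonl

# A well-formed gzip JSONL file loads directly.
gz_path = Path("events.jsonl.gz")
with gzip.open(gz_path, "wt", encoding="utf-8") as fh:
    fh.write('{"timestamp": "2025-01-15T10:00:00Z", "offset": 0}\n')
print(load_jsonl(gz_path)[0]["offset"])  # 0

# A plain-text file wearing a .gz name should still load via the
# plain-read fallback above (no gzip magic bytes -> looks_gzip False).
plain = Path("plain.jsonl.gz")
plain.write_text('{"offset": 1}\n', encoding="utf-8")
print(load_jsonl(plain)[0]["offset"])  # 1
```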
cr_proc-0.1.0/src/code_recorder_processor/api/verify.py
ADDED
@@ -0,0 +1,412 @@
+from typing import Any
+from datetime import datetime
+import difflib
+
+
+def _normalize_newlines(text: str) -> str:
+    """Normalize CRLF to LF to avoid offset and diff noise."""
+    return text.replace("\r\n", "\n")
+
+
+def is_only_whitespace_differences(template: str, actual: str) -> bool:
+    """
+    Return True if `actual` can be derived from `template` by changing only
+    whitespace (spaces, tabs, newlines). All non-whitespace characters must
+    appear in the same order with no additions, deletions, or substitutions.
+    """
+    t = _normalize_newlines(template)
+    a = _normalize_newlines(actual)
+
+    lt, la = len(t), len(a)
+    i = j = 0
+
+    while True:
+        # Skip any whitespace on both sides
+        while i < lt and t[i].isspace():
+            i += 1
+        while j < la and a[j].isspace():
+            j += 1
+
+        if i >= lt or j >= la:
+            break
+
+        if t[i] != a[j]:
+            return False
+
+        i += 1
+        j += 1
+
+    # Ensure no remaining non-whitespace characters on either side
+    while i < lt:
+        if not t[i].isspace():
+            return False
+        i += 1
+
+    while j < la:
+        if not a[j].isspace():
+            return False
+        j += 1
+
+    return True
+
+
+def verify_template(template: str, jsonData: tuple[dict[str, Any], ...]) -> str:
+    """
+    Verify the initial event is a faithful snapshot of the template,
+    allowing only whitespace differences in the event's newFragment.
+
+    Also validates that the recorder wrote an initial snapshot
+    (oldFragment == newFragment on the first record).
+
+    Returns the verified fragment text.
+    Raises ValueError if verification fails.
+    """
+    if not jsonData:
+        raise ValueError("jsonData is empty")
+
+    first = jsonData[0]
+    new_frag = _normalize_newlines(first["newFragment"])
+    old_frag = _normalize_newlines(first["oldFragment"])
+    temp_norm = _normalize_newlines(template)
+
+    # Recorder must have written an initial full snapshot:
+    if new_frag != old_frag:
+        raise ValueError("oldFragment does not match newFragment (no initial snapshot)")
+
+    # Accept exact match OR match that differs only by whitespace
+    if new_frag == temp_norm:
+        return new_frag
+
+    if is_only_whitespace_differences(temp_norm, new_frag):
+        return new_frag
+
+    raise ValueError("newFragment does not match template (differs by more than whitespace)")
+
+
+def template_diff(template: str, jsonData: tuple[dict[str, Any], ...]) -> str:
+    """
+    Produce a unified diff between the given template and the first event's newFragment.
+    If the only differences are whitespace, return ''.
+    """
+    if not jsonData:
+        return ""
+
+    template_norm = _normalize_newlines(template)
+    actual_norm = _normalize_newlines(jsonData[0]["newFragment"])
+
+    # Suppress diff if only whitespace differences
+    if is_only_whitespace_differences(template_norm, actual_norm):
+        return ""
+
+    # Generate a proper unified diff (headers on separate lines)
+    t_lines = template_norm.splitlines(keepends=True)
+    a_lines = actual_norm.splitlines(keepends=True)
+
+    diff_iter = difflib.unified_diff(
+        t_lines,
+        a_lines,
+        fromfile="template",
+        tofile="actual",
+        n=3,
+        lineterm="\n",
+    )
+    return "".join(diff_iter)
+
+
+def _detect_multiline_external_pastes(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
+    """
+    Detect multi-line copy-paste events from external sources.
+
+    Flags newFragments that are significant in length (more than one line)
+    and do not appear to be copied from within the document itself.
+
+    Returns a list of suspicious multi-line paste events.
+    """
+    suspicious_events = []
+
+    # Build a history of all document content seen so far
+    document_history = set()
+
+    for idx, event in enumerate(jsonData):
+        old_frag = _normalize_newlines(event.get("oldFragment", ""))
+        new_frag = _normalize_newlines(event.get("newFragment", ""))
+
+        # Skip if no actual change
+        if new_frag == old_frag or new_frag.strip() == "":
+            continue
+
+        # Only check multi-line content (a split into >2 parts means at least two real lines)
+        new_lines = new_frag.split("\n")
+        if len(new_lines) <= 2:  # Single line or line + empty
+            continue
+
+        # Check if the new content appears to be from within the document
+        is_internal_copy = False
+
+        # Check if new_frag content was present in any previous fragments
+        for hist_content in document_history:
+            # Ignore tiny fragments; they appear everywhere and cause false positives
+            if len(hist_content) < 20:
+                continue
+
+            # Require substantial overlap in size to count as an internal copy
+            similar_length = (
+                len(hist_content) >= 0.8 * len(new_frag)
+                and len(hist_content) <= 1.25 * len(new_frag)
+            )
+
+            if new_frag == hist_content:
+                is_internal_copy = True
+                break
+
+            if new_frag in hist_content and similar_length:
+                is_internal_copy = True
+                break
+
+            if hist_content in new_frag and similar_length:
+                is_internal_copy = True
+                break
+
+        # Also check if it's in the old fragment (internal move/copy)
+        if not is_internal_copy and old_frag and (new_frag in old_frag or old_frag in new_frag):
+            is_internal_copy = True
+
+        if not is_internal_copy:
+            suspicious_events.append({
+                "event_index": idx,
+                "line_count": len(new_lines),
+                "char_count": len(new_frag),
+                "reason": "multi-line external paste",
+                "newFragment": new_frag[:100] + ("..." if len(new_frag) > 100 else ""),
+            })
+
+        # Update history after analysis so the current fragment cannot mask itself
+        if len(old_frag) > 1:
+            document_history.add(old_frag)
+        if len(new_frag) > 1:
+            document_history.add(new_frag)
+
+    return suspicious_events
+
+
+def _detect_rapid_paste_sequences(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
+    """
+    Detect rapid sequences of one-line pastes (AI assistance indicator).
+
+    Identifies clusters of 3+ one-line paste events occurring within 1 second,
+    which may indicate AI-assisted code generation.
+
+    Returns a list of suspicious rapid-paste events.
+    """
+    suspicious_events = []
+
+    # Track one-line paste events for rapid-paste detection
+    one_line_pastes = []
+
+    for idx, event in enumerate(jsonData):
+        new_frag = _normalize_newlines(event.get("newFragment", ""))
+        old_frag = _normalize_newlines(event.get("oldFragment", ""))
+        timestamp = event.get("timestamp")
+
+        # Skip if no timestamp or no change
+        if not timestamp or new_frag == old_frag or new_frag.strip() == "":
+            continue
+
+        # Check if newFragment is a single line (2 elements = 1 line + trailing \n)
+        new_lines = new_frag.split("\n")
+        if len(new_lines) == 2:
+            # Heuristic: if it's more than a few characters, it might be pasted
+            if len(new_frag.strip()) > 5:
+                one_line_pastes.append({
+                    "event_index": idx,
+                    "timestamp": timestamp,
+                    "content": new_frag
+                })
+
+    # Analyze one-line pastes for rapid clusters
+    if not one_line_pastes:
+        return suspicious_events
+
+    def parse_ts(ts_str: str) -> datetime:
+        return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+
+    i = 0
+    while i < len(one_line_pastes):
+        cluster = [one_line_pastes[i]]
+        cluster_start = parse_ts(one_line_pastes[i]["timestamp"])
+
+        # Look ahead for more pastes within 1 second
+        j = i + 1
+        while j < len(one_line_pastes):
+            current_time = parse_ts(one_line_pastes[j]["timestamp"])
+            if (current_time - cluster_start).total_seconds() <= 1.0:
+                cluster.append(one_line_pastes[j])
+                j += 1
+            else:
+                break
+
+        # If we found 3+ one-line pastes within 1 second, flag it
+        if len(cluster) >= 3:
+            event_indices = [p["event_index"] for p in cluster]
+            suspicious_events.append({
+                "event_index": event_indices[0],
+                "event_indices": event_indices,
+                "line_count": len(cluster),
+                "char_count": sum(len(p["content"]) for p in cluster),
+                "reason": "rapid one-line pastes (AI indicator)",
+                "newFragment": f"{len(cluster)} one-line pastes in 1 second",
+            })
+
+        i = j if j > i + 1 else i + 1
+
+    return suspicious_events
+
+
+def detect_external_copypaste(jsonData: tuple[dict[str, Any], ...]) -> list[dict[str, Any]]:
+    """
+    Detect copy-paste events from external sources and AI-assisted coding patterns.
+
+    Combines detection of:
+    1. Multi-line external paste events (content not from within document)
+    2. Rapid one-line paste sequences (potential AI assistance indicator)
+
+    Returns a list of all suspicious events with metadata.
+    """
+    suspicious_events = []
+
+    # Detect multi-line external pastes
+    suspicious_events.extend(_detect_multiline_external_pastes(jsonData))
+
+    # Detect rapid one-line paste sequences (AI indicator)
+    suspicious_events.extend(_detect_rapid_paste_sequences(jsonData))
+
+    return suspicious_events
+
+
+def check_time_limit(jsonData: tuple[dict[str, Any], ...], time_limit_minutes: int | None) -> dict[str, Any] | None:
+    """
+    Check if the time between first and last edit exceeds the specified time limit.
+
+    Tracks elapsed editing time across sessions by summing actual editing time within
+    each session (excluding gaps between sessions). For the time limit check, compares
+    the span from the first timestamp to the last timestamp overall.
+
+    Parameters
+    ----------
+    jsonData : tuple[dict[str, Any], ...]
+        The event data from the JSONL file
+    time_limit_minutes : int | None
+        Maximum allowed time in minutes between first and last overall edit.
+        If None, no time limit is enforced.
+
+    Returns
+    -------
+    dict[str, Any] | None
+        A dictionary with time limit and elapsed time info.
+        Contains 'exceeds_limit' flag and always includes 'minutes_elapsed'.
+    """
+    if not jsonData:
+        return None
+
+    def parse_ts(ts_str: str) -> datetime:
+        return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+
+    # Identify session boundaries: sessions start at indices where oldFragment == newFragment (non-empty)
+    session_starts = [0]  # First session always starts at index 0
+    for idx in range(1, len(jsonData)):
+        old_frag = jsonData[idx].get("oldFragment", "")
+        new_frag = jsonData[idx].get("newFragment", "")
+        if old_frag == new_frag and old_frag.strip() != "":
+            session_starts.append(idx)
+
+    # Add sentinel to mark end of last session
+    session_starts.append(len(jsonData))
+
+    # Find first and last timestamps overall
+    first_timestamp_overall = None
+    last_timestamp_overall = None
+
+    for event in jsonData:
+        if event.get("timestamp"):
+            if first_timestamp_overall is None:
+                first_timestamp_overall = event["timestamp"]
+            last_timestamp_overall = event["timestamp"]
+
+    if first_timestamp_overall is None or last_timestamp_overall is None:
+        # Not enough events with timestamps
+        return None
+
+    # Calculate elapsed time by summing editing time within each session
+    total_minutes_elapsed = 0.0
+
+    for i in range(len(session_starts) - 1):
+        session_start = session_starts[i]
+        session_end = session_starts[i + 1]
+
+        # Find first and last events with timestamps in this session
+        first_event = None
+        last_event = None
+
+        for event in jsonData[session_start:session_end]:
+            if event.get("timestamp"):
+                if first_event is None:
+                    first_event = event
+                last_event = event
+
+        # If this session has timestamped events, add its elapsed time
+        if first_event is not None and last_event is not None:
+            try:
+                first_time = parse_ts(first_event["timestamp"])
+                last_time = parse_ts(last_event["timestamp"])
+                session_diff = last_time - first_time
+                total_minutes_elapsed += session_diff.total_seconds() / 60
+            except (ValueError, KeyError):
+                # Timestamp parsing failed for this session, skip it
+                continue
+
+    # For time limit check, use the span from first to last timestamp overall
+    try:
+        first_time_overall = parse_ts(first_timestamp_overall)
+        last_time_overall = parse_ts(last_timestamp_overall)
+        overall_span_minutes = (last_time_overall - first_time_overall).total_seconds() / 60
+    except (ValueError, KeyError):
+        # Timestamp parsing failed
+        return None
+
+    result = {
+        "time_limit_minutes": time_limit_minutes,
+        "minutes_elapsed": round(total_minutes_elapsed, 2),
+        "first_timestamp": first_timestamp_overall,
+        "last_timestamp": last_timestamp_overall,
+    }
+
+    # For time limit check, compare the overall span (first to last timestamp) against the limit
+    if time_limit_minutes is not None:
+        result["exceeds_limit"] = overall_span_minutes > time_limit_minutes
+    else:
+        result["exceeds_limit"] = False
+
+    return result
+
+
+def verify(template: str, jsonData: tuple[dict[str, Any], ...]) -> tuple[str, list[dict[str, Any]]]:
+    """
+    Comprehensive verification of recorded code events.
+
+    Performs:
+    1. Template verification (initial snapshot matches template)
+    2. External copy-paste detection
+
+    Returns:
+        tuple: (verified_template_text, list_of_suspicious_copypaste_events)
+
+    Raises:
+        ValueError: If template verification fails
+    """
+    # Verify template
+    verified_template = verify_template(template, jsonData)
+
+    # Detect external copy-paste events
+    suspicious_events = detect_external_copypaste(jsonData)
+
+    return verified_template, suspicious_events
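
A quick exercise of two of the checks above on toy inputs; the expected values follow directly from the code (timestamps borrowed from the README's example warning):

```python
from code_recorder_processor.api.verify import (
    check_time_limit,
    is_only_whitespace_differences,
)

# Whitespace-only drift between template and snapshot is tolerated.
assert is_only_whitespace_differences("def f():\n    pass\n", "def f():\n\tpass\n")
assert not is_only_whitespace_differences("def f():", "def g():")

# Two timestamped events 45.5 minutes apart, checked against a 30-minute limit.
events = (
    {"timestamp": "2025-01-15T10:00:00Z", "oldFragment": "", "newFragment": "a"},
    {"timestamp": "2025-01-15T10:45:30Z", "oldFragment": "", "newFragment": "b"},
)
info = check_time_limit(events, time_limit_minutes=30)
print(info["minutes_elapsed"], info["exceeds_limit"])  # 45.5 True
```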
cr_proc-0.1.0/src/code_recorder_processor/cli.py
ADDED
@@ -0,0 +1,183 @@
+import argparse
+import sys
+import json
+from datetime import datetime
+from pathlib import Path
+from .api.load import load_jsonl
+from .api.verify import verify, template_diff, check_time_limit
+from .api.build import reconstruct_file_from_events
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Process and verify code recorder JSONL files"
+    )
+    parser.add_argument(
+        "jsonl_file",
+        type=Path,
+        help="Path to the compressed JSONL file (*.recording.jsonl.gz)",
+    )
+    parser.add_argument(
+        "template_file",
+        type=Path,
+        help="Path to the initial template file that was recorded",
+    )
+    parser.add_argument(
+        "--time-limit",
+        type=int,
+        default=None,
+        help="Maximum allowed time in minutes between first and last edit. If exceeded, recording is flagged.",
+    )
+    parser.add_argument(
+        "--document",
+        type=str,
+        default=None,
+        help=("Document path or filename to process from the recording. "
+              "Defaults to the document whose extension matches the template file."),
+    )
+    parser.add_argument(
+        "--output-json",
+        type=Path,
+        default=None,
+        help="Path to output JSON file with verification results (time info and suspicious events).",
+    )
+
+    args = parser.parse_args()
+
+    # Load JSONL file first to get document path
+    try:
+        jsonData = load_jsonl(args.jsonl_file)
+    except FileNotFoundError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+    except (ValueError, IOError) as e:
+        print(f"Error loading JSONL file: {e}", file=sys.stderr)
+        return 1
+
+    # Decide which recorded document to process
+    documents = {e.get("document") for e in jsonData if "document" in e and e.get("document") is not None}
+    recorded_docs = sorted([d for d in documents if d is not None])
+
+    def resolve_document(docs: list[str], template_path: Path, override: str | None) -> str | None:
+        if not docs:
+            return None
+
+        if override:
+            matches = [d for d in docs if d.endswith(override) or Path(d).name == override]
+            if not matches:
+                raise ValueError(
+                    f"No document in recording matches '{override}'. Available: {docs}"
+                )
+            if len(matches) > 1:
+                raise ValueError(
+                    f"Ambiguous document override '{override}'. Matches: {matches}"
+                )
+            return matches[0]
+
+        template_ext = template_path.suffix
+        ext_matches = [d for d in docs if Path(d).suffix == template_ext]
+        if len(ext_matches) == 1:
+            return ext_matches[0]
+        if len(ext_matches) > 1:
+            raise ValueError(
+                f"Multiple documents share extension '{template_ext}': {ext_matches}. "
+                "Use --document to choose one."
+            )
+
+        if len(docs) == 1:
+            return docs[0]
+
+        raise ValueError(
+            "Could not determine document to process. Use --document to select one. "
+            f"Available documents: {docs}"
+        )
+
+    try:
+        target_document = resolve_document(recorded_docs, args.template_file, args.document)
+    except ValueError as e:
+        print(f"Error determining document: {e}", file=sys.stderr)
+        return 1
+
+    if target_document:
+        doc_events = tuple(e for e in jsonData if e.get("document") == target_document)
+        if not doc_events:
+            print(f"Error: No events found for document '{target_document}'", file=sys.stderr)
+            return 1
+    else:
+        doc_events = jsonData
+
+    print(f"Processing: {target_document or args.template_file}", file=sys.stderr)
+
+    # Read template file
+    try:
+        templateData = args.template_file.read_text()
+    except FileNotFoundError:
+        print(f"Error: Template file not found: {args.template_file}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"Error reading template file: {e}", file=sys.stderr)
+        return 1
+
+    # Check time limit and display elapsed time
+    time_info = check_time_limit(doc_events, args.time_limit)
+    if time_info:
+        print(f"Elapsed editing time: {time_info['minutes_elapsed']} minutes", file=sys.stderr)
+        print(f"Time span (first to last edit): {(datetime.fromisoformat(time_info['last_timestamp'].replace('Z', '+00:00')) - datetime.fromisoformat(time_info['first_timestamp'].replace('Z', '+00:00'))).total_seconds() / 60:.2f} minutes", file=sys.stderr)
+        if time_info['exceeds_limit']:
+            print("\nTime limit exceeded!", file=sys.stderr)
+            print(f"  Limit: {time_info['time_limit_minutes']} minutes", file=sys.stderr)
+            print(f"  First edit: {time_info['first_timestamp']}", file=sys.stderr)
+            print(f"  Last edit: {time_info['last_timestamp']}", file=sys.stderr)
+
+    # Verify and process
+    try:
+        templateData, suspicious_events = verify(templateData, doc_events)
+        print(reconstruct_file_from_events(doc_events, templateData, document_path=target_document))
+
+        # Prepare results for JSON output
+        results = {
+            "document": target_document or str(args.template_file),
+            "time_info": time_info,
+            "suspicious_events": suspicious_events,
+        }
+
+        if suspicious_events:
+            print("\nSuspicious copy-paste events detected:", file=sys.stderr)
+            for ev in suspicious_events:
+                reason = ev.get('reason', 'unknown')
+                indices = ev.get('event_indices', [ev['event_index']])
+                if len(indices) > 1:
+                    print(f"  Events #{indices[0]}-#{indices[-1]} ({reason}): "
+                          f"{ev['line_count']} lines, {ev['char_count']} chars", file=sys.stderr)
+                else:
+                    print(f"  Event #{ev['event_index']} ({reason}): "
+                          f"{ev['line_count']} lines, {ev['char_count']} chars - "
+                          f"newFragment: {ev['newFragment']}", file=sys.stderr)
+        else:
+            print("Success! No suspicious events detected.", file=sys.stderr)
+
+        # Write JSON output if requested
+        if args.output_json:
+            try:
+                args.output_json.parent.mkdir(parents=True, exist_ok=True)
+                with open(args.output_json, 'w') as f:
+                    json.dump(results, f, indent=2)
+                print(f"Results written to {args.output_json}", file=sys.stderr)
+            except Exception as e:
+                print(f"Error writing JSON output: {e}", file=sys.stderr)
+                return 1
+    except ValueError as e:
+        print("File failed verification from template!", file=sys.stderr)
+        print(str(e), file=sys.stderr)
+        try:
+            print(template_diff(templateData, doc_events), file=sys.stderr)
+        except Exception:
+            pass
+        return 1
+    except Exception as e:
+        print(f"Error processing file: {type(e).__name__}: {e}", file=sys.stderr)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
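
For downstream tooling, a sketch of consuming the `--output-json` report; the field names mirror the `results` dict assembled in `main()` above, while the report path itself is hypothetical:

```python
import json
from pathlib import Path

# Report produced by, e.g.:
#   cr_proc homework0.recording.jsonl.gz homework0.py --output-json report.json
report = json.loads(Path("report.json").read_text())

print(report["document"])           # the recorded document path
info = report["time_info"]          # None when no events carried timestamps
if info:
    print(info["minutes_elapsed"], info["exceeds_limit"])
for ev in report["suspicious_events"]:
    # 'event_indices' is present only for rapid-paste clusters
    ids = ev.get("event_indices", [ev["event_index"]])
    print(ids, ev["reason"], ev["line_count"], ev["char_count"])
```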