argot-engine 0.2.7__tar.gz → 0.2.8__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- {argot_engine-0.2.7 → argot_engine-0.2.8}/PKG-INFO +1 -1
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/check.py +7 -2
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/explain.py +59 -50
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot_engine.egg-info/PKG-INFO +1 -1
- {argot_engine-0.2.7 → argot_engine-0.2.8}/pyproject.toml +1 -1
- {argot_engine-0.2.7 → argot_engine-0.2.8}/README.md +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/__init__.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/__main__.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/dataset.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/extract.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/fetch.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/git_walk.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/jepa/__init__.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/jepa/encoder.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/jepa/model.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/jepa/predictor.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/jepa/sigreg.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/__init__.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_check.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_explain.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_extract_smoke.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_fetch.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_git_walk.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_jepa.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_tokenize.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_train_smoke.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tests/test_validate.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/tokenize.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/train.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot/validate.py +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot_engine.egg-info/SOURCES.txt +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot_engine.egg-info/dependency_links.txt +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot_engine.egg-info/entry_points.txt +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot_engine.egg-info/requires.txt +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/argot_engine.egg-info/top_level.txt +0 -0
- {argot_engine-0.2.7 → argot_engine-0.2.8}/setup.cfg +0 -0
|
@@ -177,8 +177,13 @@ def main() -> None:
|
|
|
177
177
|
|
|
178
178
|
results.sort(key=lambda r: r[0], reverse=True)
|
|
179
179
|
|
|
180
|
+
col_w = 55
|
|
181
|
+
|
|
182
|
+
def _trunc(fp: str) -> str:
|
|
183
|
+
return fp if len(fp) <= col_w else "..." + fp[-(col_w - 3) :]
|
|
184
|
+
|
|
180
185
|
t = args.threshold
|
|
181
|
-
print(f"{'SURPRISE':>9} {'TAG':<10} {'FILE':<
|
|
186
|
+
print(f"{'SURPRISE':>9} {'TAG':<10} {'FILE':<{col_w}} {'LINE':>5} REF")
|
|
182
187
|
for score, fp, line, ref in results:
|
|
183
188
|
if score <= t:
|
|
184
189
|
tag = "ok"
|
|
@@ -188,7 +193,7 @@ def main() -> None:
|
|
|
188
193
|
tag = "suspicious"
|
|
189
194
|
else:
|
|
190
195
|
tag = "foreign"
|
|
191
|
-
print(f"{score:>9.4f} {tag:<10} {fp:<
|
|
196
|
+
print(f"{score:>9.4f} {tag:<10} {_trunc(fp):<{col_w}} {line:>5} {ref}")
|
|
192
197
|
|
|
193
198
|
if any(s > args.threshold for s, *_ in results):
|
|
194
199
|
sys.exit(1)
|
|
@@ -11,7 +11,7 @@ import numpy as np
|
|
|
11
11
|
import pygit2
|
|
12
12
|
import torch
|
|
13
13
|
|
|
14
|
-
from argot.check import _resolve_shas
|
|
14
|
+
from argot.check import _resolve_shas, _workdir_patches
|
|
15
15
|
from argot.git_walk import walk_commits
|
|
16
16
|
from argot.jepa.encoder import TokenEncoder
|
|
17
17
|
from argot.jepa.model import JEPAArgot
|
|
@@ -68,7 +68,7 @@ def _score_dataset(
|
|
|
68
68
|
def main() -> None:
|
|
69
69
|
parser = argparse.ArgumentParser(description="Explain style anomalies in a git ref")
|
|
70
70
|
parser.add_argument("repo_path")
|
|
71
|
-
parser.add_argument("ref")
|
|
71
|
+
parser.add_argument("ref", nargs="?", default="")
|
|
72
72
|
parser.add_argument("--model", default=".argot/model.pkl")
|
|
73
73
|
parser.add_argument("--dataset", default=".argot/dataset.jsonl")
|
|
74
74
|
parser.add_argument("--threshold-percentile", type=float, default=75.0)
|
|
@@ -105,62 +105,71 @@ def main() -> None:
|
|
|
105
105
|
style_examples = select_style_examples(scored_dataset, n=args.examples)
|
|
106
106
|
example_texts = [" ".join(t["text"] for t in r["hunk_tokens"]) for r in style_examples]
|
|
107
107
|
|
|
108
|
-
repo = pygit2.Repository(args.repo_path)
|
|
109
|
-
shas = _resolve_shas(repo, args.ref)
|
|
110
|
-
if not shas:
|
|
111
|
-
sys.exit(0)
|
|
112
|
-
|
|
113
108
|
context_lines = 50
|
|
114
|
-
with torch.no_grad():
|
|
115
|
-
for commit, file_path, post_blob, hunks in walk_commits(args.repo_path, shas):
|
|
116
|
-
lang = language_for_path(file_path)
|
|
117
|
-
if lang is None:
|
|
118
|
-
continue
|
|
119
|
-
try:
|
|
120
|
-
source_lines = post_blob.decode("utf-8", errors="replace").splitlines()
|
|
121
|
-
except Exception:
|
|
122
|
-
continue
|
|
123
|
-
|
|
124
|
-
for hunk in hunks:
|
|
125
|
-
hunk_start = hunk.new_start - 1
|
|
126
|
-
hunk_end = hunk_start + hunk.new_lines
|
|
127
|
-
if hunk_start < 0 or hunk_end > len(source_lines):
|
|
128
|
-
continue
|
|
129
109
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
110
|
+
def _emit_patches(patches: Any, commit_label: str) -> None:
|
|
111
|
+
with torch.no_grad():
|
|
112
|
+
for file_path, post_blob, hunks in patches:
|
|
113
|
+
lang = language_for_path(file_path)
|
|
114
|
+
if lang is None:
|
|
115
|
+
continue
|
|
116
|
+
try:
|
|
117
|
+
source_lines = post_blob.decode("utf-8", errors="replace").splitlines()
|
|
118
|
+
except Exception:
|
|
119
|
+
continue
|
|
133
120
|
|
|
134
|
-
|
|
135
|
-
|
|
121
|
+
for hunk in hunks:
|
|
122
|
+
hunk_start = hunk.new_start - 1
|
|
123
|
+
hunk_end = hunk_start + hunk.new_lines
|
|
124
|
+
if hunk_start < 0 or hunk_end > len(source_lines):
|
|
125
|
+
continue
|
|
136
126
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
hunk_vec = torch.tensor(
|
|
141
|
-
vectorizer.transform([hunk_text]).toarray(), dtype=torch.float32
|
|
142
|
-
)
|
|
127
|
+
before_start = max(0, hunk_start - context_lines)
|
|
128
|
+
ctx_tokens = tokenize_lines(source_lines, lang, before_start, hunk_start)
|
|
129
|
+
hunk_tokens = tokenize_lines(source_lines, lang, hunk_start, hunk_end)
|
|
143
130
|
|
|
144
|
-
|
|
145
|
-
|
|
131
|
+
ctx_text = " ".join(t.text for t in ctx_tokens)
|
|
132
|
+
hunk_text = " ".join(t.text for t in hunk_tokens)
|
|
146
133
|
|
|
147
|
-
|
|
148
|
-
|
|
134
|
+
ctx_vec = torch.tensor(
|
|
135
|
+
vectorizer.transform([ctx_text]).toarray(), dtype=torch.float32
|
|
136
|
+
)
|
|
137
|
+
hunk_vec = torch.tensor(
|
|
138
|
+
vectorizer.transform([hunk_text]).toarray(), dtype=torch.float32
|
|
139
|
+
)
|
|
149
140
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
141
|
+
score = model.surprise(ctx_vec, hunk_vec).item()
|
|
142
|
+
pct = percentile_rank(score, distribution)
|
|
143
|
+
|
|
144
|
+
if pct < args.threshold_percentile:
|
|
145
|
+
continue
|
|
146
|
+
|
|
147
|
+
print(
|
|
148
|
+
json.dumps(
|
|
149
|
+
{
|
|
150
|
+
"file_path": file_path,
|
|
151
|
+
"line": hunk.new_start,
|
|
152
|
+
"commit": commit_label,
|
|
153
|
+
"surprise": round(score, 4),
|
|
154
|
+
"percentile": round(pct, 1),
|
|
155
|
+
"hunk_text": hunk_text,
|
|
156
|
+
"context_text": ctx_text,
|
|
157
|
+
"style_examples": example_texts,
|
|
158
|
+
}
|
|
159
|
+
)
|
|
162
160
|
)
|
|
163
|
-
|
|
161
|
+
|
|
162
|
+
if args.ref == "":
|
|
163
|
+
_emit_patches(_workdir_patches(args.repo_path), "workdir")
|
|
164
|
+
else:
|
|
165
|
+
repo = pygit2.Repository(args.repo_path)
|
|
166
|
+
shas = _resolve_shas(repo, args.ref)
|
|
167
|
+
if not shas:
|
|
168
|
+
sys.exit(0)
|
|
169
|
+
_emit_patches(
|
|
170
|
+
((fp, blob, hunks) for _, fp, blob, hunks in walk_commits(args.repo_path, shas)),
|
|
171
|
+
args.ref,
|
|
172
|
+
)
|
|
164
173
|
|
|
165
174
|
|
|
166
175
|
if __name__ == "__main__":
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|