agent-write-gate 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_write_gate-0.1.0.dist-info/METADATA +276 -0
- agent_write_gate-0.1.0.dist-info/RECORD +18 -0
- agent_write_gate-0.1.0.dist-info/WHEEL +5 -0
- agent_write_gate-0.1.0.dist-info/entry_points.txt +2 -0
- agent_write_gate-0.1.0.dist-info/licenses/LICENSE +21 -0
- agent_write_gate-0.1.0.dist-info/top_level.txt +1 -0
- agentgate/__init__.py +5 -0
- agentgate/adapter.py +142 -0
- agentgate/apply_patch.py +74 -0
- agentgate/checks/__init__.py +1 -0
- agentgate/checks/cjk.py +81 -0
- agentgate/checks/unicode_safety.py +277 -0
- agentgate/cli.py +550 -0
- agentgate/config.py +171 -0
- agentgate/model.py +34 -0
- agentgate/policy.py +94 -0
- agentgate/registry.py +61 -0
- agentgate/report.py +247 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""unicode_safety.py -- Unicode safety check (AG-BIDI, AG-INVIS, AG-HOMO).
|
|
2
|
+
|
|
3
|
+
Rules:
|
|
4
|
+
AG-BIDI (high, always): bidi control characters U+202A-U+202E, U+2066-U+2069.
|
|
5
|
+
Essentially no legitimate use in source files; Trojan-Source vector.
|
|
6
|
+
AG-INVIS (high, code-context only): zero-width and invisible chars
|
|
7
|
+
U+200B, U+2060, U+FEFF (when not BOM at offset 0), U+00AD.
|
|
8
|
+
Only flagged when the file is treated as code AND the char sits
|
|
9
|
+
inside an identifier/string run.
|
|
10
|
+
U+200C/U+200D (ZWNJ/ZWJ) and U+200E/U+200F (LRM/RLM) are NOT
|
|
11
|
+
in AG-INVIS by default -- legitimate in Arabic/Persian/Indic text
|
|
12
|
+
and emoji ZWJ sequences.
|
|
13
|
+
strict_zerowidth=true: adds ZWNJ/ZWJ only inside ASCII-identifier runs.
|
|
14
|
+
AG-HOMO (medium, opt-in): Latin-looking Cyrillic/Greek codepoints inside
|
|
15
|
+
an otherwise-ASCII identifier.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import re
|
|
21
|
+
import unicodedata
|
|
22
|
+
from typing import List, TYPE_CHECKING
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from ..model import WriteEvent, Issue
|
|
26
|
+
from ..config import GateConfig
|
|
27
|
+
|
|
28
|
+
from ..model import Issue
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Bidi control characters
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
# U+202A LEFT-TO-RIGHT EMBEDDING
|
|
35
|
+
# U+202B RIGHT-TO-LEFT EMBEDDING
|
|
36
|
+
# U+202C POP DIRECTIONAL FORMATTING
|
|
37
|
+
# U+202D LEFT-TO-RIGHT OVERRIDE
|
|
38
|
+
# U+202E RIGHT-TO-LEFT OVERRIDE
|
|
39
|
+
# U+2066 LEFT-TO-RIGHT ISOLATE
|
|
40
|
+
# U+2067 RIGHT-TO-LEFT ISOLATE
|
|
41
|
+
# U+2068 FIRST STRONG ISOLATE
|
|
42
|
+
# U+2069 POP DIRECTIONAL ISOLATE
|
|
43
|
+
# Every codepoint reported as AG-BIDI: the embedding/override controls
# U+202A..U+202E and the isolate controls U+2066..U+2069 (bounds inclusive).
_BIDI_CONTROLS = frozenset(
    (*range(0x202A, 0x202E + 1), *range(0x2066, 0x2069 + 1))
)

# Display names used in AG-BIDI messages, keyed by codepoint.
_BIDI_NAMES = {
    # Embeddings, overrides and their terminator.
    0x202A: "LEFT-TO-RIGHT EMBEDDING",
    0x202B: "RIGHT-TO-LEFT EMBEDDING",
    0x202C: "POP DIRECTIONAL FORMATTING",
    0x202D: "LEFT-TO-RIGHT OVERRIDE",
    0x202E: "RIGHT-TO-LEFT OVERRIDE",
    # Isolates and their terminator.
    0x2066: "LEFT-TO-RIGHT ISOLATE",
    0x2067: "RIGHT-TO-LEFT ISOLATE",
    0x2068: "FIRST STRONG ISOLATE",
    0x2069: "POP DIRECTIONAL ISOLATE",
}
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
# Invisible characters flagged in code context
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
# U+200B ZERO WIDTH SPACE
|
|
62
|
+
# U+2060 WORD JOINER
|
|
63
|
+
# U+FEFF ZERO WIDTH NO-BREAK SPACE (BOM when at offset 0, else stray)
|
|
64
|
+
# U+00AD SOFT HYPHEN
|
|
65
|
+
# Codepoints that the default AG-INVIS rule looks for in code-profile files.
_INVIS_CODE = frozenset({0x00AD, 0x200B, 0x2060, 0xFEFF})

# Display names for AG-INVIS messages, keyed by codepoint.  ZWNJ/ZWJ are
# listed here too because the strict_zerowidth variant reports them.
_INVIS_NAMES = {
    0x00AD: "SOFT HYPHEN",
    0x200B: "ZERO WIDTH SPACE",
    0x200C: "ZERO WIDTH NON-JOINER",
    0x200D: "ZERO WIDTH JOINER",
    0x2060: "WORD JOINER",
    0xFEFF: "ZERO WIDTH NO-BREAK SPACE (stray BOM)",
}

# Extra codepoints considered only when strict_zerowidth is enabled
# (ZWNJ/ZWJ, checked in ASCII-identifier context).
_STRICT_ZEROWIDTH = frozenset({0x200C, 0x200D})
|
|
78
|
+
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
# Homoglyph: Cyrillic and Greek codepoints that look Latin
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
# Cyrillic letters that visually resemble ASCII Latin
|
|
84
|
+
# Cyrillic codepoints that render like an ASCII Latin letter.  Entries are
# sorted by codepoint; each comment gives the Unicode name and the ASCII
# letter it imitates (the glyphs themselves are deliberately not embedded).
_CYRILLIC_HOMO = frozenset({
    0x0410,  # CYRILLIC CAPITAL LETTER A   -> A
    0x0412,  # CYRILLIC CAPITAL LETTER VE  -> B
    0x0415,  # CYRILLIC CAPITAL LETTER IE  -> E
    0x041A,  # CYRILLIC CAPITAL LETTER KA  -> K
    0x041C,  # CYRILLIC CAPITAL LETTER EM  -> M
    0x041D,  # CYRILLIC CAPITAL LETTER EN  -> H
    0x041E,  # CYRILLIC CAPITAL LETTER O   -> O
    0x0420,  # CYRILLIC CAPITAL LETTER ER  -> P
    0x0421,  # CYRILLIC CAPITAL LETTER ES  -> C
    0x0422,  # CYRILLIC CAPITAL LETTER TE  -> T
    0x0425,  # CYRILLIC CAPITAL LETTER HA  -> X
    0x0430,  # CYRILLIC SMALL LETTER A     -> a
    0x0435,  # CYRILLIC SMALL LETTER IE    -> e
    0x043E,  # CYRILLIC SMALL LETTER O     -> o
    0x0440,  # CYRILLIC SMALL LETTER ER    -> p
    0x0441,  # CYRILLIC SMALL LETTER ES    -> c
    0x0443,  # CYRILLIC SMALL LETTER U     -> y
    0x0445,  # CYRILLIC SMALL LETTER HA    -> x
})

# Greek codepoints that render like an ASCII Latin letter (same layout).
_GREEK_HOMO = frozenset({
    0x0391,  # GREEK CAPITAL LETTER ALPHA    -> A
    0x0392,  # GREEK CAPITAL LETTER BETA     -> B
    0x0395,  # GREEK CAPITAL LETTER EPSILON  -> E
    0x0396,  # GREEK CAPITAL LETTER ZETA     -> Z
    0x0397,  # GREEK CAPITAL LETTER ETA      -> H
    0x0399,  # GREEK CAPITAL LETTER IOTA     -> I
    0x039A,  # GREEK CAPITAL LETTER KAPPA    -> K
    0x039C,  # GREEK CAPITAL LETTER MU       -> M
    0x039D,  # GREEK CAPITAL LETTER NU       -> N
    0x039F,  # GREEK CAPITAL LETTER OMICRON  -> O
    0x03A1,  # GREEK CAPITAL LETTER RHO      -> P
    0x03A4,  # GREEK CAPITAL LETTER TAU      -> T
    0x03A5,  # GREEK CAPITAL LETTER UPSILON  -> Y
    0x03A7,  # GREEK CAPITAL LETTER CHI      -> X
    0x03B1,  # GREEK SMALL LETTER ALPHA      -> a
    0x03B5,  # GREEK SMALL LETTER EPSILON    -> e
    0x03B9,  # GREEK SMALL LETTER IOTA       -> i
    0x03BD,  # GREEK SMALL LETTER NU         -> v
    0x03BF,  # GREEK SMALL LETTER OMICRON    -> o
    0x03C1,  # GREEK SMALL LETTER RHO        -> p
    0x03C5,  # GREEK SMALL LETTER UPSILON    -> u
})

# Union consulted by the AG-HOMO rule.
_ALL_HOMO = _CYRILLIC_HOMO.union(_GREEK_HOMO)
|
|
131
|
+
|
|
132
|
+
# ---------------------------------------------------------------------------
|
|
133
|
+
# File profile detection
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
|
|
136
|
+
def _is_code_profile(file_path: str, code_extensions: list) -> bool:
    """Return True if *file_path* should be treated as source code.

    The decision is a case-insensitive suffix match of the path against the
    configured extensions.  Both sides are lower-cased, so a configured
    ``".PY"`` matches ``main.py`` and ``".py"`` matches ``MAIN.PY``.

    Args:
        file_path: Path of the file being written.  An empty value or the
            ``"<stdin>"`` placeholder means the origin is unknown.
        code_extensions: Suffixes (e.g. ``".py"``) that mark a file as code.
            An empty or ``None`` value means no file is treated as code.

    Returns:
        True when the path ends with one of the extensions; False for an
        unknown origin or when nothing matches.
    """
    if not file_path or file_path == "<stdin>":
        return False  # unknown origin -> doc profile (permissive)
    # str.endswith accepts a tuple of candidates, so one call covers them all.
    suffixes = tuple(ext.lower() for ext in (code_extensions or ()))
    return file_path.lower().endswith(suffixes)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
# Context detection: is a character inside an identifier/string run?
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
# Characters that count as part of an identifier/string run: any word
# character (``\w`` is Unicode-aware for str patterns, so non-ASCII letters
# and digits match too, plus "_") and the quote characters " ' `.
_IDENT_CHARS = re.compile(r'[\w"\'`]')


def _in_identifier_or_string(line: str, col0: int) -> bool:
    """Return True if the char at *col0* touches an identifier/string char.

    The character itself is not inspected; only its immediate left and right
    neighbours are.  A missing neighbour (start or end of the line) simply
    does not match, so a character at column 0 is still reported when the
    character after it belongs to an identifier or string.

    Args:
        line: The line of text containing the character.
        col0: 0-based index of the character within *line*.

    Returns:
        True when either neighbour matches ``_IDENT_CHARS``; False when
        neither does or when *col0* is outside the line.
    """
    if not 0 <= col0 < len(line):
        return False
    # Slicing yields "" for a neighbour that does not exist, so both line
    # edges are handled without special cases.
    neighbours = line[max(col0 - 1, 0):col0] + line[col0 + 1:col0 + 2]
    return _IDENT_CHARS.search(neighbours) is not None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _is_ascii_ident_char(ch: str) -> bool:
    """Return True for an ASCII letter, ASCII digit or underscore."""
    return ch == "_" or (ch.isascii() and ch.isalnum())


def _in_ascii_identifier(line: str, col0: int) -> bool:
    """Return True if the char at *col0* is adjacent to an ASCII identifier.

    The character itself is not inspected.  The result is True when the
    character directly before or directly after it is an ASCII identifier
    character, i.e. when it sits at the start, middle or end of an ASCII
    identifier run.  Column 0 is supported, so a look-alike used as the
    first character of a line is detected as well.

    Args:
        line: The line of text containing the character.
        col0: 0-based index of the character within *line*.

    Returns:
        True when an ASCII identifier character borders the position; False
        otherwise or when *col0* is outside the line.
    """
    if not 0 <= col0 < len(line):
        return False
    # Only the immediate neighbours matter: an identifier run on either side
    # is non-empty exactly when the bordering character belongs to it.
    has_prefix = col0 > 0 and _is_ascii_ident_char(line[col0 - 1])
    has_suffix = col0 + 1 < len(line) and _is_ascii_ident_char(line[col0 + 1])
    return has_prefix or has_suffix
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# ---------------------------------------------------------------------------
|
|
183
|
+
# Main check function
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
|
|
186
|
+
def run(event: "WriteEvent", cfg: "GateConfig") -> List["Issue"]:
    """Scan ``event.content`` for risky Unicode and report every occurrence.

    Each character is evaluated against the rules in this order:

    * AG-BIDI (high): the character is in ``_BIDI_CONTROLS``.  Reported for
      every file; no other rule is applied to that character.
    * AG-INVIS (high): only when the file matches
      ``cfg.unicode.code_extensions``.  A character from ``_INVIS_CODE`` is
      reported when ``_in_identifier_or_string`` accepts its position; a
      U+FEFF as the very first character of the content is skipped as a BOM.
      With ``cfg.unicode.strict_zerowidth`` set, ZWNJ/ZWJ are reported when
      ``_in_ascii_identifier`` accepts their position.
    * AG-HOMO (medium): only when ``cfg.unicode.homoglyph`` is set.  A
      character from ``_ALL_HOMO`` is reported when ``_in_ascii_identifier``
      accepts its position.

    Args:
        event: The write being checked; ``content`` and ``file_path`` are read.
        cfg: Gate configuration; only its ``unicode`` section is read.

    Returns:
        The issues found, ordered by line and then column (both 1-based in
        the reported ``Issue``).  Empty when the content is empty.
    """
    found: List[Issue] = []
    text = event.content
    if not text:
        return found

    settings = cfg.unicode
    treat_as_code = _is_code_profile(event.file_path, settings.code_extensions)

    def flag(rule_id, severity, line_no, col0, ch, names, tail, suggestion):
        """Record one issue for *ch*, naming it from *names* or unicodedata."""
        cp = ord(ch)
        code = f"U+{cp:04X}"
        name = names.get(cp, unicodedata.name(ch, code))
        found.append(Issue(
            check="unicode",
            rule_id=rule_id,
            severity=severity,
            line=line_no,
            col=col0 + 1,  # reported columns are 1-based
            message=f"{code} {name}{tail}",
            excerpt=repr(ch),
            suggestion=suggestion,
        ))

    for line_no, line in enumerate(text.splitlines(), start=1):
        for col0, ch in enumerate(line):
            cp = ord(ch)

            # AG-BIDI applies to every profile and ends processing of the char.
            if cp in _BIDI_CONTROLS:
                flag(
                    "AG-BIDI", "high", line_no, col0, ch, _BIDI_NAMES, "",
                    "Remove the bidi control char; it visually reorders source.",
                )
                continue

            if treat_as_code:
                if cp in _INVIS_CODE:
                    # U+FEFF as the first character of the content is a BOM.
                    if cp == 0xFEFF and line_no == 1 and col0 == 0:
                        continue
                    if _in_identifier_or_string(line, col0):
                        flag(
                            "AG-INVIS", "high", line_no, col0, ch, _INVIS_NAMES,
                            " inside identifier/string",
                            "Remove the invisible character; it can hide malicious code.",
                        )
                elif (
                    settings.strict_zerowidth
                    and cp in _STRICT_ZEROWIDTH
                    and _in_ascii_identifier(line, col0)
                ):
                    flag(
                        "AG-INVIS", "high", line_no, col0, ch, _INVIS_NAMES,
                        " inside ASCII identifier (strict_zerowidth)",
                        "Remove ZWNJ/ZWJ from ASCII identifier; use only in appropriate script contexts.",
                    )

            # AG-HOMO is opt-in and is not restricted to code-profile files.
            if (
                settings.homoglyph
                and cp in _ALL_HOMO
                and _in_ascii_identifier(line, col0)
            ):
                flag(
                    "AG-HOMO", "medium", line_no, col0, ch, {},
                    " looks like ASCII but is Cyrillic/Greek",
                    "Replace with the visually identical ASCII character.",
                )

    return found
|