agent-write-gate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,277 @@
1
+ """unicode_safety.py -- Unicode safety check (AG-BIDI, AG-INVIS, AG-HOMO).
2
+
3
+ Rules:
4
+ AG-BIDI (high, always): bidi control characters U+202A-U+202E, U+2066-U+2069.
5
+ Essentially no legitimate use in source files; Trojan-Source vector.
6
+ AG-INVIS (high, code-context only): zero-width and invisible chars
7
+ U+200B, U+2060, U+FEFF (when not BOM at offset 0), U+00AD.
8
+ Only flagged when the file is treated as code AND the char sits
9
+ inside an identifier/string run.
10
+ U+200C/U+200D (ZWNJ/ZWJ) and U+200E/U+200F (LRM/RLM) are NOT
11
+ in AG-INVIS by default -- legitimate in Arabic/Persian/Indic text
12
+ and emoji ZWJ sequences.
13
+ strict_zerowidth=true: adds ZWNJ/ZWJ only inside ASCII-identifier runs.
14
+ AG-HOMO (medium, opt-in): Latin-looking Cyrillic/Greek codepoints inside
15
+ an otherwise-ASCII identifier.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+ import unicodedata
22
+ from typing import List, TYPE_CHECKING
23
+
24
+ if TYPE_CHECKING:
25
+ from ..model import WriteEvent, Issue
26
+ from ..config import GateConfig
27
+
28
+ from ..model import Issue
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Bidi control characters
32
+ # ---------------------------------------------------------------------------
33
+
34
# Bidirectional formatting controls come in two contiguous runs:
#   U+202A..U+202E  embeddings, overrides and their POP
#   U+2066..U+2069  isolates and their POP
_BIDI_CONTROLS = frozenset({*range(0x202A, 0x202F), *range(0x2066, 0x206A)})

# Human-readable name for every code point in _BIDI_CONTROLS.
_BIDI_NAMES = {
    # -- embeddings / overrides ------------------------------------------
    0x202A: "LEFT-TO-RIGHT EMBEDDING",
    0x202B: "RIGHT-TO-LEFT EMBEDDING",
    0x202C: "POP DIRECTIONAL FORMATTING",
    0x202D: "LEFT-TO-RIGHT OVERRIDE",
    0x202E: "RIGHT-TO-LEFT OVERRIDE",
    # -- isolates ----------------------------------------------------------
    0x2066: "LEFT-TO-RIGHT ISOLATE",
    0x2067: "RIGHT-TO-LEFT ISOLATE",
    0x2068: "FIRST STRONG ISOLATE",
    0x2069: "POP DIRECTIONAL ISOLATE",
}
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Invisible characters flagged in code context
59
+ # ---------------------------------------------------------------------------
60
+
61
# Invisible code points reported by AG-INVIS in code files:
#   U+00AD  SOFT HYPHEN
#   U+200B  ZERO WIDTH SPACE
#   U+2060  WORD JOINER
#   U+FEFF  ZERO WIDTH NO-BREAK SPACE (a BOM at offset 0, stray elsewhere)
_INVIS_CODE = frozenset({0x00AD, 0x200B, 0x2060, 0xFEFF})

# Display names for the invisible code points above plus the two
# strict_zerowidth additions (ZWNJ / ZWJ).
_INVIS_NAMES = {
    0x00AD: "SOFT HYPHEN",
    0x200B: "ZERO WIDTH SPACE",
    0x200C: "ZERO WIDTH NON-JOINER",
    0x200D: "ZERO WIDTH JOINER",
    0x2060: "WORD JOINER",
    0xFEFF: "ZERO WIDTH NO-BREAK SPACE (stray BOM)",
}

# Only checked when strict_zerowidth is enabled, and then only inside
# ASCII-identifier context: ZWNJ (U+200C) and ZWJ (U+200D).
_STRICT_ZEROWIDTH = frozenset({0x200C, 0x200D})
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Homoglyph: Cyrillic and Greek codepoints that look Latin
81
+ # ---------------------------------------------------------------------------
82
+
83
# Cyrillic code points easily mistaken for ASCII Latin letters.
# Each entry: Unicode name (CYRILLIC ... LETTER ...) ~ the ASCII look-alike.
_CYRILLIC_HOMO = frozenset({
    # capitals
    0x0410,  # CAPITAL A   ~ A
    0x0412,  # CAPITAL VE  ~ B
    0x0415,  # CAPITAL IE  ~ E
    0x041A,  # CAPITAL KA  ~ K
    0x041C,  # CAPITAL EM  ~ M
    0x041D,  # CAPITAL EN  ~ H
    0x041E,  # CAPITAL O   ~ O
    0x0420,  # CAPITAL ER  ~ P
    0x0421,  # CAPITAL ES  ~ C
    0x0422,  # CAPITAL TE  ~ T
    0x0425,  # CAPITAL HA  ~ X
    # small letters
    0x0430,  # SMALL A     ~ a
    0x0435,  # SMALL IE    ~ e
    0x043E,  # SMALL O     ~ o
    0x0440,  # SMALL ER    ~ p
    0x0441,  # SMALL ES    ~ c
    0x0443,  # SMALL U     ~ y
    0x0445,  # SMALL HA    ~ x
})

# Greek code points easily mistaken for ASCII Latin letters.
# Each entry: Unicode name (GREEK ... LETTER ...) ~ the ASCII look-alike.
_GREEK_HOMO = frozenset({
    # capitals
    0x0391,  # CAPITAL ALPHA    ~ A
    0x0392,  # CAPITAL BETA     ~ B
    0x0395,  # CAPITAL EPSILON  ~ E
    0x0396,  # CAPITAL ZETA     ~ Z
    0x0397,  # CAPITAL ETA      ~ H
    0x0399,  # CAPITAL IOTA     ~ I
    0x039A,  # CAPITAL KAPPA    ~ K
    0x039C,  # CAPITAL MU       ~ M
    0x039D,  # CAPITAL NU       ~ N
    0x039F,  # CAPITAL OMICRON  ~ O
    0x03A1,  # CAPITAL RHO      ~ P
    0x03A4,  # CAPITAL TAU      ~ T
    0x03A5,  # CAPITAL UPSILON  ~ Y
    0x03A7,  # CAPITAL CHI      ~ X
    # small letters
    0x03B1,  # SMALL ALPHA      ~ a
    0x03B5,  # SMALL EPSILON    ~ e
    0x03B9,  # SMALL IOTA       ~ i
    0x03BD,  # SMALL NU         ~ v
    0x03BF,  # SMALL OMICRON    ~ o
    0x03C1,  # SMALL RHO        ~ p
    0x03C5,  # SMALL UPSILON    ~ u
})

# Every code point AG-HOMO looks for.
_ALL_HOMO = frozenset().union(_CYRILLIC_HOMO, _GREEK_HOMO)
131
+
132
+ # ---------------------------------------------------------------------------
133
+ # File profile detection
134
+ # ---------------------------------------------------------------------------
135
+
136
def _is_code_profile(file_path: str, code_extensions: list) -> bool:
    """Return True if *file_path* should be treated as source code.

    Args:
        file_path: Path of the file being written. An empty/None path or the
            pseudo-path ``"<stdin>"`` is treated as unknown.
        code_extensions: Suffixes (e.g. ``".py"``) that mark a file as code.

    Returns:
        True when the path ends with one of the configured suffixes, compared
        case-insensitively; False for unknown paths, so that they fall back
        to the permissive doc profile.
    """
    if not file_path or file_path == "<stdin>":
        return False  # unknown -> doc profile (permissive)

    # Lower-case BOTH sides: the path is case-folded, so a configured suffix
    # such as ".R" or ".PY" could otherwise never match. Empty suffixes are
    # dropped because "" is a suffix of every string and would classify every
    # file as code.
    suffixes = tuple(ext.lower() for ext in (code_extensions or ()) if ext)

    # str.endswith accepts a tuple of candidates; an empty tuple yields False.
    return file_path.lower().endswith(suffixes)
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Context detection: is a character inside an identifier/string run?
149
+ # ---------------------------------------------------------------------------
150
+
151
# Characters that count as part of an identifier/string run: any word
# character (``\w`` is Unicode-aware for str patterns, so this is not limited
# to ASCII) plus the quote characters ", ' and `.
_IDENT_CHARS = re.compile(r'[\w"\'`]')


def _in_identifier_or_string(line: str, col0: int) -> bool:
    """Return True if the char at *col0* touches an identifier/string run.

    The test is purely local: the character immediately before or immediately
    after position *col0* must match ``_IDENT_CHARS``.

    Args:
        line: The full text of the line being scanned.
        col0: 0-based index of the character under inspection.

    Returns:
        False when *col0* is outside the line or neither neighbour is an
        identifier/quote character, True otherwise.
    """
    # Column 0 is a valid position: an invisible character that opens a line
    # and is directly followed by an identifier must still be reported. Only
    # genuinely out-of-range indexes are rejected here.
    if col0 < 0 or col0 >= len(line):
        return False

    # A missing neighbour (start/end of line) is treated as a blank.
    before = line[col0 - 1] if col0 > 0 else " "
    after = line[col0 + 1] if col0 + 1 < len(line) else " "
    return bool(_IDENT_CHARS.match(before) or _IDENT_CHARS.match(after))
162
+
163
+
164
def _in_ascii_identifier(line: str, col0: int) -> bool:
    """Return True if the char at *col0* sits in an ASCII identifier run.

    An ASCII identifier character is an ASCII letter, ASCII digit or ``_``.
    The character at *col0* itself is not inspected; it belongs to a run when
    at least one of its immediate neighbours is an ASCII identifier character.

    Args:
        line: The full text of the line being scanned.
        col0: 0-based index of the character under inspection.

    Returns:
        False when *col0* is outside the line or neither neighbour is an
        ASCII identifier character, True otherwise.
    """
    # Column 0 is a valid position: a look-alike letter that is the first
    # character of a line-initial identifier must still be detected. Only
    # genuinely out-of-range indexes are rejected here.
    if col0 < 0 or col0 >= len(line):
        return False

    def _is_ascii_ident(c: str) -> bool:
        return c.isascii() and (c.isalnum() or c == "_")

    # Walking to the ends of the surrounding run is unnecessary: the run to
    # the left (right) is non-empty exactly when the adjacent character on
    # that side is an ASCII identifier character.
    has_prefix = col0 > 0 and _is_ascii_ident(line[col0 - 1])
    has_suffix = col0 + 1 < len(line) and _is_ascii_ident(line[col0 + 1])
    return has_prefix or has_suffix
180
+
181
+
182
+ # ---------------------------------------------------------------------------
183
+ # Main check function
184
+ # ---------------------------------------------------------------------------
185
+
186
def _unicode_issue(
    rule_id: str,
    severity: str,
    line_no: int,
    col0: int,
    ch: str,
    name: str,
    detail: str,
    suggestion: str,
) -> "Issue":
    """Build one ``check="unicode"`` Issue for character *ch*.

    *line_no* is 1-based, *col0* is 0-based (converted to a 1-based column
    here). *detail* is appended verbatim to the ``U+XXXX <name>`` message.
    """
    return Issue(
        check="unicode",
        rule_id=rule_id,
        severity=severity,
        line=line_no,
        col=col0 + 1,
        message=f"U+{ord(ch):04X} {name}{detail}",
        excerpt=repr(ch),
        suggestion=suggestion,
    )


def run(event: "WriteEvent", cfg: "GateConfig") -> List["Issue"]:
    """Run the Unicode safety checks on a WriteEvent.

    Reads ``event.content`` and ``event.file_path`` plus the
    ``cfg.unicode.code_extensions``, ``cfg.unicode.strict_zerowidth`` and
    ``cfg.unicode.homoglyph`` settings.

    Rules applied to every character, in this order:
        * AG-BIDI (high): any code point in ``_BIDI_CONTROLS``, in every file.
        * AG-INVIS (high): code files only; a code point in ``_INVIS_CODE``
          that touches an identifier/string run. U+FEFF as the very first
          character of the content is accepted as a BOM.
        * AG-INVIS strict (high): code files with ``strict_zerowidth``
          enabled; ZWNJ/ZWJ next to ASCII identifier characters.
        * AG-HOMO (medium): only with ``homoglyph`` enabled; a Latin-looking
          Cyrillic/Greek code point next to ASCII identifier characters.

    Returns:
        The list of Issues found, in document order; empty when the event
        has no content.
    """
    issues: List[Issue] = []
    content = event.content
    if not content:
        return issues

    # Loop-invariant configuration, read once.
    ucfg = cfg.unicode
    is_code = _is_code_profile(event.file_path, ucfg.code_extensions)
    strict_zerowidth = is_code and ucfg.strict_zerowidth
    check_homoglyph = ucfg.homoglyph

    # NOTE(review): str.splitlines() also breaks on VT, FF, NEL, U+2028 and
    # U+2029, so reported line numbers can differ from tools that only count
    # "\n" -- confirm this matches how consumers map Issue.line back to text.
    for line_no, line in enumerate(content.splitlines(), start=1):
        # Every code point of interest is non-ASCII, so a pure-ASCII line
        # cannot produce a finding.
        if line.isascii():
            continue

        for col0, ch in enumerate(line):
            cp = ord(ch)
            fallback = f"U+{cp:04X}"

            # --- AG-BIDI: always flagged regardless of profile ---
            if cp in _BIDI_CONTROLS:
                issues.append(_unicode_issue(
                    "AG-BIDI", "high", line_no, col0, ch,
                    _BIDI_NAMES.get(cp, unicodedata.name(ch, fallback)),
                    "",
                    "Remove the bidi control char; it visually reorders source.",
                ))
                continue

            # --- AG-INVIS: code profile only, inside identifier/string run ---
            if is_code and cp in _INVIS_CODE:
                # U+FEFF as the first character of the content is a BOM
                # (benign). First line + first column identifies offset 0, so
                # no separate running offset is needed.
                if cp == 0xFEFF and line_no == 1 and col0 == 0:
                    continue
                if _in_identifier_or_string(line, col0):
                    issues.append(_unicode_issue(
                        "AG-INVIS", "high", line_no, col0, ch,
                        _INVIS_NAMES.get(cp, unicodedata.name(ch, fallback)),
                        " inside identifier/string",
                        "Remove the invisible character; it can hide malicious code.",
                    ))

            # --- AG-INVIS strict_zerowidth: ZWNJ/ZWJ in ASCII-identifier runs ---
            elif strict_zerowidth and cp in _STRICT_ZEROWIDTH:
                if _in_ascii_identifier(line, col0):
                    issues.append(_unicode_issue(
                        "AG-INVIS", "high", line_no, col0, ch,
                        _INVIS_NAMES.get(cp, unicodedata.name(ch, fallback)),
                        " inside ASCII identifier (strict_zerowidth)",
                        "Remove ZWNJ/ZWJ from ASCII identifier; use only in appropriate script contexts.",
                    ))

            # --- AG-HOMO: opt-in, medium severity ---
            # Only flagged when the neighbouring identifier text is ASCII.
            if check_homoglyph and cp in _ALL_HOMO and _in_ascii_identifier(line, col0):
                # unicodedata.name() cannot raise here because a default is
                # supplied, so no exception handling is required.
                issues.append(_unicode_issue(
                    "AG-HOMO", "medium", line_no, col0, ch,
                    unicodedata.name(ch, fallback),
                    " looks like ASCII but is Cyrillic/Greek",
                    "Replace with the visually identical ASCII character.",
                ))

    return issues