encoding-doctor 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ """
2
+ encoding-doctor
3
+ ---------------
4
+ Scan, fix, and verify file encoding issues.
5
+
6
+ Usage:
7
+ from encoding_doctor.scanner import scan_directory
8
+ from encoding_doctor.fixer import fix_directory
9
+ from encoding_doctor.verifier import verify_directory
10
+
11
+ Or via CLI:
12
+ enc-doctor scan ./my_project
13
+ enc-doctor fix ./my_project
14
+ enc-doctor verify ./my_project
15
+ """
16
+
17
# Package metadata. The version must match the "Version" field of the
# distribution metadata and the banner printed by the CLI help (0.2.0).
__version__ = "0.2.0"
__author__ = "Stateflow Labs"
__license__ = "MIT"
encoding_doctor/cli.py ADDED
@@ -0,0 +1,244 @@
1
+ """
2
+ cli.py
3
+ ------
4
+ Command-line interface for encoding-doctor.
5
+
6
+ Commands:
7
+ enc-doctor scan <path> -- detect encoding issues (FREE)
8
+ enc-doctor fix <path> -- fix detected issues (LICENSE)
9
+ enc-doctor verify <path> -- verify files are clean (LICENSE)
10
+ enc-doctor restore <file> -- restore a .bak file (LICENSE)
11
+ enc-doctor activate <key> -- activate license key
12
+ enc-doctor deactivate -- deactivate license (free up seat)
13
+ enc-doctor license -- show license status
14
+
15
+ Flags:
16
+ --dry-run (fix only) show what would change without writing
17
+ --all (scan only) show clean files too
18
+ """
19
+
20
+ import sys
21
+ import os
22
+
23
+
24
def _check_path(path: str):
    """Terminate the CLI with exit status 1 when *path* does not exist.

    Returns None without printing anything when the path exists.
    """
    if os.path.exists(path):
        return
    print(f"ERROR: path not found: {path}")
    sys.exit(1)
28
+
29
+
30
def cmd_scan(path: str, show_all: bool = False):
    """Scan a file or directory, print the report, and exit.

    Args:
        path: File or directory to scan.
        show_all: Forwarded to the reporter as ``show_clean``.

    Exits with status 1 when at least one report has issues, 0 otherwise.
    """
    from .reporter import print_scan_report
    from .scanner import scan_directory, scan_file

    if not os.path.isfile(path):
        reports = scan_directory(path)
    else:
        # scan_file may return a falsy value, in which case nothing is reported.
        single = scan_file(path)
        reports = [single] if single else []

    print_scan_report(reports, show_clean=show_all)

    exit_code = 0
    for entry in reports:
        if entry.has_issues:
            exit_code = 1
            break
    sys.exit(exit_code)
44
+
45
+
46
def cmd_fix(path: str, dry_run: bool = False):
    """Scan *path* and repair the fixable issues found (licensed command).

    Args:
        path: File or directory to fix.
        dry_run: Forwarded to the fixer and the reporter.
    """
    # The license gate runs before anything else is imported or printed.
    from .license import require_license
    require_license("fix")

    from .fixer import fix_directory
    from .reporter import print_fix_report
    from .scanner import scan_directory, scan_file

    banner = (
        "\033[93mWARNING: enc-doctor fix modifies files in-place.\033[0m",
        "\033[2mBackups will be created as .bak files.\033[0m",
        "\033[2mRun 'enc-doctor scan' first if you have not already.\033[0m\n",
    )
    for line in banner:
        print(line)

    if not os.path.isfile(path):
        reports = scan_directory(path)
    else:
        single = scan_file(path)
        reports = [single] if single else []

    outcome = fix_directory(reports, dry_run=dry_run)
    print_fix_report(outcome, dry_run=dry_run)
66
+
67
+
68
def cmd_verify(path: str):
    """Verify a file or directory and exit (licensed command).

    Exits with status 0 when every result is ok, 1 otherwise.
    """
    from .license import require_license
    require_license("verify")

    from .reporter import print_verify_report
    from .verifier import verify_directory, verify_file

    if not os.path.isfile(path):
        results = verify_directory(path)
    else:
        results = [verify_file(path)]

    print_verify_report(results)

    exit_code = 0
    for outcome in results:
        if not outcome.ok:
            exit_code = 1
            break
    sys.exit(exit_code)
84
+
85
+
86
def cmd_restore(path: str):
    """Restore *path* from its ``.bak`` sibling (licensed command).

    Exits with status 1 when no backup exists or the restore reports failure.
    """
    from .license import require_license
    require_license("restore")

    from .verifier import restore_backup

    backup_path = path + ".bak"
    if not os.path.exists(backup_path):
        print(f"ERROR: no backup found at {backup_path}")
        sys.exit(1)

    if not restore_backup(path):
        print("ERROR: restore failed")
        sys.exit(1)

    print(f"\033[92mRestored: {path}\033[0m")
    print(f"\033[2mFrom: {backup_path}\033[0m")
102
+
103
+
104
def cmd_activate(key: str):
    """Activate a license key and print the outcome.

    Exits with status 1 when activation is refused.
    """
    from .license import activate

    succeeded, message = activate(key)
    if not succeeded:
        print(f"\n \033[91m✗ {message}\033[0m\n")
        sys.exit(1)

    print(f"\n \033[92m✓ {message}\033[0m")
    print()
    print(" \033[2mLicense saved to ~/.encoding-doctor/license.json\033[0m")
    print()
    print(" Unlocked commands:")
    for row in (
        " scan \033[92m✓ free\033[0m",
        " fix \033[92m✓ licensed\033[0m",
        " verify \033[92m✓ licensed\033[0m",
        " restore \033[92m✓ licensed\033[0m",
    ):
        print(row)
    print()
121
+
122
+
123
def cmd_deactivate():
    """Deactivate the local license and print the outcome.

    Exits with status 1 when deactivation reports failure.
    """
    from .license import deactivate

    succeeded, message = deactivate()
    if succeeded:
        print(f"\n \033[92m✓ {message}\033[0m\n")
        return
    print(f"\n \033[91m✗ {message}\033[0m\n")
    sys.exit(1)
131
+
132
+
133
def cmd_license_status():
    """Print the current license status as reported by ``license.status()``."""
    from .license import status

    info = status()
    print()

    if not info["active"]:
        print(" \033[93mLicense: NOT ACTIVATED\033[0m")
        print()
        print(" Free: scan")
        print(" Paid: fix, verify, restore")
        print()
        print(" Get license → https://stateflow.dev/encoding-doctor")
        print(" Then run: enc-doctor activate <YOUR-KEY>")
        print()
        return

    print(" \033[92mLicense: ACTIVE\033[0m")
    customer = info.get("customer")
    if customer:
        print(f" Customer : {customer}")
    preview = info.get("key_preview")
    if preview:
        print(f" Key : {preview}")
    since = info.get("activated_at")
    if since:
        # Only the first 10 characters of the stored timestamp are shown.
        print(f" Since : {since[:10]}")
    print()
    print(" All commands unlocked.")
    print()
156
+
157
+
158
def main():
    """Entry point for the ``enc-doctor`` console script.

    Reads ``sys.argv``, prints the help text when called without arguments
    or with ``-h`` / ``--help``, and otherwise dispatches to the matching
    ``cmd_*`` function. Exits with status 1 on a missing argument or an
    unknown command.

    For ``scan``, ``fix`` and ``verify`` the path is the first argument that
    is not a ``--flag``, so flags may be given before or after the path.
    """
    args = sys.argv[1:]

    if not args or args[0] in ("-h", "--help"):
        print("""
encoding-doctor v0.2.0

Usage:
enc-doctor scan <path> [--all]
enc-doctor fix <path> [--dry-run]
enc-doctor verify <path>
enc-doctor restore <file>
enc-doctor activate <license-key>
enc-doctor deactivate
enc-doctor license

Commands:
scan Detect encoding issues — free forever
fix Fix detected issues — requires license
verify Confirm all files are valid UTF-8 — requires license
restore Restore a file from .bak backup — requires license
activate Activate your license key
deactivate Free up your license seat (e.g. when switching machines)
license Show current license status

Flags:
--all (scan) also show clean files
--dry-run (fix) show what would change without writing files

Get a license → https://stateflow.dev/encoding-doctor
""")
        sys.exit(0)

    command = args[0]
    rest = args[1:]
    # Previously rest[0] was always used as the path, so a leading flag such
    # as "--all" was mistaken for the path and reported as "path not found".
    positional = [arg for arg in rest if not arg.startswith("--")]

    if command == "scan":
        if not positional:
            print("ERROR: provide a path. Usage: enc-doctor scan <path>")
            sys.exit(1)
        path = positional[0]
        _check_path(path)
        show_all = "--all" in rest
        cmd_scan(path, show_all=show_all)

    elif command == "fix":
        if not positional:
            print("ERROR: provide a path. Usage: enc-doctor fix <path>")
            sys.exit(1)
        path = positional[0]
        _check_path(path)
        dry_run = "--dry-run" in rest
        cmd_fix(path, dry_run=dry_run)

    elif command == "verify":
        if not positional:
            print("ERROR: provide a path. Usage: enc-doctor verify <path>")
            sys.exit(1)
        path = positional[0]
        _check_path(path)
        cmd_verify(path)

    elif command == "restore":
        if not rest:
            print("ERROR: provide a file path. Usage: enc-doctor restore <file>")
            sys.exit(1)
        cmd_restore(rest[0])

    elif command == "activate":
        if not rest:
            print("ERROR: provide a license key. Usage: enc-doctor activate <key>")
            sys.exit(1)
        cmd_activate(rest[0])

    elif command == "deactivate":
        cmd_deactivate()

    elif command == "license":
        cmd_license_status()

    else:
        print(f"ERROR: unknown command '{command}'. Run 'enc-doctor --help'")
        sys.exit(1)
241
+
242
+
243
+ if __name__ == "__main__":
244
+ main()
@@ -0,0 +1,85 @@
1
+ """
2
+ fixer.py
3
+ --------
4
+ Fixes encoding issues detected by scanner.py.
5
+ Always creates .bak backup before modifying any file.
6
+
7
+ Based on real fix operations performed on adaptive_runtime source files.
8
+ """
9
+
10
+ import os
11
+ import shutil
12
+ from .scanner import MOJIBAKE_PATTERNS, BOM, FileReport
13
+
14
+
15
def _backup(path: str) -> str:
    """Copy *path* to ``<path>.bak`` with ``shutil.copy2`` and return the backup path."""
    backup_path = f"{path}.bak"
    shutil.copy2(path, backup_path)
    return backup_path
19
+
20
+
21
def fix_file(report: FileReport, dry_run: bool = False) -> dict:
    """Fix all fixable issues in the file described by *report*.

    The file is processed as raw bytes: a leading BOM is stripped, known
    mojibake byte sequences are replaced, CRLF becomes LF, and null bytes
    are removed.

    Args:
        report: Scan result for the file; ``report.path`` is read and,
            unless *dry_run* is set, rewritten in place.
        dry_run: When True, compute the changes but write nothing.

    Returns:
        Dict with keys ``path``, ``fixed`` (descriptions of applied fixes),
        ``skipped`` (labels of issues that need a manual fix), ``backup``
        (path of the ``.bak`` copy, or None) and ``changes`` (number of
        entries in ``fixed`` when the content changed, else 0).
    """
    result = {
        "path": report.path,
        "fixed": [],
        # Unfixable issues are always listed, including for files that also
        # have fixable issues; previously those were silently dropped.
        "skipped": [issue.label for issue in report.issues if not issue.fixable],
        "backup": None,
        "changes": 0,
    }

    if not report.fixable:
        return result

    with open(report.path, "rb") as f:
        raw = f.read()

    original = raw

    # Fix BOM
    if raw.startswith(BOM):
        raw = raw[len(BOM):]
        result["fixed"].append("BOM stripped")

    # Fix mojibake (byte-level replacement)
    for bad_bytes, good_bytes, label in MOJIBAKE_PATTERNS:
        n = raw.count(bad_bytes)
        if n:
            raw = raw.replace(bad_bytes, good_bytes)
            result["fixed"].append(f"mojibake fixed: {label} ({n}x)")

    # Fix CRLF -> LF
    if b"\r\n" in raw:
        count = raw.count(b"\r\n")
        raw = raw.replace(b"\r\n", b"\n")
        result["fixed"].append(f"CRLF -> LF ({count} lines)")

    # Fix null bytes
    if b"\x00" in raw:
        count = raw.count(b"\x00")
        raw = raw.replace(b"\x00", b"")
        result["fixed"].append(f"null bytes removed ({count})")

    if raw == original:
        return result

    result["changes"] = len(result["fixed"])

    if not dry_run:
        # The backup is taken before the file is overwritten.
        result["backup"] = _backup(report.path)
        with open(report.path, "wb") as f:
            f.write(raw)

    return result
72
+
73
+
74
def fix_directory(reports: list, dry_run: bool = False) -> list:
    """Run ``fix_file`` on every report that has issues.

    Reports with only unfixable issues are passed through as well, so their
    issues come back under ``skipped`` instead of disappearing from the
    fix report.

    Args:
        reports: List of FileReport objects from the scanner.
        dry_run: Forwarded to ``fix_file``.

    Returns:
        List of fix result dicts that have changes or skipped issues.
    """
    results = []
    for report in reports:
        if not report.has_issues:
            continue
        result = fix_file(report, dry_run=dry_run)
        if result["changes"] > 0 or result["skipped"]:
            results.append(result)
    return results
@@ -0,0 +1,193 @@
1
+ """
2
+ license.py
3
+ ----------
4
+ License validation for encoding-doctor.
5
+ Uses Lemon Squeezy API for activation/deactivation.
6
+
7
+ Commands added to CLI:
8
+ enc-doctor activate <key> -- activate license key
9
+ enc-doctor deactivate -- deactivate (free up seat)
10
+ enc-doctor license -- show license status
11
+
12
+ Free: scan
13
+ Paid: fix, verify, restore
14
+ """
15
+
16
+ import json
17
+ import os
18
+ import platform
19
+ import urllib.request
20
+ import urllib.error
21
+ from pathlib import Path
22
+
23
+ # ── Lemon Squeezy config ──────────────────────────────────────────────────────
24
+ # Replace with your own Store ID & Product ID once the product has been created in Lemon Squeezy
25
LS_STORE_ID = "399875"  # Settings → Store → Store ID
LS_PRODUCT_ID = "1123336"  # Products → click the product → ID in the URL
27
+
28
+ ACTIVATE_URL = "https://api.lemonsqueezy.com/v1/licenses/activate"
29
+ DEACTIVATE_URL = "https://api.lemonsqueezy.com/v1/licenses/deactivate"
30
+ VALIDATE_URL = "https://api.lemonsqueezy.com/v1/licenses/validate"
31
+
32
+ # ── Local storage ─────────────────────────────────────────────────────────────
33
+ LICENSE_DIR = Path.home() / ".encoding-doctor"
34
+ LICENSE_FILE = LICENSE_DIR / "license.json"
35
+
36
+ # ── Commands that require a license ──────────────────────────────────────────
37
+ PAID_COMMANDS = ("fix", "verify", "restore")
38
+ FREE_COMMANDS = ("scan",)
39
+
40
+
41
+ # ─────────────────────────────────────────────────────────────────────────────
42
+ # Internal helpers
43
+ # ─────────────────────────────────────────────────────────────────────────────
44
+
45
def _save(payload: dict):
    """Write *payload* as indented JSON to LICENSE_FILE, creating LICENSE_DIR if needed."""
    LICENSE_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2)
    LICENSE_FILE.write_text(serialized)
48
+
49
+
50
def _load() -> "dict | None":
    """Read the stored license record from LICENSE_FILE.

    Returns:
        The parsed JSON object, or None when the file is missing, unreadable,
        not valid JSON, or does not contain a JSON object.
    """
    # The annotation is quoted because an unquoted ``dict | None`` is
    # evaluated when the function is defined and raises TypeError on
    # Python 3.9.
    try:
        data = json.loads(LICENSE_FILE.read_text())
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: invalid JSON or
        # undecodable bytes.
        return None
    # Callers use dict methods on the result, so reject any other JSON value.
    return data if isinstance(data, dict) else None
55
+
56
+
57
def _post(url: str, data: dict) -> dict:
    """POST *data* as a JSON body to *url* and return the decoded JSON response.

    Uses a 10-second timeout. Errors raised by ``urllib`` or ``json``
    propagate to the caller.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json", "Accept": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=10) as response:
        return json.loads(response.read())
67
+
68
+
69
+ # ─────────────────────────────────────────────────────────────────────────────
70
+ # Public API (called from cli.py)
71
+ # ─────────────────────────────────────────────────────────────────────────────
72
+
73
def activate(key: str) -> tuple[bool, str]:
    """Activate a Lemon Squeezy license key for this machine.

    On success the license record is saved locally via ``_save``.

    Args:
        key: License key entered by the user; surrounding whitespace is
            ignored.

    Returns:
        ``(True, welcome_message)`` on success, ``(False, reason)`` otherwise.
    """
    key = key.strip()
    if not key:
        return False, "License key cannot be empty."

    try:
        resp = _post(ACTIVATE_URL, {
            "license_key": key,
            "instance_name": platform.node() or "encoding-doctor",
        })
    except urllib.error.HTTPError as e:
        try:
            body = json.loads(e.read())
            return False, f"Activation failed: {body.get('error', e.reason)}"
        except Exception:
            return False, f"Activation failed: HTTP {e.code}"
    except Exception as e:
        return False, f"Network error: {e}"

    if not resp.get("activated"):
        return False, resp.get("error") or "Activation failed — check your key and try again."

    # Read the response defensively so a malformed reply produces an error
    # message instead of a KeyError traceback.
    instance = resp.get("instance") or {}
    meta = resp.get("meta") or {}
    license_info = resp.get("license_key") or {}

    instance_id = instance.get("id")
    if not instance_id:
        return False, "Activation failed: unexpected response from the license server."

    # The activation endpoint accepts keys from any store, so confirm the
    # key belongs to this product before unlocking anything.
    store_matches = str(meta.get("store_id", "")) == LS_STORE_ID
    product_matches = str(meta.get("product_id", "")) == LS_PRODUCT_ID
    if not (store_matches and product_matches):
        try:
            # Best effort: release the seat just taken on the other product.
            _post(DEACTIVATE_URL, {"license_key": key, "instance_id": instance_id})
        except Exception:
            pass  # the key is rejected below either way
        return False, "Activation failed: this license key is not valid for encoding-doctor."

    payload = {
        "key": key,
        "instance_id": instance_id,
        "customer": meta.get("customer_email", ""),
        "store_id": meta.get("store_id", ""),
        "product_id": meta.get("product_id", ""),
        # Prefer the instance timestamp (when this machine was activated);
        # fall back to the license key's creation date.
        "activated_at": instance.get("created_at") or license_info.get("created_at", ""),
    }
    _save(payload)

    customer = payload["customer"] or "user"
    return True, f"License activated. Welcome, {customer}!"
111
+
112
+
113
def deactivate() -> tuple[bool, str]:
    """Deactivate this machine's license seat and remove the local record.

    The local license file is removed even when the license server cannot
    be reached; in that case the returned message says the seat may still
    be occupied.

    Returns:
        ``(True, message)`` when the local license was removed,
        ``(False, reason)`` when no license is stored or the local file
        could not be deleted.
    """
    data = _load()
    if not data:
        return False, "No active license found on this machine."

    key = data.get("key")
    instance_id = data.get("instance_id")

    seat_released = False
    if key and instance_id:
        try:
            _post(DEACTIVATE_URL, {
                "license_key": key,
                "instance_id": instance_id,
            })
            seat_released = True
        except Exception:
            # Deliberate best effort: a server failure must not keep the
            # local license in place. It is reported in the message below.
            seat_released = False

    try:
        LICENSE_FILE.unlink()
    except FileNotFoundError:
        pass  # already gone
    except OSError as exc:
        return False, f"Could not remove the local license file: {exc}"

    if seat_released:
        return True, "License deactivated. You can now activate on another machine."
    return True, (
        "Local license removed, but the license server could not confirm the "
        "deactivation — the seat may still be in use."
    )
136
+
137
+
138
def validate() -> tuple[bool, str]:
    """Check the locally stored license without any network call.

    Returns:
        ``(True, customer)`` when a record with both ``key`` and
        ``instance_id`` exists (``customer`` defaults to ``"licensed"``),
        ``(False, "not_activated")`` when no record can be loaded, or
        ``(False, "corrupted")`` when either field is missing or empty.
    """
    record = _load()
    if not record:
        return False, "not_activated"

    has_key = bool(record.get("key"))
    has_instance = bool(record.get("instance_id"))
    if has_key and has_instance:
        return True, record.get("customer", "licensed")
    return False, "corrupted"
150
+
151
+
152
def status() -> dict:
    """Return license information for display.

    A stored record without a ``key`` or ``instance_id`` is reported as
    inactive, matching ``validate()``, which treats such a record as
    corrupted.

    Returns:
        ``{"active": False}`` when no usable license is stored, otherwise a
        dict with ``active``, ``customer``, ``activated_at`` and
        ``key_preview`` (first and last four characters of the key).
    """
    data = _load()
    if not data or not data.get("key") or not data.get("instance_id"):
        return {"active": False}

    # str() guards against a hand-edited file holding a non-string key.
    key = str(data["key"])
    return {
        "active": True,
        "customer": data.get("customer", ""),
        "activated_at": data.get("activated_at", ""),
        "key_preview": key[:4] + "-****-****-" + key[-4:],
    }
163
+
164
+
165
+ # ─────────────────────────────────────────────────────────────────────────────
166
+ # Gate (call this before any paid command)
167
+ # ─────────────────────────────────────────────────────────────────────────────
168
+
169
def require_license(command: str):
    """Gate for paid commands; call it at the top of fix / verify / restore.

    Returns silently when *command* is not in PAID_COMMANDS or when
    ``validate()`` reports a license. Otherwise prints the purchase
    instructions and exits with status 1.
    """
    if command not in PAID_COMMANDS:
        return  # free command, no check needed

    licensed, _reason = validate()
    if licensed:
        return  # licensed, proceed

    # Not licensed: print the gate message and exit.
    gate_message = (
        "",
        " \033[93menc-doctor: license required\033[0m",
        "",
        f" '\033[1m{command}\033[0m' is a paid feature.",
        " '\033[1mscan\033[0m' is free — run that first to see what needs fixing.",
        "",
        " To unlock fix, verify, and restore:",
        " 1. Get your license → https://stateflow.dev/encoding-doctor",
        " 2. Run: enc-doctor activate <YOUR-KEY>",
        "",
    )
    for line in gate_message:
        print(line)

    import sys
    sys.exit(1)
@@ -0,0 +1,107 @@
1
+ """
2
+ reporter.py
3
+ -----------
4
+ Formats scan, fix, and verify results for terminal output.
5
+ """
6
+
7
+ import os
8
+
9
+ GREEN = "\033[92m"
10
+ YELLOW = "\033[93m"
11
+ RED = "\033[91m"
12
+ CYAN = "\033[96m"
13
+ DIM = "\033[2m"
14
+ RESET = "\033[0m"
15
+ BOLD = "\033[1m"
16
+
17
+
18
def _rel(path: str) -> str:
    """Return *path* relative to the current directory, or *path* unchanged
    when ``os.path.relpath`` raises ValueError."""
    try:
        relative = os.path.relpath(path)
    except ValueError:
        relative = path
    return relative
23
+
24
+
25
def print_scan_report(reports: list, show_clean: bool = False):
    """Print a summary of scan results to the terminal.

    Args:
        reports: List of FileReport objects from the scanner.
        show_clean: When True, also list files without issues. This now
            applies when no file has issues as well; previously the list
            was skipped in that case.
    """
    issues = [r for r in reports if r.has_issues]
    clean = [r for r in reports if not r.has_issues]

    print(f"\n{BOLD}SCAN REPORT{RESET}")
    print(f"{DIM}{'='*60}{RESET}")
    print(f" Total files scanned : {len(reports)}")
    print(f" Files with issues : {YELLOW}{len(issues)}{RESET}")
    print(f" Clean files : {GREEN}{len(clean)}{RESET}")
    print()

    if not issues:
        print(f" {GREEN}All files are clean.{RESET}")
        if show_clean:
            for report in clean:
                print(f" {GREEN}OK{RESET} {_rel(report.path)}")
        return

    for report in issues:
        print(f" {YELLOW}WARN{RESET} {_rel(report.path)}")
        for issue in report.issues:
            # A count of 0 means the issue has no meaningful count.
            count_str = f" ({issue.count}x)" if issue.count else ""
            fix_str = "" if issue.fixable else f" {DIM}[manual fix required]{RESET}"
            print(f" {DIM}>{RESET} {issue.label}{count_str}{fix_str}")

    if show_clean:
        for report in clean:
            print(f" {GREEN}OK{RESET} {_rel(report.path)}")

    print()
    print(f" {YELLOW}Run 'enc-doctor fix <path>' to repair fixable issues.{RESET}")
    print(f" {DIM}Review this report carefully before running fix.{RESET}")
    print()
55
+
56
+
57
def print_fix_report(results: list, dry_run: bool = False):
    """Print the outcome of a fix run (or a dry run) to the terminal.

    Args:
        results: Fix result dicts with the keys ``path``, ``fixed``,
            ``skipped``, ``backup`` and ``changes``.
        dry_run: Selects the "DRY RUN" heading and "Would fix" wording, and
            suppresses the backup lines.
    """
    heading = "DRY RUN" if dry_run else "FIX REPORT"
    print(f"\n{BOLD}{heading}{RESET}")
    print(f"{DIM}{'=' * 60}{RESET}")

    for entry in results:
        shown_path = entry["path"]
        if entry["changes"]:
            print(f" {GREEN}FIXED{RESET} {_rel(shown_path)}")
            for change in entry["fixed"]:
                print(f" {DIM}>{RESET} {change}")
            if entry["backup"] and not dry_run:
                print(f" {DIM}backup: {_rel(entry['backup'])}{RESET}")
        if entry["skipped"]:
            print(f" {YELLOW}SKIP{RESET} {_rel(shown_path)}")
            for skip in entry["skipped"]:
                print(f" {DIM}>{RESET} {skip} (manual fix required)")

    total_changes = sum(entry["changes"] for entry in results if entry["changes"])

    print()
    if not total_changes:
        print(f" {GREEN}No changes made.{RESET}")
    else:
        action = "Would fix" if dry_run else "Fixed"
        print(f" {GREEN}{action} {total_changes} issue(s).{RESET}")
        if not dry_run:
            print(f" {DIM}Backups saved as .bak{RESET}")
            print(f" {DIM}Run 'enc-doctor verify <path>' to confirm.{RESET}")
    print()
86
+
87
+
88
def print_verify_report(results: list):
    """Print the outcome of a verify run: each failure, then a pass count."""
    passed = []
    failed = []
    for outcome in results:
        (passed if outcome.ok else failed).append(outcome)

    print(f"\n{BOLD}VERIFY REPORT{RESET}")
    print(f"{DIM}{'=' * 60}{RESET}")

    for outcome in failed:
        print(f" {RED}FAIL{RESET} {_rel(outcome.path)}")
        print(f" {DIM}>{RESET} {outcome.error}")

    print()
    print(f" {GREEN}PASS{RESET} {len(passed)} / {len(results)} files")

    if failed:
        print(f" {RED}{len(failed)} file(s) still have issues.{RESET}")
        print(f" {DIM}Run 'enc-doctor fix <path>' or restore from .bak{RESET}")
    else:
        print(f" {GREEN}All files valid UTF-8. Safe to commit.{RESET}")
    print()
@@ -0,0 +1,138 @@
1
+ """
2
+ scanner.py
3
+ ----------
4
+ Scans files and directories for encoding issues.
5
+
6
+ Detects:
7
+ - Mojibake (UTF-8 bytes re-encoded as cp1252)
8
+ - BOM (Byte Order Mark)
9
+ - CRLF line endings
10
+ - Null bytes
11
+ - Non-UTF-8 encoding
12
+
13
+ Based on real encoding bugs found in production Python projects on Windows.
14
+ """
15
+
16
+ import os
17
+ from dataclasses import dataclass, field
18
+ from typing import Optional
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Mojibake byte patterns identified from real projects.
22
+ # Each: (corrupt_bytes, correct_utf8_bytes, human_label)
23
+ # ---------------------------------------------------------------------------
24
+ MOJIBAKE_PATTERNS = [
25
+ (b"\xc3\xa2\xe2\x80\xa0\xe2\x80\x99", "\u2192".encode("utf-8"), "arrow ->"),
26
+ (b"\xc3\xa2\xe2\x82\xac\xe2\x80\x9d", "\u2014".encode("utf-8"), "em dash --"),
27
+ (b"\xc3\xa2\xe2\x80\xa2\xc2\xac", "\u2550".encode("utf-8"), "box ="),
28
+ (b"\xc3\xa2\xe2\x80\x9c\xc2\x80", "\u2500".encode("utf-8"), "box -"),
29
+ (b"\xc3\xa2\xe2\x82\xac\xc2\xa2", "\u2022".encode("utf-8"), "bullet"),
30
+ (b"\xc3\xa2\xe2\x82\xac\xe2\x84\xa2", "\u2019".encode("utf-8"), "right quote"),
31
+ (b"\xc3\xa2\xe2\x82\xac\xc5\x93", "\u201c".encode("utf-8"), "left curly quote"),
32
+ (b"\xc3\xa2\xe2\x82\xac\xc2\x9d", "\u201d".encode("utf-8"), "right curly quote"),
33
+ ]
34
+
35
+ BOM = b"\xef\xbb\xbf"
36
+
37
+ TEXT_EXTENSIONS = {
38
+ ".py", ".txt", ".md", ".rst", ".json", ".yaml", ".yml",
39
+ ".toml", ".cfg", ".ini", ".env", ".sh", ".bat", ".ps1",
40
+ ".js", ".ts", ".jsx", ".tsx", ".css", ".html", ".htm",
41
+ ".xml", ".csv", ".tsv", ".sql", ".rb", ".php", ".go",
42
+ }
43
+
44
+ SKIP_DIRS = {
45
+ "__pycache__", ".git", ".venv", "venv", "env", "node_modules",
46
+ ".mypy_cache", ".pytest_cache", "dist", "build", ".eggs",
47
+ }
48
+
49
+
50
@dataclass
class FileIssue:
    """A single encoding problem detected in a file by the scanner."""

    # Category set by scan_file: "bom", "mojibake", "crlf", "null_bytes" or "non_utf8".
    kind: str
    # Human-readable description shown in reports.
    label: str
    # Number of occurrences; the reporter omits the count when it is 0.
    count: int = 0
    # False when the issue needs a manual fix (scan_file sets this for "non_utf8").
    fixable: bool = True
56
+
57
+
58
@dataclass
class FileReport:
    """Scan result for one file: its path, probed encoding, size in bytes,
    and the list of issues found."""

    path: str
    encoding: str
    size: int
    issues: list = field(default_factory=list)

    @property
    def has_issues(self):
        """True when at least one issue was recorded."""
        return bool(self.issues)

    @property
    def fixable(self):
        """True when at least one recorded issue is marked fixable."""
        for issue in self.issues:
            if issue.fixable:
                return True
        return False
72
+
73
+
74
def _probe_encoding(raw: bytes) -> str:
    """Classify *raw* as "UTF-8 BOM", "UTF-8" or "non-UTF-8".

    The BOM check comes first, so data starting with BOM is reported as
    "UTF-8 BOM" without being decoded.
    """
    if raw.startswith(BOM):
        return "UTF-8 BOM"
    try:
        raw.decode("utf-8")
    except UnicodeDecodeError:
        return "non-UTF-8"
    return "UTF-8"
82
+
83
+
84
def _is_binary(raw: bytes) -> bool:
    """Heuristically decide whether *raw* looks like binary data.

    Looks at the first 512 bytes only and returns True when more than 30%
    of them are control bytes: values below 9, or 14 through 31 except 27.
    Empty input is not binary.
    """
    head = raw[:512]
    if len(head) == 0:
        return False

    suspicious = 0
    for byte in head:
        low_control = byte < 9
        other_control = 14 <= byte <= 31 and byte != 27
        if low_control or other_control:
            suspicious += 1
    return (suspicious / len(head)) > 0.30
90
+
91
+
92
def scan_file(path: str) -> Optional[FileReport]:
    """Scan a single file for encoding issues.

    Args:
        path: File to scan.

    Returns:
        A FileReport (possibly with no issues), or None when the file's
        extension is not in TEXT_EXTENSIONS, the file cannot be read, or
        its content looks binary.
    """
    name = os.path.basename(path)
    ext = os.path.splitext(name)[1].lower()
    if not ext and name.startswith("."):
        # os.path.splitext reports no extension for a dotfile such as
        # ".env", so match the whole name against TEXT_EXTENSIONS instead.
        ext = name.lower()
    if ext not in TEXT_EXTENSIONS:
        return None

    try:
        with open(path, "rb") as f:
            raw = f.read()
    except OSError:  # includes PermissionError
        return None
    if _is_binary(raw):
        return None

    enc = _probe_encoding(raw)
    report = FileReport(path=path, encoding=enc, size=len(raw))

    if raw.startswith(BOM):
        report.issues.append(FileIssue(kind="bom", label="BOM (\\xef\\xbb\\xbf) detected", count=1))

    for bad_bytes, _, label in MOJIBAKE_PATTERNS:
        n = raw.count(bad_bytes)
        if n:
            report.issues.append(FileIssue(kind="mojibake", label=f"mojibake: {label}", count=n))

    crlf_count = raw.count(b"\r\n")
    if crlf_count:
        report.issues.append(FileIssue(kind="crlf", label="CRLF line endings", count=crlf_count))

    null_count = raw.count(b"\x00")
    if null_count:
        report.issues.append(FileIssue(kind="null_bytes", label="null bytes", count=null_count))

    if enc == "non-UTF-8":
        # Reported only; marked unfixable so it is left for manual conversion.
        report.issues.append(FileIssue(kind="non_utf8", label="non-UTF-8 encoding", fixable=False))

    return report
127
+
128
+
129
def scan_directory(path: str) -> list:
    """Walk *path* and return the reports of every file ``scan_file`` accepts.

    Directories named in SKIP_DIRS are not entered. Files within each
    directory are visited in sorted name order.
    """
    found = []
    for root, dirs, files in os.walk(path):
        # In-place assignment so os.walk does not descend into skipped dirs.
        dirs[:] = [name for name in dirs if name not in SKIP_DIRS]
        for fname in sorted(files):
            report = scan_file(os.path.join(root, fname))
            if report is None:
                continue
            found.append(report)
    return found
@@ -0,0 +1,67 @@
1
+ """
2
+ verifier.py
3
+ -----------
4
+ Verifies that files are valid UTF-8 after fixing.
5
+ Restores from .bak if needed.
6
+ """
7
+
8
+ import os
9
+ import shutil
10
+ from dataclasses import dataclass
11
+
12
+
13
@dataclass
class VerifyResult:
    """Outcome of verifying a single file."""

    # Path of the verified file.
    path: str
    # True when the file passed every check in verify_file.
    ok: bool
    # Label set by verify_file: "UTF-8", "UTF-8 BOM", "non-UTF-8" or "unknown".
    encoding: str
    # Failure reason; empty when ok is True.
    error: str = ""
19
+
20
+
21
def verify_file(path: str) -> VerifyResult:
    """Check that a file is clean: no BOM, valid UTF-8, no CRLF, no null bytes.

    The null-byte check matches the scanner and fixer, which report and
    remove null bytes; without it a file could pass verification while
    still being flagged by a scan.

    Args:
        path: File to verify.

    Returns:
        A VerifyResult; ``ok`` is False with a reason in ``error`` when a
        check fails or the file cannot be read.
    """
    try:
        with open(path, "rb") as f:
            raw = f.read()

        # Check for BOM (should have been removed by fixer)
        if raw.startswith(b"\xef\xbb\xbf"):
            return VerifyResult(path=path, ok=False, encoding="UTF-8 BOM", error="BOM still present")

        # Validate UTF-8
        raw.decode("utf-8")

        # Check for remaining CRLF
        if b"\r\n" in raw:
            return VerifyResult(path=path, ok=False, encoding="UTF-8", error="CRLF still present")

        # Null bytes decode as valid UTF-8, so they need an explicit check.
        if b"\x00" in raw:
            return VerifyResult(path=path, ok=False, encoding="UTF-8", error="null bytes still present")

        return VerifyResult(path=path, ok=True, encoding="UTF-8")

    except UnicodeDecodeError as e:
        return VerifyResult(path=path, ok=False, encoding="non-UTF-8", error=str(e))
    except OSError as e:  # includes PermissionError
        return VerifyResult(path=path, ok=False, encoding="unknown", error=str(e))
43
+
44
+
45
def verify_directory(path: str, extensions: set = None) -> list:
    """Verify every matching file under *path* and return the results.

    Args:
        path: Directory to walk; directories named in SKIP_DIRS are not
            entered.
        extensions: Lower-case extensions (with the dot) to include. A
            falsy value selects the scanner's TEXT_EXTENSIONS.

    Returns:
        List of VerifyResult objects; files within each directory are
        visited in sorted name order.
    """
    from .scanner import TEXT_EXTENSIONS, SKIP_DIRS

    wanted = extensions if extensions else TEXT_EXTENSIONS
    collected = []

    for root, dirs, files in os.walk(path):
        dirs[:] = [name for name in dirs if name not in SKIP_DIRS]
        for fname in sorted(files):
            suffix = os.path.splitext(fname)[1].lower()
            if suffix not in wanted:
                continue
            collected.append(verify_file(os.path.join(root, fname)))

    return collected
59
+
60
+
61
def restore_backup(path: str) -> bool:
    """Copy ``<path>.bak`` over *path*.

    Returns True when the backup existed and was copied, False when there
    is no backup. The ``.bak`` file is left in place.
    """
    backup_path = f"{path}.bak"
    if not os.path.exists(backup_path):
        return False
    shutil.copy2(backup_path, path)
    return True
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: encoding-doctor
3
+ Version: 0.2.0
4
+ Summary: Scan, fix, and verify file encoding issues. Mojibake, BOM, CRLF, null bytes — fixed in one command.
5
+ Author-email: Stateflow Labs <stateflow.labs@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://stateflow-dev.github.io/stateflowlabs/
8
+ Project-URL: Repository, https://github.com/stateflow-dev/encoding-doctor
9
+ Keywords: encoding,utf-8,mojibake,bom,crlf,file encoding,encoding fix,encoding repair,python encoding,encoding tool,developer tools
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing
21
+ Classifier: Topic :: Utilities
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+
25
+ # encoding-doctor
26
+
27
+ **Scan, fix, and verify file encoding issues across your project in one command.**
28
+
29
+ Detects mojibake, BOM, CRLF line endings, null bytes, and non-UTF-8 encoding.
30
+ Mojibake, BOM, CRLF, and null bytes are repaired automatically, with backups created before every change; non-UTF-8 files are flagged for manual conversion.
31
+
32
+ Built from real encoding bugs found in production Python projects on Windows.
33
+
34
+ ---
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ pip install encoding-doctor
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Usage
45
+
46
+ ```bash
47
+ # Step 1 — scan first, always
48
+ enc-doctor scan ./my_project
49
+
50
+ # Step 2 — preview changes without writing
51
+ enc-doctor fix ./my_project --dry-run
52
+
53
+ # Step 3 — fix (backups created automatically as .bak)
54
+ enc-doctor fix ./my_project
55
+
56
+ # Step 4 — verify everything is clean
57
+ enc-doctor verify ./my_project
58
+ ```
59
+
60
+ ---
61
+
62
+ ## What it fixes
63
+
64
+ | Problem | Description |
65
+ |---|---|
66
+ | **Mojibake** | UTF-8 bytes mis-read as cp1252 and saved as garbage |
67
+ | **BOM** | `\xef\xbb\xbf` prefix added by Notepad/Excel that breaks parsers |
68
+ | **CRLF** | Windows `\r\n` mixed with Unix `\n` — causes Git diff noise |
69
+ | **Null bytes** | Binary corruption from FTP or terminal copy-paste |
70
+ | **Non-UTF-8** | Detected and flagged for manual conversion |
71
+
72
+ ---
73
+
74
+ ## Warning
75
+
76
+ > **encoding-doctor modifies files in-place.**
77
+ >
78
+ > - Always run `scan` first and review the report before running `fix`.
79
+ > - Backups are created automatically as `.bak` files.
80
+ > - Run on a Git-tracked project so you can always revert with `git checkout .`
81
+ > - Do not run `fix` on production files without testing first.
82
+ > - `verify` after every fix before committing.
83
+
84
+ ---
85
+
86
+ ## Options
87
+
88
+ ```bash
89
+ enc-doctor scan <path> [--all] # --all shows clean files too
90
+ enc-doctor fix <path> [--dry-run] # --dry-run previews without writing
91
+ enc-doctor verify <path>
92
+ enc-doctor restore <file> # restore single file from .bak
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Run tests
98
+
99
+ ```bash
100
+ pip install pytest
101
+ pytest tests/ -v
102
+ ```
103
+
104
+ ---
105
+
106
+ ## License
107
+
108
+ MIT © [Stateflow Labs](https://github.com/stateflow-dev)
@@ -0,0 +1,12 @@
1
+ encoding_doctor/__init__.py,sha256=fiW5ZlN8xXzfS3i-nxOeZBkgKmYK47kkh7sxfOmqHOM,443
2
+ encoding_doctor/cli.py,sha256=J2Ae2M5mOcn5X8Bl813WbyK_ZB5eAipKVWjZZggWZTY,7026
3
+ encoding_doctor/fixer.py,sha256=PAGb0oYuPqfoyQT8zKaBBFW26jB-bvSWYVASmaYgaIQ,2314
4
+ encoding_doctor/license.py,sha256=w8_sXfecpQXH_YhyfksqwZDXabQWDSjBkm6sFqLHCnU,7369
5
+ encoding_doctor/reporter.py,sha256=_16iToFSUGSr6oZSd-CRDYxScz6Z-rlfLvinunrSYZE,3409
6
+ encoding_doctor/scanner.py,sha256=D67IY6QgInRw1s2aGBwGzDBBVOM6qgiOuim6-I5NhSo,4170
7
+ encoding_doctor/verifier.py,sha256=e2wsVnNzbUY7oDDS1pSj76frGjpph4hswDfLVmMbNYs,1885
8
+ encoding_doctor-0.2.0.dist-info/METADATA,sha256=-4EOOPa8a7ZlAvODn6lcS745MEfegYg14kbhIcz4mY0,3211
9
+ encoding_doctor-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ encoding_doctor-0.2.0.dist-info/entry_points.txt,sha256=O6xaVIl9Yj2yMlwNwSEYaG5yWK_HXDizwev5z3xzbfA,56
11
+ encoding_doctor-0.2.0.dist-info/top_level.txt,sha256=vz2QN3W3Vl13Zc3BZqLUDGFPlXTdO3IEZ3RlwTPxopQ,16
12
+ encoding_doctor-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ enc-doctor = encoding_doctor.cli:main
@@ -0,0 +1 @@
1
+ encoding_doctor