encoding-doctor 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: encoding-doctor
3
+ Version: 0.2.0
4
+ Summary: Scan, fix, and verify file encoding issues. Mojibake, BOM, CRLF, null bytes — fixed in one command.
5
+ Author-email: Stateflow Labs <stateflow.labs@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://stateflow-dev.github.io/stateflowlabs/
8
+ Project-URL: Repository, https://github.com/stateflow-dev/encoding-doctor
9
+ Keywords: encoding,utf-8,mojibake,bom,crlf,file encoding,encoding fix,encoding repair,python encoding,encoding tool,developer tools
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing
21
+ Classifier: Topic :: Utilities
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+
25
+ # encoding-doctor
26
+
27
+ **Scan, fix, and verify file encoding issues across your project in one command.**
28
+
29
+ Fixes mojibake, BOM, CRLF line endings, and null bytes —
30
+ automatically detected and repaired, with backups created before every change. Non-UTF-8 files are detected and flagged for manual conversion.
31
+
32
+ Built from real encoding bugs found in production Python projects on Windows.
33
+
34
+ ---
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ pip install encoding-doctor
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Usage
45
+
46
+ ```bash
47
+ # Step 1 — scan first, always
48
+ enc-doctor scan ./my_project
49
+
50
+ # Step 2 — preview changes without writing
51
+ enc-doctor fix ./my_project --dry-run
52
+
53
+ # Step 3 — fix (backups created automatically as .bak)
54
+ enc-doctor fix ./my_project
55
+
56
+ # Step 4 — verify everything is clean
57
+ enc-doctor verify ./my_project
58
+ ```
59
+
60
+ ---
61
+
62
+ ## What it fixes
63
+
64
+ | Problem | Description |
65
+ |---|---|
66
+ | **Mojibake** | UTF-8 bytes mis-read as cp1252 and saved as garbage |
67
+ | **BOM** | `\xef\xbb\xbf` prefix added by Notepad/Excel that breaks parsers |
68
+ | **CRLF** | Windows `\r\n` mixed with Unix `\n` — causes Git diff noise |
69
+ | **Null bytes** | Binary corruption from FTP or terminal copy-paste |
70
+ | **Non-UTF-8** | Detected and flagged for manual conversion |
71
+
72
+ ---
73
+
74
+ ## Warning
75
+
76
+ > **encoding-doctor modifies files in-place.**
77
+ >
78
+ > - Always run `scan` first and review the report before running `fix`.
79
+ > - Backups are created automatically as `.bak` files.
80
+ > - Run on a Git-tracked project so you can always revert with `git checkout .`
81
+ > - Do not run `fix` on production files without testing first.
82
+ > - Run `verify` after every fix, before committing.
83
+
84
+ ---
85
+
86
+ ## Options
87
+
88
+ ```bash
89
+ enc-doctor scan <path> [--all] # --all shows clean files too
90
+ enc-doctor fix <path> [--dry-run] # --dry-run previews without writing
91
+ enc-doctor verify <path>
92
+ enc-doctor restore <file> # restore single file from .bak
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Run tests
98
+
99
+ ```bash
100
+ pip install pytest
101
+ pytest tests/ -v
102
+ ```
103
+
104
+ ---
105
+
106
+ ## License
107
+
108
+ MIT © [Stateflow Labs](https://github.com/stateflow-dev)
@@ -0,0 +1,84 @@
1
+ # encoding-doctor
2
+
3
+ **Scan, fix, and verify file encoding issues across your project in one command.**
4
+
5
+ Fixes mojibake, BOM, CRLF line endings, and null bytes —
6
+ automatically detected and repaired, with backups created before every change. Non-UTF-8 files are detected and flagged for manual conversion.
7
+
8
+ Built from real encoding bugs found in production Python projects on Windows.
9
+
10
+ ---
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install encoding-doctor
16
+ ```
17
+
18
+ ---
19
+
20
+ ## Usage
21
+
22
+ ```bash
23
+ # Step 1 — scan first, always
24
+ enc-doctor scan ./my_project
25
+
26
+ # Step 2 — preview changes without writing
27
+ enc-doctor fix ./my_project --dry-run
28
+
29
+ # Step 3 — fix (backups created automatically as .bak)
30
+ enc-doctor fix ./my_project
31
+
32
+ # Step 4 — verify everything is clean
33
+ enc-doctor verify ./my_project
34
+ ```
35
+
36
+ ---
37
+
38
+ ## What it fixes
39
+
40
+ | Problem | Description |
41
+ |---|---|
42
+ | **Mojibake** | UTF-8 bytes mis-read as cp1252 and saved as garbage |
43
+ | **BOM** | `\xef\xbb\xbf` prefix added by Notepad/Excel that breaks parsers |
44
+ | **CRLF** | Windows `\r\n` mixed with Unix `\n` — causes Git diff noise |
45
+ | **Null bytes** | Binary corruption from FTP or terminal copy-paste |
46
+ | **Non-UTF-8** | Detected and flagged for manual conversion |
47
+
48
+ ---
49
+
50
+ ## Warning
51
+
52
+ > **encoding-doctor modifies files in-place.**
53
+ >
54
+ > - Always run `scan` first and review the report before running `fix`.
55
+ > - Backups are created automatically as `.bak` files.
56
+ > - Run on a Git-tracked project so you can always revert with `git checkout .`
57
+ > - Do not run `fix` on production files without testing first.
58
+ > - Run `verify` after every fix, before committing.
59
+
60
+ ---
61
+
62
+ ## Options
63
+
64
+ ```bash
65
+ enc-doctor scan <path> [--all] # --all shows clean files too
66
+ enc-doctor fix <path> [--dry-run] # --dry-run previews without writing
67
+ enc-doctor verify <path>
68
+ enc-doctor restore <file> # restore single file from .bak
69
+ ```
70
+
71
+ ---
72
+
73
+ ## Run tests
74
+
75
+ ```bash
76
+ pip install pytest
77
+ pytest tests/ -v
78
+ ```
79
+
80
+ ---
81
+
82
+ ## License
83
+
84
+ MIT © [Stateflow Labs](https://github.com/stateflow-dev)
@@ -0,0 +1,19 @@
1
+ """
2
+ encoding-doctor
3
+ ---------------
4
+ Scan, fix, and verify file encoding issues.
5
+
6
+ Usage:
7
+ from encoding_doctor.scanner import scan_directory
8
+ from encoding_doctor.fixer import fix_directory
9
+ from encoding_doctor.verifier import verify_directory
10
+
11
+ Or via CLI:
12
+ enc-doctor scan ./my_project
13
+ enc-doctor fix ./my_project
14
+ enc-doctor verify ./my_project
15
+ """
16
+
17
# Package identity. The version must match the one published in the package
# metadata and shown in the CLI help banner (both 0.2.0 for this release).
__version__ = "0.2.0"
__author__ = "Stateflow Labs"
__license__ = "MIT"
@@ -0,0 +1,244 @@
1
+ """
2
+ cli.py
3
+ ------
4
+ Command-line interface for encoding-doctor.
5
+
6
+ Commands:
7
+ enc-doctor scan <path> -- detect encoding issues (FREE)
8
+ enc-doctor fix <path> -- fix detected issues (LICENSE)
9
+ enc-doctor verify <path> -- verify files are clean (LICENSE)
10
+ enc-doctor restore <file> -- restore a .bak file (LICENSE)
11
+ enc-doctor activate <key> -- activate license key
12
+ enc-doctor deactivate -- deactivate license (free up seat)
13
+ enc-doctor license -- show license status
14
+
15
+ Flags:
16
+ --dry-run (fix only) show what would change without writing
17
+ --all (scan only) show clean files too
18
+ """
19
+
20
+ import sys
21
+ import os
22
+
23
+
24
+ def _check_path(path: str):
25
+ if not os.path.exists(path):
26
+ print(f"ERROR: path not found: {path}")
27
+ sys.exit(1)
28
+
29
+
30
def cmd_scan(path: str, show_all: bool = False):
    """Scan a file or directory, print the report, and exit.

    Args:
        path: File or directory to scan. A regular file is scanned with
            ``scan_file``; anything else is handed to ``scan_directory``.
        show_all: Forwarded to the reporter as ``show_clean``.

    Exits with status 1 when at least one report has issues, else 0.
    """
    from .scanner import scan_directory, scan_file
    from .reporter import print_scan_report

    if not os.path.isfile(path):
        reports = scan_directory(path)
    else:
        # scan_file may return a falsy value; treat that as "nothing to report".
        single = scan_file(path)
        reports = [single] if single else []

    print_scan_report(reports, show_clean=show_all)

    # A non-zero exit status lets scripts and CI detect dirty files.
    for entry in reports:
        if entry.has_issues:
            sys.exit(1)
    sys.exit(0)
44
+
45
+
46
def cmd_fix(path: str, dry_run: bool = False):
    """Scan *path*, apply fixes to the scanned reports, and print the outcome.

    ``require_license("fix")`` is called before anything else; presumably it
    aborts when no license is active -- confirm in license.py.

    Args:
        path: File or directory to fix.
        dry_run: Forwarded to ``fix_directory`` and to the reporter.
    """
    from .license import require_license
    require_license("fix")

    from .scanner import scan_directory, scan_file
    from .fixer import fix_directory
    from .reporter import print_fix_report

    # Safety banner, shown on every invocation (dry runs included).
    banner = (
        "\033[93mWARNING: enc-doctor fix modifies files in-place.\033[0m",
        "\033[2mBackups will be created as .bak files.\033[0m",
        "\033[2mRun 'enc-doctor scan' first if you have not already.\033[0m\n",
    )
    for line in banner:
        print(line)

    if not os.path.isfile(path):
        reports = scan_directory(path)
    else:
        # scan_file may return a falsy value; treat that as "nothing to fix".
        single = scan_file(path)
        reports = [single] if single else []

    outcomes = fix_directory(reports, dry_run=dry_run)
    print_fix_report(outcomes, dry_run=dry_run)
66
+
67
+
68
def cmd_verify(path: str):
    """Verify a file or directory, print the report, and exit.

    ``require_license("verify")`` is called first; presumably it aborts when
    no license is active -- confirm in license.py.

    Exits with status 0 when every result has a truthy ``ok``, else 1.
    """
    from .license import require_license
    require_license("verify")

    from .verifier import verify_directory, verify_file
    from .reporter import print_verify_report

    # A single file is wrapped in a list so both paths share the reporting code.
    results = [verify_file(path)] if os.path.isfile(path) else verify_directory(path)

    print_verify_report(results)

    exit_code = 0 if all(item.ok for item in results) else 1
    sys.exit(exit_code)
84
+
85
+
86
def cmd_restore(path: str):
    """Restore *path* from its ``<path>.bak`` backup.

    ``require_license("restore")`` is called first; presumably it aborts when
    no license is active -- confirm in license.py.

    Exits with status 1 when the backup file is missing or when
    ``restore_backup`` returns a falsy value.
    """
    from .license import require_license
    require_license("restore")

    from .verifier import restore_backup

    backup_path = path + ".bak"
    if not os.path.exists(backup_path):
        print(f"ERROR: no backup found at {backup_path}")
        sys.exit(1)

    if not restore_backup(path):
        print("ERROR: restore failed")
        sys.exit(1)

    print(f"\033[92mRestored: {path}\033[0m")
    print(f"\033[2mFrom: {backup_path}\033[0m")
102
+
103
+
104
def cmd_activate(key: str):
    """Activate a license key and print the result.

    ``activate`` returns an ``(ok, message)`` pair. On failure the message is
    printed in red and the process exits with status 1; on success the
    message and the list of unlocked commands are printed.
    """
    from .license import activate

    succeeded, message = activate(key)
    if not succeeded:
        print(f"\n \033[91m✗ {message}\033[0m\n")
        sys.exit(1)

    print(f"\n \033[92m✓ {message}\033[0m")
    print()
    print(" \033[2mLicense saved to ~/.encoding-doctor/license.json\033[0m")
    print()
    print(" Unlocked commands:")
    for line in (
        " scan \033[92m✓ free\033[0m",
        " fix \033[92m✓ licensed\033[0m",
        " verify \033[92m✓ licensed\033[0m",
        " restore \033[92m✓ licensed\033[0m",
    ):
        print(line)
    print()
121
+
122
+
123
def cmd_deactivate():
    """Deactivate the current license and print the result.

    ``deactivate`` returns an ``(ok, message)`` pair. The message is printed
    in green on success; on failure it is printed in red and the process
    exits with status 1.
    """
    from .license import deactivate

    succeeded, message = deactivate()
    if succeeded:
        print(f"\n \033[92m✓ {message}\033[0m\n")
        return

    print(f"\n \033[91m✗ {message}\033[0m\n")
    sys.exit(1)
131
+
132
+
133
def cmd_license_status():
    """Print the current license status.

    Reads the mapping returned by ``status()``. When ``info["active"]`` is
    truthy, the optional ``customer``, ``key_preview`` and ``activated_at``
    entries are printed if present (only the first 10 characters of
    ``activated_at`` are shown). Otherwise the free/paid command split and
    the activation instructions are printed.
    """
    from .license import status

    info = status()
    print()
    if not info["active"]:
        print(" \033[93mLicense: NOT ACTIVATED\033[0m")
        print()
        print(" Free: scan")
        print(" Paid: fix, verify, restore")
        print()
        print(" Get license → https://stateflow.dev/encoding-doctor")
        print(" Then run: enc-doctor activate <YOUR-KEY>")
    else:
        print(" \033[92mLicense: ACTIVE\033[0m")
        customer = info.get("customer")
        if customer:
            print(f" Customer : {customer}")
        key_preview = info.get("key_preview")
        if key_preview:
            print(f" Key : {key_preview}")
        activated_at = info.get("activated_at")
        if activated_at:
            print(f" Since : {activated_at[:10]}")
        print()
        print(" All commands unlocked.")
    print()
156
+
157
+
158
def main():
    """Entry point for the ``enc-doctor`` command-line tool.

    Reads ``sys.argv``, prints the help text when called without arguments
    or with ``-h``/``--help`` (exit status 0), and otherwise dispatches to
    the matching ``cmd_*`` function. Missing operands and unknown commands
    print an error and exit with status 1.

    The path operand of ``scan`` and ``fix`` is the first argument that is
    not that command's own flag, so the flag may come before or after the
    path (``enc-doctor scan --all ./proj`` works as well as
    ``enc-doctor scan ./proj --all``).
    """
    args = sys.argv[1:]

    if not args or args[0] in ("-h", "--help"):
        print("""
encoding-doctor v0.2.0

Usage:
enc-doctor scan <path> [--all]
enc-doctor fix <path> [--dry-run]
enc-doctor verify <path>
enc-doctor restore <file>
enc-doctor activate <license-key>
enc-doctor deactivate
enc-doctor license

Commands:
scan Detect encoding issues — free forever
fix Fix detected issues — requires license
verify Confirm all files are valid UTF-8 — requires license
restore Restore a file from .bak backup — requires license
activate Activate your license key
deactivate Free up your license seat (e.g. when switching machines)
license Show current license status

Flags:
--all (scan) also show clean files
--dry-run (fix) show what would change without writing files

Get a license → https://stateflow.dev/encoding-doctor
""")
        sys.exit(0)

    command, rest = args[0], args[1:]

    # Only the flag a command actually understands is filtered out, so every
    # other argument (and every other command) is handled exactly as before.
    accepted_flags = {"scan": ("--all",), "fix": ("--dry-run",)}.get(command, ())
    operands = [arg for arg in rest if arg not in accepted_flags]

    if command == "scan":
        if not operands:
            print("ERROR: provide a path. Usage: enc-doctor scan <path>")
            sys.exit(1)
        path = operands[0]
        _check_path(path)
        cmd_scan(path, show_all="--all" in rest)

    elif command == "fix":
        if not operands:
            print("ERROR: provide a path. Usage: enc-doctor fix <path>")
            sys.exit(1)
        path = operands[0]
        _check_path(path)
        cmd_fix(path, dry_run="--dry-run" in rest)

    elif command == "verify":
        if not rest:
            print("ERROR: provide a path. Usage: enc-doctor verify <path>")
            sys.exit(1)
        path = rest[0]
        _check_path(path)
        cmd_verify(path)

    elif command == "restore":
        if not rest:
            print("ERROR: provide a file path. Usage: enc-doctor restore <file>")
            sys.exit(1)
        cmd_restore(rest[0])

    elif command == "activate":
        if not rest:
            print("ERROR: provide a license key. Usage: enc-doctor activate <key>")
            sys.exit(1)
        cmd_activate(rest[0])

    elif command == "deactivate":
        cmd_deactivate()

    elif command == "license":
        cmd_license_status()

    else:
        print(f"ERROR: unknown command '{command}'. Run 'enc-doctor --help'")
        sys.exit(1)
241
+
242
+
243
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,85 @@
1
+ """
2
+ fixer.py
3
+ --------
4
+ Fixes encoding issues detected by scanner.py.
5
+ Always creates .bak backup before modifying any file.
6
+
7
+ Based on real fix operations performed on adaptive_runtime source files.
8
+ """
9
+
10
+ import os
11
+ import shutil
12
+ from .scanner import MOJIBAKE_PATTERNS, BOM, FileReport
13
+
14
+
15
+ def _backup(path: str) -> str:
16
+ bak = path + ".bak"
17
+ shutil.copy2(path, bak)
18
+ return bak
19
+
20
+
21
def fix_file(report: FileReport, dry_run: bool = False) -> dict:
    """Apply every automatic fix to the file described by *report*.

    Fixes run on the raw bytes in this order: strip a leading BOM, replace
    each known mojibake byte sequence, convert CRLF to LF, remove null bytes.

    Args:
        report: Scan report for the file; ``path``, ``fixable`` and
            ``issues`` are read from it.
        dry_run: When true, compute the changes but write nothing and create
            no backup.

    Returns:
        A dict with keys ``path``, ``fixed`` (descriptions of applied fixes),
        ``skipped`` (labels of non-fixable issues, filled only when the report
        is not fixable), ``backup`` (backup path, or ``None``) and ``changes``
        (number of fixes applied, 0 when the bytes did not change).
    """
    outcome = {
        "path": report.path,
        "fixed": [],
        "skipped": [],
        "backup": None,
        "changes": 0,
    }
    applied = outcome["fixed"]

    # Nothing can be repaired automatically: only list what was left alone.
    if not report.fixable:
        outcome["skipped"] = [
            issue.label for issue in report.issues if not issue.fixable
        ]
        return outcome

    with open(report.path, "rb") as handle:
        original = handle.read()
    data = original

    # 1. Leading byte-order mark.
    if data.startswith(BOM):
        data = data[3:]
        applied.append("BOM stripped")

    # 2. Mojibake: swap each known bad byte sequence for its intended bytes.
    for bad_bytes, good_bytes, label in MOJIBAKE_PATTERNS:
        hits = data.count(bad_bytes)
        if hits:
            data = data.replace(bad_bytes, good_bytes)
            applied.append(f"mojibake fixed: {label} ({hits}x)")

    # 3. Windows line endings.
    crlf_count = data.count(b"\r\n")
    if crlf_count:
        data = data.replace(b"\r\n", b"\n")
        applied.append(f"CRLF -> LF ({crlf_count} lines)")

    # 4. Null bytes.
    nul_count = data.count(b"\x00")
    if nul_count:
        data = data.replace(b"\x00", b"")
        applied.append(f"null bytes removed ({nul_count})")

    # Leave the file (and the change counter) untouched when nothing changed.
    if data == original:
        return outcome

    outcome["changes"] = len(applied)

    if dry_run:
        return outcome

    # Back up first so the original bytes survive the in-place rewrite.
    outcome["backup"] = _backup(report.path)
    with open(report.path, "wb") as handle:
        handle.write(data)

    return outcome
72
+
73
+
74
def fix_directory(reports: list, dry_run: bool = False) -> list:
    """Run ``fix_file`` over every report that has issues and is fixable.

    Args:
        reports: Iterable of FileReport objects from the scanner.
        dry_run: Forwarded unchanged to ``fix_file``.

    Returns:
        The ``fix_file`` result dicts, in input order, keeping only those
        that record at least one change or at least one skipped issue.
    """
    outcomes = []
    for file_report in reports:
        if not (file_report.has_issues and file_report.fixable):
            continue
        outcome = fix_file(file_report, dry_run=dry_run)
        if outcome["changes"] > 0 or outcome["skipped"]:
            outcomes.append(outcome)
    return outcomes