binaryornot 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ """BinaryOrNot: ultra-lightweight pure Python package to check if a file is binary or text."""
@@ -0,0 +1,4 @@
1
+ from .check import main
2
+
3
# Invoke the CLI entry point only when this module is executed as a script,
# not when it is merely imported.
if __name__ == "__main__":
    main()
binaryornot/check.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ binaryornot.check
3
+ -----------------
4
+
5
+ Main code for checking if a file is binary or text.
6
+ """
7
+
8
+ import argparse
9
+ import logging
10
+
11
+ from binaryornot.helpers import get_starting_chunk, is_binary_string
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def is_binary(filename):
    """
    Decide whether a file holds binary data by inspecting its leading bytes.

    No extension-based shortcut is applied; the verdict comes solely from
    the starting chunk read from the file.

    :param filename: File to check.
    :returns: True if it's a binary file, otherwise False.
    """
    logger.debug("is_binary: %(filename)r", {"filename": filename})

    # Classify the first chunk of the file's raw bytes.
    starting_bytes = get_starting_chunk(filename)
    verdict = is_binary_string(starting_bytes)
    return verdict
32
+
33
+
34
def main():
    """Command-line entry point.

    Parses a single positional ``filename`` argument from the command line,
    runs :func:`is_binary` on it and prints the resulting boolean.
    """
    cli = argparse.ArgumentParser(description="Check if a file passed as argument is binary or not")
    cli.add_argument(
        "filename",
        help="File name to check for. If the file is not in the same folder, include full path",
    )
    parsed = cli.parse_args()

    verdict = is_binary(filename=parsed.filename)
    print(verdict)
44
+
45
+
46
# Allow running this module directly as a script: dispatch to the CLI.
if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,50 @@
1
+ format,family,status,magic_hex,test_file,gap_reason,source
2
+ png,image,covered,89504e470d0a1a0a,tests/files/logo.png,,ISO/IEC 15948 s5.2
3
+ jpeg_jfif,image,covered,ffd8ffe0,,,ITU-T T.81 + JFIF APP0 marker
4
+ jpeg_exif,image,covered,ffd8ffe1,,,ITU-T T.81 + Exif APP1 marker
5
+ gif87a,image,covered,474946383761,tests/files/lena.gif,,GIF87a spec header
6
+ gif89a,image,covered,474946383961,tests/isBinaryFile/trunks.gif,,GIF89a spec header
7
+ bmp,image,covered,424d,tests/files/rgb-3c-8b.bmp,,Microsoft BMP format
8
+ tiff_be,image,covered,4d4d002a,tests/files/palette-1c-8b.tiff,,TIFF 6.0 spec s2 (big-endian)
9
+ tiff_le,image,covered,49492a00,,,TIFF 6.0 spec s2 (little-endian)
10
+ ico,image,covered,00000100,,,Microsoft ICO format
11
+ pdf,document,covered,255044462d312e,tests/isBinaryFile/pdf.pdf,,ISO 32000 s7.5.2
12
+ sqlite,database,covered,53514c69746520666f726d6174203300,tests/isBinaryFile/test.sqlite,,SQLite file format s1.2.1
13
+ zip,archive,covered,504b0304,,,APPNOTE.TXT (PKWARE) s4.3.7
14
+ gzip,archive,covered,1f8b08,,,RFC 1952 s2.3
15
+ xz,archive,covered,fd377a585a00,,,XZ file format spec s2.1.1
16
+ elf,executable,covered,7f454c46,,,System V ABI s4 (ELF header)
17
+ macho_32be,executable,covered,feedface,,,Apple Mach-O format reference
18
+ macho_32le,executable,covered,cefaedfe,,,Apple Mach-O format reference
19
+ macho_64be,executable,covered,feedfacf,,,Apple Mach-O format reference
20
+ macho_64le,executable,covered,cffaedfe,tests/isBinaryFile/grep,,Apple Mach-O format reference
21
+ mz,executable,covered,4d5a,,,Microsoft PE/COFF spec (DOS header)
22
+ java_class,executable,covered,cafebabe,,,JVM spec s4.1
23
+ riff,media,covered,52494646,,,Microsoft RIFF spec
24
+ ogg,media,covered,4f676753,,,Ogg bitstream format spec
25
+ flac,media,covered,664c6143,,,FLAC format spec
26
+ wasm,executable,covered,0061736d,,,WebAssembly spec s5.5.1
27
+ woff,font,covered,774f4646,tests/files/glyphiconshalflings-regular.woff,,W3C WOFF spec
28
+ otf,font,covered,4f54544f,tests/files/glyphiconshalflings-regular.otf,,OpenType spec (CFF-based)
29
+ ttf,font,covered,0001000000,tests/files/glyphiconshalflings-regular.ttf,,OpenType/TrueType spec v1.0
30
+ eot,font,covered,,tests/files/glyphiconshalflings-regular.eot,No universal magic; starts with file size,Microsoft EOT spec
31
+ pyc,compiled,covered,,tests/files/hello_world.pyc,Magic varies by Python version,Python importlib source
32
+ ds_store,metadata,covered,0000000142756431,tests/files/.DS_Store,,Apple .DS_Store reverse-engineered format
33
+ raw_rgb,image,covered,,tests/files/pixelstream.rgb,No magic; pure pixel data,
34
+ woff2,font,covered,774f4632,tests/files/test.woff2,,W3C WOFF2 spec
35
+ webp,image,covered,524946460000000057454250,tests/files/logo.webp,,Google WebP container spec (RIFF+WEBP)
36
+ mp4,media,covered,0000001866747970,tests/files/test.mp4,,ISO 14496-12 s4.3 (ftyp box)
37
+ mp3_id3,media,covered,494433,,,ID3v2 spec s3.1
38
+ bzip2,archive,covered,425a68,tests/files/test.bz2,,bzip2 file format
39
+ 7z,archive,covered,377abcaf271c,tests/files/test.7z,,7-Zip format spec
40
+ ole2,document,covered,d0cf11e0a1b11ae1,tests/files/test.doc,,MS-CFB spec s2
41
+ zstd,archive,covered,28b52ffd,tests/files/test.zst,,RFC 8878 s3.1.1
42
+ rar,archive,covered,526172211a07,tests/files/test.rar,,RAR5 tech note
43
+ matroska,media,covered,1a45dfa3,tests/files/test.webm,,EBML/Matroska spec
44
+ midi,media,covered,4d546864,tests/files/test.mid,,MIDI 1.0 spec
45
+ psd,image,covered,38425053,tests/files/test.psd,,Adobe PSD spec
46
+ heif,image,covered,0000001c6674797068656963,tests/files/logo.heic,,ISO 23008-12 (ftyp heic)
47
+ parquet,data,covered,50415231,tests/files/test.parquet,,Apache Parquet format spec
48
+ dex,executable,covered,6465780a,tests/files/test.dex,,Dalvik executable format
49
+ llvm_bc,compiled,covered,4243c0de,tests/files/test.bc,,LLVM bitcode wrapper format
50
+ git_pack,data,covered,5041434b,tests/files/test.pack,,Git pack format spec
@@ -0,0 +1,42 @@
1
+ encoding,family,status,min_bytes,sample_text,gap_reason
2
+ utf-8,unicode,covered,1,"Héllo wörld, café, naïve, résumé.",
3
+ utf-16,unicode_bom,covered,2,"The quick brown fox jumps over the lazy dog.",
4
+ utf-16-le,unicode,covered,8,"Pack my box with five dozen liquor jugs.",
5
+ utf-16-be,unicode,covered,8,"How vexingly quick daft zebras jump.",
6
+ utf-32,unicode_bom,covered,4,"Sphinx of black quartz, judge my vow.",
7
+ utf-32-le,unicode,covered,16,"The five boxing wizards jump quickly.",
8
+ utf-32-be,unicode,covered,16,"Jackdaws love my big sphinx of quartz.",
9
+ utf-7,unicode,covered,1,"Hello, world! This is a test of detection.",
10
+ ascii,single_byte,covered,1,"The quick brown fox jumps over the lazy dog.",
11
+ iso-8859-1,single_byte,covered,4,"Héllo wörld, café, naïve, résumé, Straße.",
12
+ iso-8859-2,single_byte,covered,4,"Příliš žluťoučký kůň úpěl ďábelské ódy.",
13
+ iso-8859-5,single_byte,covered,4,"Съешь же ещё этих мягких французских булок.",
14
+ iso-8859-6,single_byte,covered,4,"هذا نص تجريبي باللغة العربية.",
15
+ iso-8859-7,single_byte,covered,4,"Αυτό είναι ένα δοκιμαστικό κείμενο.",
16
+ iso-8859-15,single_byte,covered,4,"L'œuvre du Père Noël coûte cher en décembre.",
17
+ windows-1250,single_byte,covered,4,"Stróż pchnął kość w quiz gędźb vel fax myjń.",
18
+ windows-1251,single_byte,covered,4,"Широкая электрификация южных губерний.",
19
+ windows-1252,single_byte,covered,4,"Dès Noël où un zéphyr haï me vêt de glaçons.",
20
+ windows-1253,single_byte,covered,4,"Ξεσκεπάζω την ψυχοφθόρα βδελυγμία.",
21
+ windows-1256,single_byte,covered,4,"نص عربي تجريبي للكشف عن الترميز الصحيح.",
22
+ koi8-r,single_byte,covered,4,"Широкая электрификация южных губерний.",
23
+ cp866,single_byte,covered,4,"В чащах юга жил бы цитрус? Да, но фальшивый.",
24
+ cp850,single_byte,covered,4," Strüdel, naïf, tête-à-tête, über alles.",
25
+ cp437,single_byte,covered,4,"Ärger mit Ölförderung in Übersee.",
26
+ tis-620,single_byte,covered,4,"สวัสดีครับ นี่คือข้อความทดสอบภาษาไทย.",
27
+ cp1258,single_byte,covered,4,"Garçon! Un café crème et une pièce montée.",
28
+ mac-roman,single_byte,covered,4,"Là où le zéphyr souffle, les œufs flottent.",
29
+ mac-cyrillic,single_byte,covered,4,"Эх, чужак! Общий съём цен шляп.",
30
+ gb2312,cjk_legacy,covered,20,"你好世界。中文测试文本。欢迎来到这里。",
31
+ gb18030,cjk_legacy,covered,20,"天地玄黄宇宙洪荒日月盈昃辰宿列张。",
32
+ gbk,cjk_legacy,covered,20,"春眠不觉晓处处闻啼鸟夜来风雨声花落知多少。",
33
+ big5,cjk_legacy,covered,20,"你好世界。歡迎來到這裡。請檢查結果。",
34
+ shift_jis,cjk_legacy,covered,20,"こんにちは世界。日本語のテストです。",
35
+ euc-jp,cjk_legacy,covered,20,"吾輩は猫である。名前はまだ無い。",
36
+ euc-kr,cjk_legacy,covered,20,"안녕하세요 세계에 오신 것을 환영합니다.",
37
+ iso-2022-jp,iso2022,covered,10,"東京特許許可局局長今日急遽許可却下。",
38
+ iso-2022-kr,iso2022,gap,10,"안녕하세요 세계에 오신 것을 환영합니다.",SO/SI control bytes around each word inflate control_ratio to 20%
39
+ hz,cjk_escape,covered,4,"Hello World, this is a test.",
40
+ cp037,ebcdic,gap,1,"Hello World, this is a test.",Completely different byte mapping from ASCII
41
+ cp500,ebcdic,gap,1,"Hello World, this is a test.",Completely different byte mapping from ASCII
42
+ ebcdic-cp-us,ebcdic,gap,1,"Hello World, this is a test.",Completely different byte mapping from ASCII
binaryornot/helpers.py ADDED
@@ -0,0 +1,231 @@
1
+ """
2
+ binaryornot.helpers
3
+ -------------------
4
+
5
+ Helper utilities used by BinaryOrNot.
6
+ """
7
+
8
+ import logging
9
+ import math
10
+
11
+ from binaryornot.tree import is_binary as _is_binary_by_features
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def print_as_hex(s):
    """
    Print a string as hex bytes.

    Each element is written as lowercase hexadecimal without zero padding,
    joined by colons (for example ``"AB"`` prints ``41:42``).

    :param s: A ``str`` (each character's code point is printed) or a
        bytes-like object such as ``bytes``/``bytearray`` (each byte value
        is printed).
    """
    # Iterating a bytes object yields ints, which ord() rejects with a
    # TypeError; only call ord() on str characters so both kinds work.
    print(":".join(f"{(ord(c) if isinstance(c, str) else c):x}" for c in s))
21
+
22
+
23
def get_starting_chunk(filename, length=128):
    """
    Read the leading bytes of a file.

    :param filename: File to open and get the first little chunk of.
    :param length: Number of bytes to read, default 128.
    :returns: Starting chunk of bytes (shorter than ``length`` when the
        file itself is shorter).
    """
    # Binary mode: the raw bytes are returned without any text decoding.
    with open(filename, "rb") as stream:
        return stream.read(length)
33
+
34
+
35
+ # Bytes considered non-text control characters (excluding \t \n \r)
36
+ _CONTROL_BYTES = frozenset(range(0, 32)) - {9, 10, 13}
37
+
38
+
39
+ def _compute_features(chunk):
40
+ """Compute features for the binary/text decision tree.
41
+
42
+ Feature indices:
43
+ 0: null_ratio - fraction of 0x00 bytes
44
+ 1: control_ratio - fraction of control chars (0x01-0x08, 0x0E-0x1F)
45
+ 2: printable_ascii_ratio - fraction of 0x20-0x7E
46
+ 3: high_byte_ratio - fraction of 0x80-0xFF
47
+ 4: utf8_valid - 1.0 if chunk decodes as UTF-8
48
+ 5: even_null_ratio - fraction of even-index bytes that are 0x00
49
+ 6: odd_null_ratio - fraction of odd-index bytes that are 0x00
50
+ 7: byte_entropy - Shannon entropy of byte distribution
51
+ 8-12: BOM flags - UTF-32 LE/BE, UTF-16 LE/BE, UTF-8 BOM
52
+ 13: try_utf16le - 1.0 if chunk decodes as UTF-16-LE
53
+ 14: try_utf16be - 1.0 if chunk decodes as UTF-16-BE
54
+ 15: try_utf32le - 1.0 if chunk decodes as UTF-32-LE
55
+ 16: try_utf32be - 1.0 if chunk decodes as UTF-32-BE
56
+ 17: longest_printable_run - longest run of printable chars / length
57
+ 18: try_gb2312 - 1.0 if chunk decodes as GB2312
58
+ 19: try_big5 - 1.0 if chunk decodes as Big5
59
+ 20: try_shift_jis - 1.0 if chunk decodes as Shift-JIS
60
+ 21: try_euc_jp - 1.0 if chunk decodes as EUC-JP
61
+ 22: try_euc_kr - 1.0 if chunk decodes as EUC-KR
62
+ """
63
+ n = len(chunk)
64
+
65
+ null_count = chunk.count(0)
66
+ control_count = sum(1 for b in chunk if b in _CONTROL_BYTES)
67
+ printable_count = sum(1 for b in chunk if 0x20 <= b <= 0x7E)
68
+ high_count = sum(1 for b in chunk if b >= 0x80)
69
+
70
+ null_ratio = null_count / n
71
+ control_ratio = control_count / n
72
+ printable_ascii_ratio = printable_count / n
73
+ high_byte_ratio = high_count / n
74
+
75
+ try:
76
+ chunk.decode("utf-8")
77
+ utf8_valid = 1.0
78
+ except (UnicodeDecodeError, ValueError):
79
+ utf8_valid = 0.0
80
+
81
+ even_total = (n + 1) // 2
82
+ odd_total = n // 2
83
+ even_nulls = sum(1 for i in range(0, n, 2) if chunk[i] == 0)
84
+ odd_nulls = sum(1 for i in range(1, n, 2) if chunk[i] == 0)
85
+ even_null_ratio = even_nulls / even_total if even_total else 0
86
+ odd_null_ratio = odd_nulls / odd_total if odd_total else 0
87
+
88
+ hist = [0] * 256
89
+ for b in chunk:
90
+ hist[b] += 1
91
+ entropy = 0.0
92
+ for count in hist:
93
+ if count > 0:
94
+ p = count / n
95
+ entropy -= p * math.log2(p)
96
+
97
+ bom_utf32le = 1.0 if chunk[:4] == b"\xff\xfe\x00\x00" else 0.0
98
+ bom_utf32be = 1.0 if chunk[:4] == b"\x00\x00\xfe\xff" else 0.0
99
+ bom_utf16le = 1.0 if chunk[:2] == b"\xff\xfe" and chunk[:4] != b"\xff\xfe\x00\x00" else 0.0
100
+ bom_utf16be = 1.0 if chunk[:2] == b"\xfe\xff" else 0.0
101
+ bom_utf8 = 1.0 if chunk[:3] == b"\xef\xbb\xbf" else 0.0
102
+
103
+ try_utf16le = 0.0
104
+ try_utf16be = 0.0
105
+ try_utf32le = 0.0
106
+ try_utf32be = 0.0
107
+ if n >= 10:
108
+ try:
109
+ chunk.decode("utf-16-le")
110
+ try_utf16le = 1.0
111
+ except (UnicodeDecodeError, ValueError):
112
+ pass
113
+ try:
114
+ chunk.decode("utf-16-be")
115
+ try_utf16be = 1.0
116
+ except (UnicodeDecodeError, ValueError):
117
+ pass
118
+ if n >= 16:
119
+ try:
120
+ chunk.decode("utf-32-le")
121
+ try_utf32le = 1.0
122
+ except (UnicodeDecodeError, ValueError):
123
+ pass
124
+ try:
125
+ chunk.decode("utf-32-be")
126
+ try_utf32be = 1.0
127
+ except (UnicodeDecodeError, ValueError):
128
+ pass
129
+
130
+ max_run = 0
131
+ current_run = 0
132
+ for b in chunk:
133
+ if 0x20 <= b <= 0x7E or b in (9, 10, 13):
134
+ current_run += 1
135
+ if current_run > max_run:
136
+ max_run = current_run
137
+ else:
138
+ current_run = 0
139
+ longest_printable_run = max_run / n
140
+
141
+ def _try_decode(encoding):
142
+ try:
143
+ chunk.decode(encoding)
144
+ return 1.0
145
+ except (UnicodeDecodeError, ValueError):
146
+ return 0.0
147
+
148
+ try_gb2312 = _try_decode("gb2312") if n >= 10 else 0.0
149
+ try_big5 = _try_decode("big5") if n >= 10 else 0.0
150
+ try_shift_jis = _try_decode("shift_jis") if n >= 10 else 0.0
151
+ try_euc_jp = _try_decode("euc-jp") if n >= 10 else 0.0
152
+ try_euc_kr = _try_decode("euc-kr") if n >= 10 else 0.0
153
+
154
+ return [
155
+ null_ratio,
156
+ control_ratio,
157
+ printable_ascii_ratio,
158
+ high_byte_ratio,
159
+ utf8_valid,
160
+ even_null_ratio,
161
+ odd_null_ratio,
162
+ entropy,
163
+ bom_utf32le,
164
+ bom_utf32be,
165
+ bom_utf16le,
166
+ bom_utf16be,
167
+ bom_utf8,
168
+ try_utf16le,
169
+ try_utf16be,
170
+ try_utf32le,
171
+ try_utf32be,
172
+ longest_printable_run,
173
+ try_gb2312,
174
+ try_big5,
175
+ try_shift_jis,
176
+ try_euc_jp,
177
+ try_euc_kr,
178
+ ]
179
+
180
+
181
def is_binary_string(bytes_to_check):
    """
    Check if a chunk of bytes appears to be binary or text.

    Uses a trained decision tree on byte statistics including entropy,
    character class ratios, encoding validity checks, and BOM detection.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False. An empty
        chunk is always reported as text (False).
    """
    if not bytes_to_check:
        return False

    features = _compute_features(bytes_to_check)
    result = _is_binary_by_features(features)

    # Labelling and formatting all 23 features is only useful for the debug
    # log; skip that work entirely unless DEBUG records will be emitted.
    if logger.isEnabledFor(logging.DEBUG):
        labels = (
            "null",
            "ctrl",
            "ascii",
            "high",
            "utf8",
            "even0",
            "odd0",
            "entropy",
            "bom32le",
            "bom32be",
            "bom16le",
            "bom16be",
            "bom8",
            "try16le",
            "try16be",
            "try32le",
            "try32be",
            "run",
            "gb2312",
            "big5",
            "shiftjis",
            "eucjp",
            "euckr",
        )
        logger.debug(
            "is_binary_string: %r (features=%r)",
            result,
            # strict=True surfaces any drift between labels and features.
            dict(zip(labels, [f"{v:.3f}" for v in features], strict=True)),
        )
    return result
binaryornot/py.typed ADDED
@@ -0,0 +1 @@
1
+ # Marker file for PEP 561
binaryornot/tree.py ADDED
@@ -0,0 +1,230 @@
1
+ """Auto-generated decision tree for binary/text classification.
2
+
3
+ Do not edit by hand. Regenerate with:
4
+ uv run --with 'scikit-learn,numpy,hypothesis' python scripts/train_detector.py
5
+ """
6
+
7
+
8
def is_binary(features):
    """Classify a byte chunk as binary or text.

    Takes the feature list from helpers._compute_features().
    Returns True for binary.

    The body is a fixed cascade of ``features[i] <= threshold`` tests; the
    trailing comment on each test names the feature stored at that index.
    Only indices 0-8, 10, 13-17 and 19-21 are consulted.

    :param features: Indexable sequence of numeric feature values; it must
        provide at least indices 0 through 21.
    :returns: True when the leaf reached is labelled binary, False when it
        is labelled text.
    """
    # NOTE(review): the "(xx.x%, n=k)" leaf annotations presumably record
    # leaf purity and sample weight from the training run - confirm against
    # scripts/train_detector.py. Several sibling leaves return the same
    # value; they are kept because this file is generated, not hand-edited.
    if features[1] <= 0.003906:  # control_ratio
        if features[17] <= 0.003906:  # longest_printable_run
            if features[4] <= 0.500000:  # utf8_valid
                if features[19] <= 0.500000:  # try_big5
                    if features[7] <= 1.953445:  # byte_entropy
                        return True  # binary (100.0%, n=1)
                    else:
                        if features[7] <= 2.749470:  # byte_entropy
                            return False  # text (100.0%, n=1)
                        else:
                            if features[7] <= 3.004886:  # byte_entropy
                                return True  # binary (100.0%, n=1)
                            else:
                                return True  # binary (100.0%, n=1)
                else:
                    if features[21] <= 0.500000:  # try_euc_jp
                        return False  # text (100.0%, n=1)
                    else:
                        return False  # text (100.0%, n=1)
            else:
                if features[7] <= 1.750000:  # byte_entropy
                    return False  # text (100.0%, n=1)
                else:
                    return False  # text (100.0%, n=1)
        else:
            if features[4] <= 0.500000:  # utf8_valid
                if features[17] <= 0.074176:  # longest_printable_run
                    if features[7] <= 5.159404:  # byte_entropy
                        if features[7] <= 2.659421:  # byte_entropy
                            if features[7] <= 2.523218:  # byte_entropy
                                return False  # text (100.0%, n=1)
                            else:
                                return True  # binary (100.0%, n=1)
                        else:
                            return False  # text (100.0%, n=1)
                    else:
                        return True  # binary (100.0%, n=1)
                else:
                    if features[7] <= 3.040512:  # byte_entropy
                        return False  # text (100.0%, n=1)
                    else:
                        if features[17] <= 0.354353:  # longest_printable_run
                            if features[2] <= 0.615079:  # printable_ascii_ratio
                                if features[17] <= 0.267943:  # longest_printable_run
                                    if features[7] <= 3.572021:  # byte_entropy
                                        if features[3] <= 0.490000:  # high_byte_ratio
                                            return False  # text (100.0%, n=1)
                                        else:
                                            return True  # binary (66.8%, n=1)
                                    else:
                                        if features[2] <= 0.458042:  # printable_ascii_ratio
                                            return True  # binary (100.0%, n=1)
                                        else:
                                            return True  # binary (79.0%, n=0)
                                else:
                                    if features[3] <= 0.538095:  # high_byte_ratio
                                        return True  # binary (100.0%, n=1)
                                    else:
                                        return False  # text (100.0%, n=1)
                            else:
                                if features[13] <= 0.500000:  # try_utf16le
                                    if features[3] <= 0.348485:  # high_byte_ratio
                                        return True  # binary (100.0%, n=1)
                                    else:
                                        return False  # text (100.0%, n=1)
                                else:
                                    if features[17] <= 0.089844:  # longest_printable_run
                                        return False  # text (100.0%, n=1)
                                    else:
                                        return False  # text (100.0%, n=1)
                        else:
                            return True  # binary (100.0%, n=1)
            else:
                return False  # text (100.0%, n=1)
    else:
        if features[0] <= 0.163978:  # null_ratio
            if features[4] <= 0.500000:  # utf8_valid
                if features[7] <= 3.264621:  # byte_entropy
                    if features[17] <= 0.052849:  # longest_printable_run
                        if features[1] <= 0.322917:  # control_ratio
                            return True  # binary (100.0%, n=1)
                        else:
                            if features[20] <= 0.500000:  # try_shift_jis
                                return False  # text (100.0%, n=1)
                            else:
                                return True  # binary (100.0%, n=1)
                    else:
                        if features[17] <= 0.348485:  # longest_printable_run
                            return False  # text (100.0%, n=1)
                        else:
                            if features[7] <= 2.951575:  # byte_entropy
                                return False  # text (100.0%, n=1)
                            else:
                                if features[3] <= 0.303030:  # high_byte_ratio
                                    return True  # binary (100.0%, n=1)
                                else:
                                    return True  # binary (100.0%, n=1)
                else:
                    if features[1] <= 0.031089:  # control_ratio
                        if features[2] <= 0.455863:  # printable_ascii_ratio
                            return True  # binary (100.0%, n=1)
                        else:
                            if features[17] <= 0.082207:  # longest_printable_run
                                return False  # text (100.0%, n=1)
                            else:
                                if features[14] <= 0.500000:  # try_utf16be
                                    return False  # text (100.0%, n=1)
                                else:
                                    return True  # binary (100.0%, n=1)
                    else:
                        if features[10] <= 0.500000:  # bom_utf16le
                            if features[2] <= 0.455534:  # printable_ascii_ratio
                                if features[3] <= 0.790570:  # high_byte_ratio
                                    if features[17] <= 0.011719:  # longest_printable_run
                                        if features[2] <= 0.140625:  # printable_ascii_ratio
                                            return True  # binary (100.0%, n=1)
                                        else:
                                            return False  # text (100.0%, n=1)
                                    else:
                                        if features[7] <= 3.540884:  # byte_entropy
                                            return True  # binary (95.4%, n=1)
                                        else:
                                            return True  # binary (99.9%, n=1)
                                else:
                                    return False  # text (100.0%, n=1)
                            else:
                                if features[17] <= 0.072300:  # longest_printable_run
                                    if features[1] <= 0.112942:  # control_ratio
                                        if features[7] <= 5.537264:  # byte_entropy
                                            return False  # text (100.0%, n=1)
                                        else:
                                            return True  # binary (100.0%, n=1)
                                    else:
                                        return True  # binary (100.0%, n=1)
                                else:
                                    if features[7] <= 4.198218:  # byte_entropy
                                        if features[17] <= 0.174242:  # longest_printable_run
                                            return True  # binary (53.0%, n=0)
                                        else:
                                            return True  # binary (95.3%, n=1)
                                    else:
                                        if features[17] <= 0.096875:  # longest_printable_run
                                            return True  # binary (89.4%, n=0)
                                        else:
                                            return True  # binary (100.0%, n=1)
                        else:
                            return False  # text (100.0%, n=1)
            else:
                if features[1] <= 0.575000:  # control_ratio
                    return False  # text (100.0%, n=1)
                else:
                    return True  # binary (100.0%, n=1)
        else:
            if features[3] <= 0.098077:  # high_byte_ratio
                if features[8] <= 0.500000:  # bom_utf32le
                    if features[6] <= 0.125000:  # odd_null_ratio
                        return False  # text (100.0%, n=1)
                    else:
                        if features[16] <= 0.500000:  # try_utf32be
                            if features[5] <= 0.341085:  # even_null_ratio
                                return False  # text (100.0%, n=1)
                            else:
                                if features[15] <= 0.500000:  # try_utf32le
                                    if features[14] <= 0.500000:  # try_utf16be
                                        if features[7] <= 0.864787:  # byte_entropy
                                            return False  # text (100.0%, n=1)
                                        else:
                                            return True  # binary (100.0%, n=1)
                                    else:
                                        return True  # binary (100.0%, n=1)
                                else:
                                    if features[1] <= 0.925781:  # control_ratio
                                        return False  # text (100.0%, n=1)
                                    else:
                                        return True  # binary (100.0%, n=1)
                        else:
                            if features[17] <= 0.003906:  # longest_printable_run
                                return True  # binary (100.0%, n=1)
                            else:
                                return False  # text (100.0%, n=1)
                else:
                    return False  # text (100.0%, n=1)
            else:
                if features[7] <= 5.169243:  # byte_entropy
                    if features[0] <= 0.765625:  # null_ratio
                        if features[2] <= 0.378571:  # printable_ascii_ratio
                            if features[20] <= 0.500000:  # try_shift_jis
                                return False  # text (100.0%, n=1)
                            else:
                                if features[14] <= 0.500000:  # try_utf16be
                                    if features[7] <= 3.188246:  # byte_entropy
                                        return False  # text (100.0%, n=1)
                                    else:
                                        return True  # binary (100.0%, n=1)
                                else:
                                    return False  # text (100.0%, n=1)
                        else:
                            if features[13] <= 0.500000:  # try_utf16le
                                if features[3] <= 0.320000:  # high_byte_ratio
                                    return True  # binary (100.0%, n=1)
                                else:
                                    return False  # text (100.0%, n=1)
                            else:
                                if features[7] <= 2.892092:  # byte_entropy
                                    return False  # text (100.0%, n=1)
                                else:
                                    return False  # text (100.0%, n=1)
                    else:
                        if features[5] <= 0.921875:  # even_null_ratio
                            if features[2] <= 0.027344:  # printable_ascii_ratio
                                return True  # binary (100.0%, n=1)
                            else:
                                return True  # binary (100.0%, n=1)
                        else:
                            return False  # text (100.0%, n=1)
                else:
                    return True  # binary (100.0%, n=1)
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.4
2
+ Name: binaryornot
3
+ Version: 0.5.0
4
+ Summary: Ultra-lightweight pure Python package to check if a file is binary or text.
5
+ Project-URL: bugs, https://github.com/binaryornot/binaryornot/issues
6
+ Project-URL: changelog, https://github.com/binaryornot/binaryornot/releases
7
+ Project-URL: documentation, https://binaryornot.github.io/binaryornot/
8
+ Project-URL: homepage, https://github.com/binaryornot/binaryornot
9
+ Author-email: Audrey Roy Greenfeld <aroy@alum.mit.edu>
10
+ Maintainer-email: Audrey Roy Greenfeld <aroy@alum.mit.edu>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Natural Language :: English
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.12
23
+ Description-Content-Type: text/markdown
24
+
25
+ # BinaryOrNot
26
+
27
+ Python library and CLI tool to check if a file is binary or text. Zero dependencies.
28
+
29
+ ```python
30
+ from binaryornot.check import is_binary
31
+
32
+ is_binary("image.png") # True
33
+ is_binary("README.md") # False
34
+ is_binary("data.sqlite") # True
35
+ is_binary("report.csv") # False
36
+ ```
37
+
38
+ ```sh
39
+ $ binaryornot image.png
40
+ True
41
+ ```
42
+
43
+ ## Install
44
+
45
+ ```sh
46
+ pip install binaryornot
47
+ ```
48
+
49
+ ## Why not just check for null bytes?
50
+
51
+ That's the first thing everyone tries. It works until it doesn't:
52
+
53
+ - A UTF-16 text file is full of null bytes. Your tool thinks it's binary and corrupts it.
54
+ - A Big5 or GB2312 text file has high-ASCII bytes everywhere. Looks binary by byte ratios alone.
55
+ - A font file (.woff, .eot) is clearly binary but might not have null bytes in the first chunk.
56
+
57
+ BinaryOrNot reads the first 128 bytes and runs them through a trained decision tree that considers byte ratios, Shannon entropy, encoding validity, BOM detection, and more. It handles all the edge cases above correctly, with zero dependencies.
58
+
59
+ Tested against [37 text encodings and 49 binary formats](https://binaryornot.github.io/binaryornot/usage/), verified by parametrized tests driven from coverage CSVs.
60
+
61
+ ## API
62
+
63
+ One function:
64
+
65
+ ```python
66
+ from binaryornot.check import is_binary
67
+
68
+ is_binary(filename) # returns True or False
69
+ ```
70
+
71
+ There's also `is_binary_string()` if you already have bytes:
72
+
73
+ ```python
74
+ from binaryornot.helpers import is_binary_string
75
+
76
+ is_binary_string(b"\x00\x01\x02") # True
77
+ is_binary_string(b"hello world") # False
78
+ ```
79
+
80
+ [Full documentation](https://binaryornot.github.io/binaryornot/) covers the detection algorithm in detail.
81
+
82
+ ## Credits
83
+
84
+ Created by [Audrey Roy Greenfeld](https://audrey.feldroy.com).
@@ -0,0 +1,14 @@
1
+ binaryornot/__init__.py,sha256=OQjHzR0Rewtt-cTpYKKDAA2XUBpLGuUeb9JRak15Nqs,95
2
+ binaryornot/__main__.py,sha256=OLoOkeoAmMnJvnXLTuzot_UdlUPr1665bvRXMtgzIEM,63
3
+ binaryornot/check.py,sha256=JlZ04JwB6_EklLdhup-PilLB2nTlN0PfndGzru8xs70,1147
4
+ binaryornot/helpers.py,sha256=_ejKNcTMiiKCvED7jGSX6njM_vFCvbmqHcAk-hVweOg,7050
5
+ binaryornot/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
6
+ binaryornot/tree.py,sha256=jpA2_UOV1Sc3L-VW-1DYMYLBfgNJg_lvebf8jOwCj8o,13093
7
+ binaryornot/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ binaryornot/data/binary_formats.csv,sha256=TFDAgDhZPa6StowTdcTxZpxqIYuhFuWhW4QJkQRv_JI,3738
9
+ binaryornot/data/encodings.csv,sha256=8A82R9syvD2whLGFy6X7ndrfuSSuNEsrePCl7WxLk0A,3734
10
+ binaryornot-0.5.0.dist-info/METADATA,sha256=BrJwB8wye4JgNo8xNdlPtu8pOOih_xaT4bdu0zzc0qs,2755
11
+ binaryornot-0.5.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
12
+ binaryornot-0.5.0.dist-info/entry_points.txt,sha256=ZcvM7LFMwzNBFbY4-8IJp2t4dyeDLziu2WGYfINyNew,55
13
+ binaryornot-0.5.0.dist-info/licenses/LICENSE,sha256=A_Rr9DwQHmSvZLn4GKyXg2GCkPclKWiBdZyTyS0veWg,1078
14
+ binaryornot-0.5.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ binaryornot = binaryornot.check:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026, Audrey Roy Greenfeld
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.