binaryornot 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- binaryornot/__init__.py +1 -0
- binaryornot/__main__.py +4 -0
- binaryornot/check.py +47 -0
- binaryornot/data/__init__.py +0 -0
- binaryornot/data/binary_formats.csv +50 -0
- binaryornot/data/encodings.csv +42 -0
- binaryornot/helpers.py +231 -0
- binaryornot/py.typed +1 -0
- binaryornot/tree.py +230 -0
- binaryornot-0.5.0.dist-info/METADATA +84 -0
- binaryornot-0.5.0.dist-info/RECORD +14 -0
- binaryornot-0.5.0.dist-info/WHEEL +4 -0
- binaryornot-0.5.0.dist-info/entry_points.txt +2 -0
- binaryornot-0.5.0.dist-info/licenses/LICENSE +21 -0
binaryornot/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""BinaryOrNot: ultra-lightweight pure Python package to check if a file is binary or text."""
|
binaryornot/__main__.py
ADDED
binaryornot/check.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
binaryornot.check
|
|
3
|
+
-----------------
|
|
4
|
+
|
|
5
|
+
Main code for checking if a file is binary or text.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from binaryornot.helpers import get_starting_chunk, is_binary_string
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def is_binary(filename):
    """
    Decide whether a file is binary by inspecting its starting chunk.

    :param filename: File to check.
    :returns: True if it's a binary file, otherwise False.
    """
    logger.debug("is_binary: %(filename)r", {"filename": filename})

    # Only the file's leading bytes are examined; the file extension is not
    # consulted.
    return is_binary_string(get_starting_chunk(filename))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def main():
    """Command-line entry point: print whether the named file is binary."""
    cli = argparse.ArgumentParser(description="Check if a file passed as argument is binary or not")
    cli.add_argument(
        "filename",
        help="File name to check for. If the file is not in the same folder, include full path",
    )
    options = cli.parse_args()

    # "filename" is the only argument the parser defines.
    verdict = is_binary(options.filename)
    print(verdict)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
format,family,status,magic_hex,test_file,gap_reason,source
|
|
2
|
+
png,image,covered,89504e470d0a1a0a,tests/files/logo.png,,ISO/IEC 15948 s5.2
|
|
3
|
+
jpeg_jfif,image,covered,ffd8ffe0,,,ITU-T T.81 + JFIF APP0 marker
|
|
4
|
+
jpeg_exif,image,covered,ffd8ffe1,,,ITU-T T.81 + Exif APP1 marker
|
|
5
|
+
gif87a,image,covered,474946383761,tests/files/lena.gif,,GIF87a spec header
|
|
6
|
+
gif89a,image,covered,474946383961,tests/isBinaryFile/trunks.gif,,GIF89a spec header
|
|
7
|
+
bmp,image,covered,424d,tests/files/rgb-3c-8b.bmp,,Microsoft BMP format
|
|
8
|
+
tiff_be,image,covered,4d4d002a,tests/files/palette-1c-8b.tiff,,TIFF 6.0 spec s2 (big-endian)
|
|
9
|
+
tiff_le,image,covered,49492a00,,,TIFF 6.0 spec s2 (little-endian)
|
|
10
|
+
ico,image,covered,00000100,,,Microsoft ICO format
|
|
11
|
+
pdf,document,covered,255044462d312e,tests/isBinaryFile/pdf.pdf,,ISO 32000 s7.5.2
|
|
12
|
+
sqlite,database,covered,53514c69746520666f726d6174203300,tests/isBinaryFile/test.sqlite,,SQLite file format s1.2.1
|
|
13
|
+
zip,archive,covered,504b0304,,,APPNOTE.TXT (PKWARE) s4.3.7
|
|
14
|
+
gzip,archive,covered,1f8b08,,,RFC 1952 s2.3
|
|
15
|
+
xz,archive,covered,fd377a585a00,,,XZ file format spec s2.1.1
|
|
16
|
+
elf,executable,covered,7f454c46,,,System V ABI s4 (ELF header)
|
|
17
|
+
macho_32be,executable,covered,feedface,,,Apple Mach-O format reference
|
|
18
|
+
macho_32le,executable,covered,cefaedfe,,,Apple Mach-O format reference
|
|
19
|
+
macho_64be,executable,covered,feedfacf,,,Apple Mach-O format reference
|
|
20
|
+
macho_64le,executable,covered,cffaedfe,tests/isBinaryFile/grep,,Apple Mach-O format reference
|
|
21
|
+
mz,executable,covered,4d5a,,,Microsoft PE/COFF spec (DOS header)
|
|
22
|
+
java_class,executable,covered,cafebabe,,,JVM spec s4.1
|
|
23
|
+
riff,media,covered,52494646,,,Microsoft RIFF spec
|
|
24
|
+
ogg,media,covered,4f676753,,,Ogg bitstream format spec
|
|
25
|
+
flac,media,covered,664c6143,,,FLAC format spec
|
|
26
|
+
wasm,executable,covered,0061736d,,,WebAssembly spec s5.5.1
|
|
27
|
+
woff,font,covered,774f4646,tests/files/glyphiconshalflings-regular.woff,,W3C WOFF spec
|
|
28
|
+
otf,font,covered,4f54544f,tests/files/glyphiconshalflings-regular.otf,,OpenType spec (CFF-based)
|
|
29
|
+
ttf,font,covered,0001000000,tests/files/glyphiconshalflings-regular.ttf,,OpenType/TrueType spec v1.0
|
|
30
|
+
eot,font,covered,,tests/files/glyphiconshalflings-regular.eot,No universal magic; starts with file size,Microsoft EOT spec
|
|
31
|
+
pyc,compiled,covered,,tests/files/hello_world.pyc,Magic varies by Python version,Python importlib source
|
|
32
|
+
ds_store,metadata,covered,0000000142756431,tests/files/.DS_Store,,Apple .DS_Store reverse-engineered format
|
|
33
|
+
raw_rgb,image,covered,,tests/files/pixelstream.rgb,No magic; pure pixel data,
|
|
34
|
+
woff2,font,covered,774f4632,tests/files/test.woff2,,W3C WOFF2 spec
|
|
35
|
+
webp,image,covered,524946460000000057454250,tests/files/logo.webp,,Google WebP container spec (RIFF+WEBP)
|
|
36
|
+
mp4,media,covered,0000001866747970,tests/files/test.mp4,,ISO 14496-12 s4.3 (ftyp box)
|
|
37
|
+
mp3_id3,media,covered,494433,,,ID3v2 spec s3.1
|
|
38
|
+
bzip2,archive,covered,425a68,tests/files/test.bz2,,bzip2 file format
|
|
39
|
+
7z,archive,covered,377abcaf271c,tests/files/test.7z,,7-Zip format spec
|
|
40
|
+
ole2,document,covered,d0cf11e0a1b11ae1,tests/files/test.doc,,MS-CFB spec s2
|
|
41
|
+
zstd,archive,covered,28b52ffd,tests/files/test.zst,,RFC 8878 s3.1.1
|
|
42
|
+
rar,archive,covered,526172211a07,tests/files/test.rar,,RAR5 tech note
|
|
43
|
+
matroska,media,covered,1a45dfa3,tests/files/test.webm,,EBML/Matroska spec
|
|
44
|
+
midi,media,covered,4d546864,tests/files/test.mid,,MIDI 1.0 spec
|
|
45
|
+
psd,image,covered,38425053,tests/files/test.psd,,Adobe PSD spec
|
|
46
|
+
heif,image,covered,0000001c6674797068656963,tests/files/logo.heic,,ISO 23008-12 (ftyp heic)
|
|
47
|
+
parquet,data,covered,50415231,tests/files/test.parquet,,Apache Parquet format spec
|
|
48
|
+
dex,executable,covered,6465780a,tests/files/test.dex,,Dalvik executable format
|
|
49
|
+
llvm_bc,compiled,covered,4243c0de,tests/files/test.bc,,LLVM bitcode wrapper format
|
|
50
|
+
git_pack,data,covered,5041434b,tests/files/test.pack,,Git pack format spec
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
encoding,family,status,min_bytes,sample_text,gap_reason
|
|
2
|
+
utf-8,unicode,covered,1,"Héllo wörld, café, naïve, résumé.",
|
|
3
|
+
utf-16,unicode_bom,covered,2,"The quick brown fox jumps over the lazy dog.",
|
|
4
|
+
utf-16-le,unicode,covered,8,"Pack my box with five dozen liquor jugs.",
|
|
5
|
+
utf-16-be,unicode,covered,8,"How vexingly quick daft zebras jump.",
|
|
6
|
+
utf-32,unicode_bom,covered,4,"Sphinx of black quartz, judge my vow.",
|
|
7
|
+
utf-32-le,unicode,covered,16,"The five boxing wizards jump quickly.",
|
|
8
|
+
utf-32-be,unicode,covered,16,"Jackdaws love my big sphinx of quartz.",
|
|
9
|
+
utf-7,unicode,covered,1,"Hello, world! This is a test of detection.",
|
|
10
|
+
ascii,single_byte,covered,1,"The quick brown fox jumps over the lazy dog.",
|
|
11
|
+
iso-8859-1,single_byte,covered,4,"Héllo wörld, café, naïve, résumé, Straße.",
|
|
12
|
+
iso-8859-2,single_byte,covered,4,"Příliš žluťoučký kůň úpěl ďábelské ódy.",
|
|
13
|
+
iso-8859-5,single_byte,covered,4,"Съешь же ещё этих мягких французских булок.",
|
|
14
|
+
iso-8859-6,single_byte,covered,4,"هذا نص تجريبي باللغة العربية.",
|
|
15
|
+
iso-8859-7,single_byte,covered,4,"Αυτό είναι ένα δοκιμαστικό κείμενο.",
|
|
16
|
+
iso-8859-15,single_byte,covered,4,"L'œuvre du Père Noël coûte cher en décembre.",
|
|
17
|
+
windows-1250,single_byte,covered,4,"Stróż pchnął kość w quiz gędźb vel fax myjń.",
|
|
18
|
+
windows-1251,single_byte,covered,4,"Широкая электрификация южных губерний.",
|
|
19
|
+
windows-1252,single_byte,covered,4,"Dès Noël où un zéphyr haï me vêt de glaçons.",
|
|
20
|
+
windows-1253,single_byte,covered,4,"Ξεσκεπάζω την ψυχοφθόρα βδελυγμία.",
|
|
21
|
+
windows-1256,single_byte,covered,4,"نص عربي تجريبي للكشف عن الترميز الصحيح.",
|
|
22
|
+
koi8-r,single_byte,covered,4,"Широкая электрификация южных губерний.",
|
|
23
|
+
cp866,single_byte,covered,4,"В чащах юга жил бы цитрус? Да, но фальшивый.",
|
|
24
|
+
cp850,single_byte,covered,4," Strüdel, naïf, tête-à-tête, über alles.",
|
|
25
|
+
cp437,single_byte,covered,4,"Ärger mit Ölförderung in Übersee.",
|
|
26
|
+
tis-620,single_byte,covered,4,"สวัสดีครับ นี่คือข้อความทดสอบภาษาไทย.",
|
|
27
|
+
cp1258,single_byte,covered,4,"Garçon! Un café crème et une pièce montée.",
|
|
28
|
+
mac-roman,single_byte,covered,4,"Là où le zéphyr souffle, les œufs flottent.",
|
|
29
|
+
mac-cyrillic,single_byte,covered,4,"Эх, чужак! Общий съём цен шляп.",
|
|
30
|
+
gb2312,cjk_legacy,covered,20,"你好世界。中文测试文本。欢迎来到这里。",
|
|
31
|
+
gb18030,cjk_legacy,covered,20,"天地玄黄宇宙洪荒日月盈昃辰宿列张。",
|
|
32
|
+
gbk,cjk_legacy,covered,20,"春眠不觉晓处处闻啼鸟夜来风雨声花落知多少。",
|
|
33
|
+
big5,cjk_legacy,covered,20,"你好世界。歡迎來到這裡。請檢查結果。",
|
|
34
|
+
shift_jis,cjk_legacy,covered,20,"こんにちは世界。日本語のテストです。",
|
|
35
|
+
euc-jp,cjk_legacy,covered,20,"吾輩は猫である。名前はまだ無い。",
|
|
36
|
+
euc-kr,cjk_legacy,covered,20,"안녕하세요 세계에 오신 것을 환영합니다.",
|
|
37
|
+
iso-2022-jp,iso2022,covered,10,"東京特許許可局局長今日急遽許可却下。",
|
|
38
|
+
iso-2022-kr,iso2022,gap,10,"안녕하세요 세계에 오신 것을 환영합니다.",SO/SI control bytes around each word inflate control_ratio to 20%
|
|
39
|
+
hz,cjk_escape,covered,4,"Hello World, this is a test.",
|
|
40
|
+
cp037,ebcdic,gap,1,"Hello World, this is a test.",Completely different byte mapping from ASCII
|
|
41
|
+
cp500,ebcdic,gap,1,"Hello World, this is a test.",Completely different byte mapping from ASCII
|
|
42
|
+
ebcdic-cp-us,ebcdic,gap,1,"Hello World, this is a test.",Completely different byte mapping from ASCII
|
binaryornot/helpers.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""
|
|
2
|
+
binaryornot.helpers
|
|
3
|
+
-------------------
|
|
4
|
+
|
|
5
|
+
Helper utilities used by BinaryOrNot.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import math
|
|
10
|
+
|
|
11
|
+
from binaryornot.tree import is_binary as _is_binary_by_features
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def print_as_hex(s):
    """
    Print a string as hex bytes.

    Each element is written as lowercase hexadecimal without zero padding,
    and the elements are joined with colons.

    :param s: A ``str`` (its code points are printed) or a ``bytes`` /
        ``bytearray`` object (its byte values are printed).
    """
    # Iterating bytes yields ints, which ord() rejects with TypeError, so only
    # str characters are passed through ord().
    print(":".join(f"{(c if isinstance(c, int) else ord(c)):x}" for c in s))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_starting_chunk(filename, length=128):
    """
    Read the leading bytes of a file.

    :param filename: File to open and get the first little chunk of.
    :param length: Number of bytes to read, default 128.
    :returns: Starting chunk of bytes.
    """
    # Binary mode: the raw bytes are wanted, with no text decoding applied.
    with open(filename, "rb") as stream:
        return stream.read(length)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Bytes considered non-text control characters (excluding \t \n \r)
|
|
36
|
+
_CONTROL_BYTES = frozenset(range(0, 32)) - {9, 10, 13}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _compute_features(chunk):
|
|
40
|
+
"""Compute features for the binary/text decision tree.
|
|
41
|
+
|
|
42
|
+
Feature indices:
|
|
43
|
+
0: null_ratio - fraction of 0x00 bytes
|
|
44
|
+
1: control_ratio - fraction of control chars (0x01-0x08, 0x0E-0x1F)
|
|
45
|
+
2: printable_ascii_ratio - fraction of 0x20-0x7E
|
|
46
|
+
3: high_byte_ratio - fraction of 0x80-0xFF
|
|
47
|
+
4: utf8_valid - 1.0 if chunk decodes as UTF-8
|
|
48
|
+
5: even_null_ratio - fraction of even-index bytes that are 0x00
|
|
49
|
+
6: odd_null_ratio - fraction of odd-index bytes that are 0x00
|
|
50
|
+
7: byte_entropy - Shannon entropy of byte distribution
|
|
51
|
+
8-12: BOM flags - UTF-32 LE/BE, UTF-16 LE/BE, UTF-8 BOM
|
|
52
|
+
13: try_utf16le - 1.0 if chunk decodes as UTF-16-LE
|
|
53
|
+
14: try_utf16be - 1.0 if chunk decodes as UTF-16-BE
|
|
54
|
+
15: try_utf32le - 1.0 if chunk decodes as UTF-32-LE
|
|
55
|
+
16: try_utf32be - 1.0 if chunk decodes as UTF-32-BE
|
|
56
|
+
17: longest_printable_run - longest run of printable chars / length
|
|
57
|
+
18: try_gb2312 - 1.0 if chunk decodes as GB2312
|
|
58
|
+
19: try_big5 - 1.0 if chunk decodes as Big5
|
|
59
|
+
20: try_shift_jis - 1.0 if chunk decodes as Shift-JIS
|
|
60
|
+
21: try_euc_jp - 1.0 if chunk decodes as EUC-JP
|
|
61
|
+
22: try_euc_kr - 1.0 if chunk decodes as EUC-KR
|
|
62
|
+
"""
|
|
63
|
+
n = len(chunk)
|
|
64
|
+
|
|
65
|
+
null_count = chunk.count(0)
|
|
66
|
+
control_count = sum(1 for b in chunk if b in _CONTROL_BYTES)
|
|
67
|
+
printable_count = sum(1 for b in chunk if 0x20 <= b <= 0x7E)
|
|
68
|
+
high_count = sum(1 for b in chunk if b >= 0x80)
|
|
69
|
+
|
|
70
|
+
null_ratio = null_count / n
|
|
71
|
+
control_ratio = control_count / n
|
|
72
|
+
printable_ascii_ratio = printable_count / n
|
|
73
|
+
high_byte_ratio = high_count / n
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
chunk.decode("utf-8")
|
|
77
|
+
utf8_valid = 1.0
|
|
78
|
+
except (UnicodeDecodeError, ValueError):
|
|
79
|
+
utf8_valid = 0.0
|
|
80
|
+
|
|
81
|
+
even_total = (n + 1) // 2
|
|
82
|
+
odd_total = n // 2
|
|
83
|
+
even_nulls = sum(1 for i in range(0, n, 2) if chunk[i] == 0)
|
|
84
|
+
odd_nulls = sum(1 for i in range(1, n, 2) if chunk[i] == 0)
|
|
85
|
+
even_null_ratio = even_nulls / even_total if even_total else 0
|
|
86
|
+
odd_null_ratio = odd_nulls / odd_total if odd_total else 0
|
|
87
|
+
|
|
88
|
+
hist = [0] * 256
|
|
89
|
+
for b in chunk:
|
|
90
|
+
hist[b] += 1
|
|
91
|
+
entropy = 0.0
|
|
92
|
+
for count in hist:
|
|
93
|
+
if count > 0:
|
|
94
|
+
p = count / n
|
|
95
|
+
entropy -= p * math.log2(p)
|
|
96
|
+
|
|
97
|
+
bom_utf32le = 1.0 if chunk[:4] == b"\xff\xfe\x00\x00" else 0.0
|
|
98
|
+
bom_utf32be = 1.0 if chunk[:4] == b"\x00\x00\xfe\xff" else 0.0
|
|
99
|
+
bom_utf16le = 1.0 if chunk[:2] == b"\xff\xfe" and chunk[:4] != b"\xff\xfe\x00\x00" else 0.0
|
|
100
|
+
bom_utf16be = 1.0 if chunk[:2] == b"\xfe\xff" else 0.0
|
|
101
|
+
bom_utf8 = 1.0 if chunk[:3] == b"\xef\xbb\xbf" else 0.0
|
|
102
|
+
|
|
103
|
+
try_utf16le = 0.0
|
|
104
|
+
try_utf16be = 0.0
|
|
105
|
+
try_utf32le = 0.0
|
|
106
|
+
try_utf32be = 0.0
|
|
107
|
+
if n >= 10:
|
|
108
|
+
try:
|
|
109
|
+
chunk.decode("utf-16-le")
|
|
110
|
+
try_utf16le = 1.0
|
|
111
|
+
except (UnicodeDecodeError, ValueError):
|
|
112
|
+
pass
|
|
113
|
+
try:
|
|
114
|
+
chunk.decode("utf-16-be")
|
|
115
|
+
try_utf16be = 1.0
|
|
116
|
+
except (UnicodeDecodeError, ValueError):
|
|
117
|
+
pass
|
|
118
|
+
if n >= 16:
|
|
119
|
+
try:
|
|
120
|
+
chunk.decode("utf-32-le")
|
|
121
|
+
try_utf32le = 1.0
|
|
122
|
+
except (UnicodeDecodeError, ValueError):
|
|
123
|
+
pass
|
|
124
|
+
try:
|
|
125
|
+
chunk.decode("utf-32-be")
|
|
126
|
+
try_utf32be = 1.0
|
|
127
|
+
except (UnicodeDecodeError, ValueError):
|
|
128
|
+
pass
|
|
129
|
+
|
|
130
|
+
max_run = 0
|
|
131
|
+
current_run = 0
|
|
132
|
+
for b in chunk:
|
|
133
|
+
if 0x20 <= b <= 0x7E or b in (9, 10, 13):
|
|
134
|
+
current_run += 1
|
|
135
|
+
if current_run > max_run:
|
|
136
|
+
max_run = current_run
|
|
137
|
+
else:
|
|
138
|
+
current_run = 0
|
|
139
|
+
longest_printable_run = max_run / n
|
|
140
|
+
|
|
141
|
+
def _try_decode(encoding):
|
|
142
|
+
try:
|
|
143
|
+
chunk.decode(encoding)
|
|
144
|
+
return 1.0
|
|
145
|
+
except (UnicodeDecodeError, ValueError):
|
|
146
|
+
return 0.0
|
|
147
|
+
|
|
148
|
+
try_gb2312 = _try_decode("gb2312") if n >= 10 else 0.0
|
|
149
|
+
try_big5 = _try_decode("big5") if n >= 10 else 0.0
|
|
150
|
+
try_shift_jis = _try_decode("shift_jis") if n >= 10 else 0.0
|
|
151
|
+
try_euc_jp = _try_decode("euc-jp") if n >= 10 else 0.0
|
|
152
|
+
try_euc_kr = _try_decode("euc-kr") if n >= 10 else 0.0
|
|
153
|
+
|
|
154
|
+
return [
|
|
155
|
+
null_ratio,
|
|
156
|
+
control_ratio,
|
|
157
|
+
printable_ascii_ratio,
|
|
158
|
+
high_byte_ratio,
|
|
159
|
+
utf8_valid,
|
|
160
|
+
even_null_ratio,
|
|
161
|
+
odd_null_ratio,
|
|
162
|
+
entropy,
|
|
163
|
+
bom_utf32le,
|
|
164
|
+
bom_utf32be,
|
|
165
|
+
bom_utf16le,
|
|
166
|
+
bom_utf16be,
|
|
167
|
+
bom_utf8,
|
|
168
|
+
try_utf16le,
|
|
169
|
+
try_utf16be,
|
|
170
|
+
try_utf32le,
|
|
171
|
+
try_utf32be,
|
|
172
|
+
longest_printable_run,
|
|
173
|
+
try_gb2312,
|
|
174
|
+
try_big5,
|
|
175
|
+
try_shift_jis,
|
|
176
|
+
try_euc_jp,
|
|
177
|
+
try_euc_kr,
|
|
178
|
+
]
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# Short labels for the feature vector, in feature-index order. They are used
# only to make the debug log line readable.
_FEATURE_LABELS = (
    "null",
    "ctrl",
    "ascii",
    "high",
    "utf8",
    "even0",
    "odd0",
    "entropy",
    "bom32le",
    "bom32be",
    "bom16le",
    "bom16be",
    "bom8",
    "try16le",
    "try16be",
    "try32le",
    "try32be",
    "run",
    "gb2312",
    "big5",
    "shiftjis",
    "eucjp",
    "euckr",
)


def is_binary_string(bytes_to_check):
    """
    Check if a chunk of bytes appears to be binary or text.

    Uses a trained decision tree on byte statistics including entropy,
    character class ratios, encoding validity checks, and BOM detection.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False. An empty (or
        otherwise falsy) chunk is reported as text, i.e. False.
    """
    if not bytes_to_check:
        return False

    features = _compute_features(bytes_to_check)
    result = _is_binary_by_features(features)
    # Building the labelled, formatted feature dict is wasted work unless a
    # handler will actually emit DEBUG records, so check the level first.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "is_binary_string: %r (features=%r)",
            result,
            dict(zip(_FEATURE_LABELS, [f"{v:.3f}" for v in features], strict=True)),
        )
    return result
|
binaryornot/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Marker file for PEP 561
|
binaryornot/tree.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Auto-generated decision tree for binary/text classification.
|
|
2
|
+
|
|
3
|
+
Do not edit by hand. Regenerate with:
|
|
4
|
+
uv run --with 'scikit-learn,numpy,hypothesis' python scripts/train_detector.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_binary(features):
    """Classify a byte chunk as binary or text.

    Walks a fixed tree of threshold comparisons over the feature list and
    returns the label of the leaf that is reached. Thresholds of 0.5 are
    applied to features that helpers._compute_features() emits as 0.0/1.0
    flags.

    Takes the feature list from helpers._compute_features().
    Returns True for binary.
    """
    if features[1] <= 0.003906:  # control_ratio
        if features[17] <= 0.003906:  # longest_printable_run
            if features[4] <= 0.500000:  # utf8_valid
                if features[19] <= 0.500000:  # try_big5
                    if features[7] <= 1.953445:  # byte_entropy
                        return True  # binary (100.0%, n=1)
                    elif features[7] <= 2.749470:  # byte_entropy
                        return False  # text (100.0%, n=1)
                    elif features[7] <= 3.004886:  # byte_entropy
                        return True  # binary (100.0%, n=1)
                    else:
                        return True  # binary (100.0%, n=1)
                elif features[21] <= 0.500000:  # try_euc_jp
                    return False  # text (100.0%, n=1)
                else:
                    return False  # text (100.0%, n=1)
            elif features[7] <= 1.750000:  # byte_entropy
                return False  # text (100.0%, n=1)
            else:
                return False  # text (100.0%, n=1)
        elif features[4] <= 0.500000:  # utf8_valid
            if features[17] <= 0.074176:  # longest_printable_run
                if features[7] <= 5.159404:  # byte_entropy
                    if features[7] <= 2.659421:  # byte_entropy
                        if features[7] <= 2.523218:  # byte_entropy
                            return False  # text (100.0%, n=1)
                        else:
                            return True  # binary (100.0%, n=1)
                    else:
                        return False  # text (100.0%, n=1)
                else:
                    return True  # binary (100.0%, n=1)
            elif features[7] <= 3.040512:  # byte_entropy
                return False  # text (100.0%, n=1)
            elif features[17] <= 0.354353:  # longest_printable_run
                if features[2] <= 0.615079:  # printable_ascii_ratio
                    if features[17] <= 0.267943:  # longest_printable_run
                        if features[7] <= 3.572021:  # byte_entropy
                            if features[3] <= 0.490000:  # high_byte_ratio
                                return False  # text (100.0%, n=1)
                            else:
                                return True  # binary (66.8%, n=1)
                        elif features[2] <= 0.458042:  # printable_ascii_ratio
                            return True  # binary (100.0%, n=1)
                        else:
                            return True  # binary (79.0%, n=0)
                    elif features[3] <= 0.538095:  # high_byte_ratio
                        return True  # binary (100.0%, n=1)
                    else:
                        return False  # text (100.0%, n=1)
                elif features[13] <= 0.500000:  # try_utf16le
                    if features[3] <= 0.348485:  # high_byte_ratio
                        return True  # binary (100.0%, n=1)
                    else:
                        return False  # text (100.0%, n=1)
                elif features[17] <= 0.089844:  # longest_printable_run
                    return False  # text (100.0%, n=1)
                else:
                    return False  # text (100.0%, n=1)
            else:
                return True  # binary (100.0%, n=1)
        else:
            return False  # text (100.0%, n=1)
    elif features[0] <= 0.163978:  # null_ratio
        if features[4] <= 0.500000:  # utf8_valid
            if features[7] <= 3.264621:  # byte_entropy
                if features[17] <= 0.052849:  # longest_printable_run
                    if features[1] <= 0.322917:  # control_ratio
                        return True  # binary (100.0%, n=1)
                    elif features[20] <= 0.500000:  # try_shift_jis
                        return False  # text (100.0%, n=1)
                    else:
                        return True  # binary (100.0%, n=1)
                elif features[17] <= 0.348485:  # longest_printable_run
                    return False  # text (100.0%, n=1)
                elif features[7] <= 2.951575:  # byte_entropy
                    return False  # text (100.0%, n=1)
                elif features[3] <= 0.303030:  # high_byte_ratio
                    return True  # binary (100.0%, n=1)
                else:
                    return True  # binary (100.0%, n=1)
            elif features[1] <= 0.031089:  # control_ratio
                if features[2] <= 0.455863:  # printable_ascii_ratio
                    return True  # binary (100.0%, n=1)
                elif features[17] <= 0.082207:  # longest_printable_run
                    return False  # text (100.0%, n=1)
                elif features[14] <= 0.500000:  # try_utf16be
                    return False  # text (100.0%, n=1)
                else:
                    return True  # binary (100.0%, n=1)
            elif features[10] <= 0.500000:  # bom_utf16le
                if features[2] <= 0.455534:  # printable_ascii_ratio
                    if features[3] <= 0.790570:  # high_byte_ratio
                        if features[17] <= 0.011719:  # longest_printable_run
                            if features[2] <= 0.140625:  # printable_ascii_ratio
                                return True  # binary (100.0%, n=1)
                            else:
                                return False  # text (100.0%, n=1)
                        elif features[7] <= 3.540884:  # byte_entropy
                            return True  # binary (95.4%, n=1)
                        else:
                            return True  # binary (99.9%, n=1)
                    else:
                        return False  # text (100.0%, n=1)
                elif features[17] <= 0.072300:  # longest_printable_run
                    if features[1] <= 0.112942:  # control_ratio
                        if features[7] <= 5.537264:  # byte_entropy
                            return False  # text (100.0%, n=1)
                        else:
                            return True  # binary (100.0%, n=1)
                    else:
                        return True  # binary (100.0%, n=1)
                elif features[7] <= 4.198218:  # byte_entropy
                    if features[17] <= 0.174242:  # longest_printable_run
                        return True  # binary (53.0%, n=0)
                    else:
                        return True  # binary (95.3%, n=1)
                elif features[17] <= 0.096875:  # longest_printable_run
                    return True  # binary (89.4%, n=0)
                else:
                    return True  # binary (100.0%, n=1)
            else:
                return False  # text (100.0%, n=1)
        elif features[1] <= 0.575000:  # control_ratio
            return False  # text (100.0%, n=1)
        else:
            return True  # binary (100.0%, n=1)
    elif features[3] <= 0.098077:  # high_byte_ratio
        if features[8] <= 0.500000:  # bom_utf32le
            if features[6] <= 0.125000:  # odd_null_ratio
                return False  # text (100.0%, n=1)
            elif features[16] <= 0.500000:  # try_utf32be
                if features[5] <= 0.341085:  # even_null_ratio
                    return False  # text (100.0%, n=1)
                elif features[15] <= 0.500000:  # try_utf32le
                    if features[14] <= 0.500000:  # try_utf16be
                        if features[7] <= 0.864787:  # byte_entropy
                            return False  # text (100.0%, n=1)
                        else:
                            return True  # binary (100.0%, n=1)
                    else:
                        return True  # binary (100.0%, n=1)
                elif features[1] <= 0.925781:  # control_ratio
                    return False  # text (100.0%, n=1)
                else:
                    return True  # binary (100.0%, n=1)
            elif features[17] <= 0.003906:  # longest_printable_run
                return True  # binary (100.0%, n=1)
            else:
                return False  # text (100.0%, n=1)
        else:
            return False  # text (100.0%, n=1)
    elif features[7] <= 5.169243:  # byte_entropy
        if features[0] <= 0.765625:  # null_ratio
            if features[2] <= 0.378571:  # printable_ascii_ratio
                if features[20] <= 0.500000:  # try_shift_jis
                    return False  # text (100.0%, n=1)
                elif features[14] <= 0.500000:  # try_utf16be
                    if features[7] <= 3.188246:  # byte_entropy
                        return False  # text (100.0%, n=1)
                    else:
                        return True  # binary (100.0%, n=1)
                else:
                    return False  # text (100.0%, n=1)
            elif features[13] <= 0.500000:  # try_utf16le
                if features[3] <= 0.320000:  # high_byte_ratio
                    return True  # binary (100.0%, n=1)
                else:
                    return False  # text (100.0%, n=1)
            elif features[7] <= 2.892092:  # byte_entropy
                return False  # text (100.0%, n=1)
            else:
                return False  # text (100.0%, n=1)
        elif features[5] <= 0.921875:  # even_null_ratio
            if features[2] <= 0.027344:  # printable_ascii_ratio
                return True  # binary (100.0%, n=1)
            else:
                return True  # binary (100.0%, n=1)
        else:
            return False  # text (100.0%, n=1)
    else:
        return True  # binary (100.0%, n=1)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: binaryornot
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Ultra-lightweight pure Python package to check if a file is binary or text.
|
|
5
|
+
Project-URL: bugs, https://github.com/binaryornot/binaryornot/issues
|
|
6
|
+
Project-URL: changelog, https://github.com/binaryornot/binaryornot/releases
|
|
7
|
+
Project-URL: documentation, https://binaryornot.github.io/binaryornot/
|
|
8
|
+
Project-URL: homepage, https://github.com/binaryornot/binaryornot
|
|
9
|
+
Author-email: Audrey Roy Greenfeld <aroy@alum.mit.edu>
|
|
10
|
+
Maintainer-email: Audrey Roy Greenfeld <aroy@alum.mit.edu>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Natural Language :: English
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.12
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# BinaryOrNot
|
|
26
|
+
|
|
27
|
+
Python library and CLI tool to check if a file is binary or text. Zero dependencies.
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from binaryornot.check import is_binary
|
|
31
|
+
|
|
32
|
+
is_binary("image.png") # True
|
|
33
|
+
is_binary("README.md") # False
|
|
34
|
+
is_binary("data.sqlite") # True
|
|
35
|
+
is_binary("report.csv") # False
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
```sh
|
|
39
|
+
$ binaryornot image.png
|
|
40
|
+
True
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```sh
|
|
46
|
+
pip install binaryornot
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Why not just check for null bytes?
|
|
50
|
+
|
|
51
|
+
That's the first thing everyone tries. It works until it doesn't:
|
|
52
|
+
|
|
53
|
+
- A UTF-16 text file is full of null bytes. Your tool thinks it's binary and corrupts it.
|
|
54
|
+
- A Big5 or GB2312 text file has high-ASCII bytes everywhere. Looks binary by byte ratios alone.
|
|
55
|
+
- A font file (.woff, .eot) is clearly binary but might not have null bytes in the first chunk.
|
|
56
|
+
|
|
57
|
+
BinaryOrNot reads the first 128 bytes and runs them through a trained decision tree that considers byte ratios, Shannon entropy, encoding validity, BOM detection, and more. It handles all the edge cases above correctly, with zero dependencies.
|
|
58
|
+
|
|
59
|
+
Tested against [37 text encodings and 49 binary formats](https://binaryornot.github.io/binaryornot/usage/), verified by parametrized tests driven from coverage CSVs.
|
|
60
|
+
|
|
61
|
+
## API
|
|
62
|
+
|
|
63
|
+
One function:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from binaryornot.check import is_binary
|
|
67
|
+
|
|
68
|
+
is_binary(filename) # returns True or False
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
There's also `is_binary_string()` if you already have bytes:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from binaryornot.helpers import is_binary_string
|
|
75
|
+
|
|
76
|
+
is_binary_string(b"\x00\x01\x02") # True
|
|
77
|
+
is_binary_string(b"hello world") # False
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
[Full documentation](https://binaryornot.github.io/binaryornot/) covers the detection algorithm in detail.
|
|
81
|
+
|
|
82
|
+
## Credits
|
|
83
|
+
|
|
84
|
+
Created by [Audrey Roy Greenfeld](https://audrey.feldroy.com).
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
binaryornot/__init__.py,sha256=OQjHzR0Rewtt-cTpYKKDAA2XUBpLGuUeb9JRak15Nqs,95
|
|
2
|
+
binaryornot/__main__.py,sha256=OLoOkeoAmMnJvnXLTuzot_UdlUPr1665bvRXMtgzIEM,63
|
|
3
|
+
binaryornot/check.py,sha256=JlZ04JwB6_EklLdhup-PilLB2nTlN0PfndGzru8xs70,1147
|
|
4
|
+
binaryornot/helpers.py,sha256=_ejKNcTMiiKCvED7jGSX6njM_vFCvbmqHcAk-hVweOg,7050
|
|
5
|
+
binaryornot/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
|
|
6
|
+
binaryornot/tree.py,sha256=jpA2_UOV1Sc3L-VW-1DYMYLBfgNJg_lvebf8jOwCj8o,13093
|
|
7
|
+
binaryornot/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
binaryornot/data/binary_formats.csv,sha256=TFDAgDhZPa6StowTdcTxZpxqIYuhFuWhW4QJkQRv_JI,3738
|
|
9
|
+
binaryornot/data/encodings.csv,sha256=8A82R9syvD2whLGFy6X7ndrfuSSuNEsrePCl7WxLk0A,3734
|
|
10
|
+
binaryornot-0.5.0.dist-info/METADATA,sha256=BrJwB8wye4JgNo8xNdlPtu8pOOih_xaT4bdu0zzc0qs,2755
|
|
11
|
+
binaryornot-0.5.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
12
|
+
binaryornot-0.5.0.dist-info/entry_points.txt,sha256=ZcvM7LFMwzNBFbY4-8IJp2t4dyeDLziu2WGYfINyNew,55
|
|
13
|
+
binaryornot-0.5.0.dist-info/licenses/LICENSE,sha256=A_Rr9DwQHmSvZLn4GKyXg2GCkPclKWiBdZyTyS0veWg,1078
|
|
14
|
+
binaryornot-0.5.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, Audrey Roy Greenfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|