file-identifier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- file_identifier-0.1.0/LICENSE +6 -0
- file_identifier-0.1.0/PKG-INFO +12 -0
- file_identifier-0.1.0/README +49 -0
- file_identifier-0.1.0/file_identifier/__init__.py +9 -0
- file_identifier-0.1.0/file_identifier/cli.py +41 -0
- file_identifier-0.1.0/file_identifier/core.py +144 -0
- file_identifier-0.1.0/file_identifier.egg-info/PKG-INFO +12 -0
- file_identifier-0.1.0/file_identifier.egg-info/SOURCES.txt +11 -0
- file_identifier-0.1.0/file_identifier.egg-info/dependency_links.txt +1 -0
- file_identifier-0.1.0/file_identifier.egg-info/entry_points.txt +2 -0
- file_identifier-0.1.0/file_identifier.egg-info/top_level.txt +1 -0
- file_identifier-0.1.0/pyproject.toml +23 -0
- file_identifier-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: file-identifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: File Type Identifier using Magic Numbers
|
|
5
|
+
Author-email: Author <author@example.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# File Identifier
|
|
2
|
+
|
|
3
|
+
Identify file types using **magic numbers (file signatures)** instead of file extensions.
|
|
4
|
+
|
|
5
|
+
Works even if files have incorrect or fake extensions (e.g., `.pdf.exe`).
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install file-identifier
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
file-identifier <file_path>
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Example
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
file-identifier "C:\Users\YourName\Downloads\file.zip"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Output
|
|
32
|
+
|
|
33
|
+
* File type
|
|
34
|
+
* Likely extension
|
|
35
|
+
* File size
|
|
36
|
+
* Confidence level
|
|
37
|
+
* Hex header
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## How it works
|
|
42
|
+
|
|
43
|
+
Reads the first bytes of a file and matches them with known **magic signatures**.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## License
|
|
48
|
+
|
|
49
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File Type Identifier using Magic Numbers
|
|
3
|
+
Reads file headers (magic bytes) to identify file types - no extensions needed.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .core import identify_file, identify_many, read_header, print_report
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
__all__ = ["identify_file", "identify_many", "read_header", "print_report"]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
import argparse
|
|
5
|
+
from .core import identify_file, print_report
|
|
6
|
+
|
|
7
|
+
def main():
    """Command-line entry point.

    With one or more file arguments, identify each file and print its report.
    With no arguments, run a self-contained demo: write small sample files
    carrying known magic bytes into a temporary directory and report on each.
    """
    arg_parser = argparse.ArgumentParser(description="File Type Identifier using Magic Numbers.")
    arg_parser.add_argument("files", nargs="*", help="List of files to identify")
    cli_args = arg_parser.parse_args()

    # Explicit file arguments: report on each one and finish.
    if cli_args.files:
        for target in cli_args.files:
            print_report(identify_file(target))
        return

    # No arguments: fall back to the self-demo.
    print("\nš File Type Identifier ā Demo Mode")
    print("Creating sample files with known magic bytes...\n")

    demo_payloads = {
        "sample.jpg": b'\xff\xd8\xff\xe0\x00\x10JFIF',
        "sample.png": b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR',
        "sample.pdf": b'%PDF-1.4\n%\xe2\xe3\xcf\xd3',
        "sample.gz": b'\x1f\x8b\x08\x00\x00\x00\x00\x00',
        "sample.exe": b'MZ\x90\x00\x03\x00\x00\x00',
        "sample.zip": b'PK\x03\x04\x14\x00\x00\x00',
        "sample.mp3": b'ID3\x03\x00\x00\x00\x00',
        "sample.txt": b'Hello, world!\nThis is plain text.\n',
        "sample.bin": bytes(range(16)),
    }

    # The temporary directory (and every sample in it) is removed on exit.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for sample_name, payload in demo_payloads.items():
            sample_path = os.path.join(scratch_dir, sample_name)
            with open(sample_path, 'wb') as sample_file:
                sample_file.write(payload)
            print_report(identify_file(sample_path))
|
|
39
|
+
|
|
40
|
+
if __name__ == "__main__":
|
|
41
|
+
main()
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File Type Identifier using Magic Numbers
|
|
3
|
+
Reads file headers (magic bytes) to identify file types - no extensions needed.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import struct
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Each entry is (leading bytes, human-readable type name, likely extension(s)).
# The per-category lists below are concatenated, in this order, into
# MAGIC_SIGNATURES.

_IMAGE_SIGNATURES = [
    (b'\xff\xd8\xff', "JPEG Image", ".jpg"),
    (b'\x89PNG\r\n\x1a\n', "PNG Image", ".png"),
    (b'GIF87a', "GIF Image (87a)", ".gif"),
    (b'GIF89a', "GIF Image (89a)", ".gif"),
    (b'BM', "BMP Image", ".bmp"),
    (b'RIFF', "RIFF (WAV/AVI/WebP)", ".riff"),
    (b'\x00\x00\x01\x00', "ICO Icon", ".ico"),
    (b'\x49\x49\x2a\x00', "TIFF (little-endian)", ".tif"),
    (b'\x4d\x4d\x00\x2a', "TIFF (big-endian)", ".tif"),
]

_DOCUMENT_SIGNATURES = [
    (b'%PDF', "PDF Document", ".pdf"),
    (b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1', "MS Office (old)", ".doc/.xls/.ppt"),
    (b'PK\x03\x04', "ZIP / Office Open XML", ".zip/.docx/.xlsx"),
]

_MEDIA_SIGNATURES = [
    (b'ID3', "MP3 Audio (ID3 tag)", ".mp3"),
    (b'\xff\xfb', "MP3 Audio", ".mp3"),
    (b'\xff\xf3', "MP3 Audio", ".mp3"),
    (b'fLaC', "FLAC Audio", ".flac"),
    (b'OggS', "OGG Container", ".ogg"),
    (b'\x1aE\xdf\xa3', "Matroska/WebM", ".mkv/.webm"),
    (b'\x00\x00\x00\x18ftypmp4', "MP4 Video", ".mp4"),
    (b'\x00\x00\x00\x20ftyp', "MP4 Video", ".mp4"),
]

_ARCHIVE_SIGNATURES = [
    (b'\x1f\x8b', "GZIP Archive", ".gz"),
    (b'BZh', "BZIP2 Archive", ".bz2"),
    (b'\xfd7zXZ\x00', "XZ Archive", ".xz"),
    (b'7z\xbc\xaf\x27\x1c', "7-Zip Archive", ".7z"),
    (b'Rar!\x1a\x07\x00', "RAR Archive (v4)", ".rar"),
    (b'Rar!\x1a\x07\x01\x00', "RAR Archive (v5)", ".rar"),
]

_EXECUTABLE_SIGNATURES = [
    (b'MZ', "Windows Executable", ".exe/.dll"),
    (b'\x7fELF', "ELF Executable (Linux)", ".elf"),
    (b'\xca\xfe\xba\xbe', "Java Class / Mach-O", ".class"),
    (b'\xfe\xed\xfa\xce', "Mach-O 32-bit", ".macho"),
    (b'\xfe\xed\xfa\xcf', "Mach-O 64-bit", ".macho"),
]

# Text / code markers are heuristic, so they are placed last.
_TEXT_SIGNATURES = [
    (b'#!/', "Shell Script", ".sh"),
    (b'<?xml', "XML Document", ".xml"),
    (b'<?php', "PHP Script", ".php"),
    (b'<html', "HTML Document", ".html"),
    (b'<HTML', "HTML Document", ".html"),
    (b'{\n', "JSON (likely)", ".json"),
    (b'{\r\n', "JSON (likely)", ".json"),
]

MAGIC_SIGNATURES = [
    *_IMAGE_SIGNATURES,
    *_DOCUMENT_SIGNATURES,
    *_MEDIA_SIGNATURES,
    *_ARCHIVE_SIGNATURES,
    *_EXECUTABLE_SIGNATURES,
    *_TEXT_SIGNATURES,
]
|
|
60
|
+
|
|
61
|
+
MAX_HEADER = 32 # bytes to read from the file
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def read_header(filepath: str) -> bytes:
    """Return up to MAX_HEADER leading bytes of the file at *filepath*.

    The file is opened in binary mode. Fewer than MAX_HEADER bytes come back
    when the file is shorter than that. Errors raised by ``open`` propagate
    to the caller.
    """
    with open(filepath, mode='rb') as stream:
        leading_bytes = stream.read(MAX_HEADER)
    return leading_bytes
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def identify_file(filepath: str) -> dict:
    """
    Identify a file's type by reading its magic bytes.

    The header returned by read_header() is compared against
    MAGIC_SIGNATURES in order; the first signature that prefixes the header
    wins. When nothing matches, the header is classified as UTF-8 text or as
    unknown binary.

    Missing paths, non-regular files (directories, FIFOs, ...) and empty
    files are reported through ``file_type`` rather than by raising. Other
    I/O errors (for example a permission error while opening the file)
    still propagate.

    Returns a dict with:
        path        - original filepath
        size_bytes  - size reported by stat(), 0 when the path does not exist
        hex_header  - hex dump of the first bytes ("" when nothing was read)
        file_type   - human-readable type name
        extension   - likely extension(s), "?" when undetermined
        matched_sig - the raw signature that matched (hex), "" when none did
        confidence  - 'high', 'medium' or 'low'
    """
    # Local import keeps this block self-contained; only the text fallback
    # needs it.
    import codecs

    path = Path(filepath)
    exists = path.exists()
    size = path.stat().st_size if exists else 0
    result = {
        "path": str(path),
        "size_bytes": size,
        "file_type": "Unknown",
        "extension": "?",
        "hex_header": "",
        "matched_sig": "",
        "confidence": "low",
    }

    if not exists:
        result["file_type"] = "File not found"
        return result

    # Directories would raise in open() and FIFOs could block forever on
    # read, so report them instead of trying to read a header.
    if not path.is_file():
        result["file_type"] = "Not a regular file"
        return result

    if size == 0:
        result["file_type"] = "Empty file"
        return result

    header = read_header(filepath)
    result["hex_header"] = " ".join(f"{b:02X}" for b in header)

    for sig, name, ext in MAGIC_SIGNATURES:
        if header.startswith(sig):
            result["file_type"] = name
            result["extension"] = ext
            result["matched_sig"] = sig.hex(" ")
            # Short signatures (fewer than 4 bytes) collide more easily.
            result["confidence"] = "high" if len(sig) >= 4 else "medium"
            return result

    # Fallback: try to detect plain text (reject if null bytes present).
    # The header is usually a truncated prefix of the file, so a multi-byte
    # UTF-8 character may be cut in half at the end. An incremental decoder
    # with final=False tolerates that dangling tail; when the whole file was
    # read (size <= len(header)) a dangling tail is a genuine error.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoded = decoder.decode(header, final=size <= len(header))
    except UnicodeDecodeError:
        decoded = None

    if decoded is None or "\x00" in decoded:
        result["file_type"] = "Unknown Binary"
        result["extension"] = ".bin"
        result["confidence"] = "low"
    else:
        result["file_type"] = "Plain Text (UTF-8)"
        result["extension"] = ".txt"
        result["confidence"] = "medium"

    return result
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def identify_many(filepaths: "list[str]") -> "list[dict]":
    """Identify multiple files at once.

    Calls identify_file() on each path, in order, and returns the result
    dicts as a list of the same length. An empty input yields an empty list.

    The annotations are written as strings so they are not evaluated when
    the function is defined: subscripting the built-in ``list`` raises
    TypeError on Python 3.8, which this package declares as supported.
    """
    return [identify_file(fp) for fp in filepaths]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def print_report(result: dict) -> None:
    """Pretty-print a single file identification result.

    Writes a block to stdout framed by two 55-character rules: path, type,
    extension, size, confidence and hex header, plus the matched signature
    when ``result['matched_sig']`` is non-empty.
    """
    rule = "=" * 55
    print("\n" + rule)
    for label, key in (("File", "path"), ("Type", "file_type"), ("Ext", "extension")):
        print(f" {label} : {result[key]}")
    # Size is the only field with a thousands separator and a unit suffix.
    print(f" Size : {result['size_bytes']:,} bytes")
    for label, key in (("Confidence", "confidence"), ("Header", "hex_header")):
        print(f" {label} : {result[key]}")
    signature = result['matched_sig']
    if signature:
        print(f" Matched : {signature}")
    print(rule)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: file-identifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: File Type Identifier using Magic Numbers
|
|
5
|
+
Author-email: Author <author@example.com>
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README
|
|
3
|
+
pyproject.toml
|
|
4
|
+
file_identifier/__init__.py
|
|
5
|
+
file_identifier/cli.py
|
|
6
|
+
file_identifier/core.py
|
|
7
|
+
file_identifier.egg-info/PKG-INFO
|
|
8
|
+
file_identifier.egg-info/SOURCES.txt
|
|
9
|
+
file_identifier.egg-info/dependency_links.txt
|
|
10
|
+
file_identifier.egg-info/entry_points.txt
|
|
11
|
+
file_identifier.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
file_identifier
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "file-identifier"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "File Type Identifier using Magic Numbers"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
|
|
12
|
+
authors = [
|
|
13
|
+
{ name="Author", email="author@example.com" }
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
file-identifier = "file_identifier.cli:main"
|