file-identifier 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Priyanshi Dwivedi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software...
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: file-identifier
3
+ Version: 0.1.0
4
+ Summary: File Type Identifier using Magic Numbers
5
+ Author-email: Author <author@example.com>
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Dynamic: license-file
@@ -0,0 +1,49 @@
1
+ # 🔍 File Identifier
2
+
3
+ Identify file types using **magic numbers (file signatures)** instead of file extensions.
4
+
5
+ Works even if files have incorrect or fake extensions (e.g., `.pdf.exe`).
6
+
7
+ ---
8
+
9
+ ## 🚀 Installation
10
+
11
+ ```bash
12
+ pip install file-identifier
13
+ ```
14
+
15
+ ---
16
+
17
+ ## 🛠 Usage
18
+
19
+ ```bash
20
+ file-identifier <file_path>
21
+ ```
22
+
23
+ ### Example
24
+
25
+ ```bash
26
+ file-identifier "C:\Users\YourName\Downloads\file.zip"
27
+ ```
28
+
29
+ ---
30
+
31
+ ## 📊 Output
32
+
33
+ * File type
34
+ * Likely extension
35
+ * File size
36
+ * Confidence level
37
+ * Hex header
38
+
39
+ ---
40
+
41
+ ## 🧠 How it works
42
+
43
+ Reads the first bytes of a file and matches them with known **magic signatures**.
44
+
45
+ ---
46
+
47
+ ## 📜 License
48
+
49
+ MIT
@@ -0,0 +1,9 @@
1
+ """
2
+ File Type Identifier using Magic Numbers
3
+ Reads file headers (magic bytes) to identify file types - no extensions needed.
4
+ """
5
+
6
+ from .core import identify_file, identify_many, read_header, print_report
7
+
8
+ __version__ = "0.1.0"
9
+ __all__ = ["identify_file", "identify_many", "read_header", "print_report"]
@@ -0,0 +1,41 @@
1
+ import sys
2
+ import os
3
+ import tempfile
4
+ import argparse
5
+ from .core import identify_file, print_report
6
+
7
def main(argv=None):
    """Command-line entry point for the file identifier.

    With one or more file paths, prints an identification report for each
    path. With no paths, runs a self-demo: sample files with well-known
    magic bytes are written to a temporary directory, identified, and the
    directory is removed again.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour; passing
            a list lets other code and tests drive the CLI directly.
    """
    parser = argparse.ArgumentParser(description="File Type Identifier using Magic Numbers.")
    parser.add_argument("files", nargs="*", help="List of files to identify")
    args = parser.parse_args(argv)

    # If files are provided via CLI arguments
    if args.files:
        for fp in args.files:
            print_report(identify_file(fp))
        return

    # Default behavior with no arguments: Self-demo.
    # The banner uses escapes (magnifying-glass emoji, em dash) so the source
    # stays pure ASCII and cannot be corrupted by a wrong file encoding.
    print("\n\U0001F50D File Type Identifier \u2014 Demo Mode")
    print("Creating sample files with known magic bytes...\n")

    samples = {
        "sample.jpg": b'\xff\xd8\xff\xe0\x00\x10JFIF',
        "sample.png": b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR',
        "sample.pdf": b'%PDF-1.4\n%\xe2\xe3\xcf\xd3',
        "sample.gz": b'\x1f\x8b\x08\x00\x00\x00\x00\x00',
        "sample.exe": b'MZ\x90\x00\x03\x00\x00\x00',
        "sample.zip": b'PK\x03\x04\x14\x00\x00\x00',
        "sample.mp3": b'ID3\x03\x00\x00\x00\x00',
        "sample.txt": b'Hello, world!\nThis is plain text.\n',
        "sample.bin": bytes(range(16)),
    }

    # The temporary directory (and every sample in it) is deleted on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        for name, data in samples.items():
            fp = os.path.join(tmpdir, name)
            with open(fp, 'wb') as f:
                f.write(data)
            print_report(identify_file(fp))
39
+
40
+ if __name__ == "__main__":
41
+ main()
@@ -0,0 +1,144 @@
1
+ """
2
+ File Type Identifier using Magic Numbers
3
+ Reads file headers (magic bytes) to identify file types - no extensions needed.
4
+ """
5
+
6
+ import struct
7
+ from pathlib import Path
8
+
9
# Signature table. Each entry is a 3-tuple:
#   (leading magic bytes, human-readable type name, likely extension(s))
# Order matters for the lookup code in this module, which walks the list
# from the top and stops at the first entry whose bytes prefix the header.
MAGIC_SIGNATURES = [
    # ---- Images ----
    (b"\xff\xd8\xff", "JPEG Image", ".jpg"),
    (b"\x89PNG\r\n\x1a\n", "PNG Image", ".png"),
    (b"GIF87a", "GIF Image (87a)", ".gif"),
    (b"GIF89a", "GIF Image (89a)", ".gif"),
    (b"BM", "BMP Image", ".bmp"),
    (b"RIFF", "RIFF (WAV/AVI/WebP)", ".riff"),
    (b"\x00\x00\x01\x00", "ICO Icon", ".ico"),
    (b"\x49\x49\x2a\x00", "TIFF (little-endian)", ".tif"),
    (b"\x4d\x4d\x00\x2a", "TIFF (big-endian)", ".tif"),

    # ---- Documents ----
    (b"%PDF", "PDF Document", ".pdf"),
    (b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", "MS Office (old)", ".doc/.xls/.ppt"),
    (b"PK\x03\x04", "ZIP / Office Open XML", ".zip/.docx/.xlsx"),

    # ---- Audio / Video ----
    (b"ID3", "MP3 Audio (ID3 tag)", ".mp3"),
    (b"\xff\xfb", "MP3 Audio", ".mp3"),
    (b"\xff\xf3", "MP3 Audio", ".mp3"),
    (b"fLaC", "FLAC Audio", ".flac"),
    (b"OggS", "OGG Container", ".ogg"),
    (b"\x1aE\xdf\xa3", "Matroska/WebM", ".mkv/.webm"),
    (b"\x00\x00\x00\x18ftypmp4", "MP4 Video", ".mp4"),
    (b"\x00\x00\x00\x20ftyp", "MP4 Video", ".mp4"),

    # ---- Archives ----
    (b"\x1f\x8b", "GZIP Archive", ".gz"),
    (b"BZh", "BZIP2 Archive", ".bz2"),
    (b"\xfd7zXZ\x00", "XZ Archive", ".xz"),
    (b"7z\xbc\xaf\x27\x1c", "7-Zip Archive", ".7z"),
    (b"Rar!\x1a\x07\x00", "RAR Archive (v4)", ".rar"),
    (b"Rar!\x1a\x07\x01\x00", "RAR Archive (v5)", ".rar"),

    # ---- Executables / Binary ----
    (b"MZ", "Windows Executable", ".exe/.dll"),
    (b"\x7fELF", "ELF Executable (Linux)", ".elf"),
    (b"\xca\xfe\xba\xbe", "Java Class / Mach-O", ".class"),
    (b"\xfe\xed\xfa\xce", "Mach-O 32-bit", ".macho"),
    (b"\xfe\xed\xfa\xcf", "Mach-O 64-bit", ".macho"),

    # ---- Text / Code (heuristic, checked last) ----
    (b"#!/", "Shell Script", ".sh"),
    (b"<?xml", "XML Document", ".xml"),
    (b"<?php", "PHP Script", ".php"),
    (b"<html", "HTML Document", ".html"),
    (b"<HTML", "HTML Document", ".html"),
    (b"{\n", "JSON (likely)", ".json"),
    (b"{\r\n", "JSON (likely)", ".json"),
]

# Number of leading bytes read from each file for signature matching.
MAX_HEADER = 32
62
+
63
+
64
def read_header(filepath: str, size: "int | None" = None) -> bytes:
    """Read the leading bytes of a file.

    Args:
        filepath: Path of the file to read (opened in binary mode).
        size: Maximum number of bytes to read. ``None`` (the default) reads
            MAX_HEADER bytes, which is the original behaviour.

    Returns:
        Up to ``size`` bytes from the start of the file; fewer if the file
        is shorter.

    Raises:
        ValueError: if ``size`` is negative.
        OSError: if the file cannot be opened or read.
    """
    limit = MAX_HEADER if size is None else size
    if limit < 0:
        # A negative count would make read() return the whole file, which is
        # never what a "header" read should do.
        raise ValueError("size must be non-negative")
    with open(filepath, 'rb') as f:
        return f.read(limit)
68
+
69
+
70
def _is_utf8_prefix(data: bytes, truncated: bool) -> bool:
    """Return True if *data* is valid UTF-8, allowing a cut-off last character when *truncated*."""
    import codecs

    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        # With final=False an incomplete trailing multi-byte sequence is
        # buffered instead of rejected; genuinely invalid bytes still raise.
        decoder.decode(data, final=not truncated)
    except UnicodeDecodeError:
        return False
    return True


def identify_file(filepath: str) -> dict:
    """
    Identify a file's type by reading its magic bytes.

    The leading bytes of the file are compared against each entry of
    MAGIC_SIGNATURES in order and the first matching signature wins. When
    nothing matches, the header is reported as plain text if it is NUL-free
    valid UTF-8 and as unknown binary otherwise.

    Returns a dict with:
        path        - original filepath
        size_bytes  - size reported by the filesystem (0 if the path is missing)
        file_type   - human-readable type name
        extension   - likely extension(s)
        hex_header  - hex dump of the first bytes
        matched_sig - the raw signature that matched (hex), "" if none
        confidence  - 'high', 'medium' or 'low'

    Paths that do not exist, are not regular files, or are empty are
    reported through ``file_type`` ("File not found", "Not a regular file",
    "Empty file") without reading anything.

    Raises:
        OSError: if the file exists but cannot be opened or read.
    """
    path = Path(filepath)
    result = {
        "path": str(path),
        "size_bytes": 0,
        "file_type": "Unknown",
        "extension": "?",
        "hex_header": "",
        "matched_sig": "",
        "confidence": "low",
    }

    if not path.exists():
        result["file_type"] = "File not found"
        return result

    size = path.stat().st_size
    result["size_bytes"] = size

    if not path.is_file():
        # Directories, FIFOs, sockets and devices have no meaningful header:
        # opening a directory raises and reading a FIFO can block forever.
        result["file_type"] = "Not a regular file"
        return result

    if size == 0:
        result["file_type"] = "Empty file"
        return result

    header = read_header(filepath)
    result["hex_header"] = " ".join(f"{b:02X}" for b in header)

    for sig, name, ext in MAGIC_SIGNATURES:
        if header.startswith(sig):
            result["file_type"] = name
            result["extension"] = ext
            result["matched_sig"] = sig.hex(" ")
            # Short signatures collide with ordinary data more easily.
            result["confidence"] = "high" if len(sig) >= 4 else "medium"
            return result

    # Fallback: try to detect plain text (reject if null bytes present).
    # Only a prefix of the file was read when it is longer than the header,
    # so a multi-byte character split at the cut must not count as invalid.
    truncated = size > len(header)
    if b"\x00" not in header and _is_utf8_prefix(header, truncated):
        result["file_type"] = "Plain Text (UTF-8)"
        result["extension"] = ".txt"
        result["confidence"] = "medium"
    else:
        result["file_type"] = "Unknown Binary"
        result["extension"] = ".bin"
        result["confidence"] = "low"

    return result
126
+
127
+
128
def identify_many(filepaths: "list[str]") -> "list[dict]":
    """Identify multiple files at once.

    Args:
        filepaths: Paths to inspect; any iterable of path strings works.

    Returns:
        One result dict per path, in input order, each produced by
        identify_file.
    """
    # The annotations are quoted on purpose: subscripting the builtin
    # ``list`` only works at runtime from Python 3.9, and this package
    # declares support for 3.8, where an unquoted ``list[str]`` raises
    # TypeError as soon as the module is imported.
    return [identify_file(fp) for fp in filepaths]
131
+
132
+
133
def print_report(result: dict) -> None:
    """Write one identification result to stdout as a framed block.

    Args:
        result: A result dict providing the keys ``path``, ``file_type``,
            ``extension``, ``size_bytes``, ``confidence``, ``hex_header``
            and ``matched_sig``.
    """
    rule = "=" * 55

    rows = [
        ("File", result['path']),
        ("Type", result['file_type']),
        ("Ext", result['extension']),
        ("Size", f"{result['size_bytes']:,} bytes"),
        ("Confidence", result['confidence']),
        ("Header", result['hex_header']),
    ]
    # The matched signature is only shown when a signature actually matched.
    if result['matched_sig']:
        rows.append(("Matched", result['matched_sig']))

    print(f"\n{rule}")
    for label, value in rows:
        print(f" {label} : {value}")
    print(rule)
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: file-identifier
3
+ Version: 0.1.0
4
+ Summary: File Type Identifier using Magic Numbers
5
+ Author-email: Author <author@example.com>
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Dynamic: license-file
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README
3
+ pyproject.toml
4
+ file_identifier/__init__.py
5
+ file_identifier/cli.py
6
+ file_identifier/core.py
7
+ file_identifier.egg-info/PKG-INFO
8
+ file_identifier.egg-info/SOURCES.txt
9
+ file_identifier.egg-info/dependency_links.txt
10
+ file_identifier.egg-info/entry_points.txt
11
+ file_identifier.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ file-identifier = file_identifier.cli:main
@@ -0,0 +1 @@
1
+ file_identifier
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "file-identifier"
7
+ version = "0.1.0"
8
+ description = "File Type Identifier using Magic Numbers"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+
12
+ authors = [
13
+ { name="Author", email="author@example.com" }
14
+ ]
15
+
16
+ classifiers = [
17
+ "Programming Language :: Python :: 3",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: OS Independent",
20
+ ]
21
+
22
+ [project.scripts]
23
+ file-identifier = "file_identifier.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+