filedna 1.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
filedna/__init__.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ FileDNA – Discover a file's true identity.
3
+
4
+ FileDNA's core job: tell you what a file REALLY is, whether it's trustworthy,
5
+ and surface every signal about it — without trusting extensions.
6
+
7
+ What makes FileDNA different from content-core / LangChain / etc:
8
+ - content-core extracts TEXT from files (that's its whole job)
9
+ - LangChain chunks that text for RAG pipelines
10
+ - FileDNA answers: what IS this file? is it valid? is it what it claims?
11
+ is it a duplicate? does it contain PII? what are its hashes?
12
+
13
+ These are the things nobody else does as a unified file-identity layer.
14
+
15
+ Core API (no API key, no network):
16
+ analyze(path) → AnalysisResult — full identity report
17
+ validate(path) → AnalysisResult — is it structurally valid?
18
+ detect_type(path) → str — real type from magic bytes
19
+ inspect_file(path) → dict — metadata (pages, dims, etc)
20
+ inspect_url(url) → dict — HEAD request metadata
21
+ estimate_tokens(path) → int — token count estimate
22
+
23
+ File identity utilities (no API key, no network):
24
+ extract_exif(path) → ExifData — GPS, camera, timestamps
25
+ detect_pii(text) → PIIResult — email, phone, card, SSN...
26
+ redact_pii(text) → str — replace PII with [REDACTED]
27
+ content_hash(path) → ContentHash — SHA-256 + MD5
28
+ find_duplicates(paths) → list[DuplicateGroup]
29
+ diff_files(path_a, path_b) → FileDiff
30
+ analyze_many(paths) → BatchResult — concurrent batch analysis
31
+
32
+ AI features (optional, requires API key via AIConfig):
33
+ from filedna.features.ai_features import (
34
+ AIConfig,
35
+ classify_content, — "is this a legal contract or invoice?"
36
+ extract_structured, — pull typed fields from unstructured text
37
+ clean_document, — remove headers/footers/page numbers
38
+ semantic_similarity, — are these two documents saying the same thing?
39
+ )
40
+ """
41
+ from __future__ import annotations
42
+
43
+ from pathlib import Path
44
+ from typing import Any
45
+
46
+ from .models.result import AnalysisResult
47
+
48
# Must match the version of the published distribution (this wheel is 1.2.4);
# the previous literal "1.2.2" was stale and disagreed with `filedna --version`,
# which click resolves from the installed package rather than from this string.
__version__ = "1.2.4"

# Public names re-exported by `from filedna import *`.
__all__ = [
    # Core
    "analyze", "validate", "detect_type", "inspect_file",
    "inspect_url", "estimate_tokens",
    # File identity
    "extract_exif",
    "detect_pii", "redact_pii",
    "content_hash", "find_duplicates",
    "diff_files", "analyze_many",
    # Model
    "AnalysisResult",
]
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Core API
66
+ # ---------------------------------------------------------------------------
67
+
68
def analyze(path: str | Path, *, skip_metadata: bool = False) -> AnalysisResult:
    """
    Produce the full identity report for one file.

    Delegates to the core engine, which detects the real type from magic
    bytes (the extension is never trusted), validates structural integrity,
    extracts metadata and scores risk.

    Pass skip_metadata=True to leave the metadata dict empty.

    The returned AnalysisResult carries: valid, real_type, mime, extension,
    extension_matches, size_human, risk_score, warnings, errors, metadata.
    """
    # Imported lazily so that importing the package stays cheap.
    from .core.engine import analyze_file as _analyze_file

    report = _analyze_file(path, skip_metadata=skip_metadata)
    return report
80
+
81
+
82
def validate(path: str | Path) -> AnalysisResult:
    """
    Check structural integrity only.

    Equivalent to analyze(path, skip_metadata=True), so it is quicker than a
    full analyze() call. Intended for upload validation: inspect
    result.valid and result.errors.
    """
    outcome = analyze(path, skip_metadata=True)
    return outcome
89
+
90
+
91
def detect_type(path: str | Path) -> str:
    """
    Return the file's real type as determined from its magic bytes.

    The extension is never trusted:

    detect_type("photo.pdf") → "png" (extension lied)
    detect_type("data.zip") → "docx" (actually a Word document)
    """
    from .detectors.type_detector import detect as _detect

    target = Path(path)
    # The detector yields (real_type, mime); only the type is wanted here.
    kind, _mime = _detect(target)
    return kind
101
+
102
+
103
def inspect_file(path: str | Path) -> dict[str, Any]:
    """
    Return metadata specific to the file's detected type.

    Typical keys per type:

    PDF → pages, language, contains_tables, estimated_tokens
    DOCX → paragraphs, words, estimated_pages
    XLSX → sheets, rows, columns
    PNG → width, height, mode, dpi, has_transparency
    MP3 → duration, bitrate, sample_rate, channels
    """
    from .detectors.type_detector import detect as _detect
    from .inspectors.metadata import inspect as _inspect

    target = Path(path)
    # Detect first: the inspector is chosen by the real type, not the extension.
    kind, _mime = _detect(target)
    return _inspect(target, kind)
118
+
119
+
120
def inspect_url(url: str, *, timeout: int = 10) -> dict[str, Any]:
    """
    Inspect a URL with an HTTP HEAD request, without downloading the body.

    The returned dict reports: valid, mime, real_type, size_bytes,
    size_human, status_code.

    The page body is NOT fetched. Use content-core for full URL extraction.
    """
    from .core.url_inspector import inspect_url as _head_inspect

    info = _head_inspect(url, timeout=timeout)
    return info
129
+
130
+
131
def estimate_tokens(path: str | Path) -> int:
    """
    Estimate how many LLM tokens the file's text content would consume.

    tiktoken (cl100k_base) is used when available; otherwise a word-count
    heuristic is applied. Binary types (images, audio, video) yield 0.
    """
    from .utils.tokens import estimate_tokens as _estimate

    count = _estimate(path)
    return count
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # File identity utilities
144
+ # ---------------------------------------------------------------------------
145
+
146
def extract_exif(path: str | Path) -> "ExifData":
    """
    Read EXIF metadata from an image and return it as typed ExifData.

    Callers do not need to parse raw IFD tags, convert GPS DMS to decimal
    degrees, decode Rational values or cope with missing tags themselves.

    result.camera_make → "Apple"
    result.camera_model → "iPhone 15 Pro"
    result.focal_length → 6.86 (mm)
    result.aperture → 1.78 (f-number)
    result.iso → 50
    result.datetime_taken → "2024:03:15 14:22:31"
    result.gps.latitude → 51.507351 (decimal degrees, ready to use)
    result.gps.longitude → -0.127758
    result.gps.google_maps_url → "https://www.google.com/maps?q=51.5,-0.12"
    """
    from .extractors.exif_extractor import extract_exif as _read_exif

    image_path = Path(path)
    return _read_exif(image_path)
167
+
168
+
169
def detect_pii(text: str) -> "PIIResult":
    """
    Scan a string for Personally Identifiable Information.

    Covers: email, phone (US + intl), credit card, SSN, IBAN,
    IP address, API keys, AWS keys, bearer tokens.

    Regex-based and fast; works offline with no LLM and no API key.

    result.has_pii → True
    result.types_found → ["email", "credit_card", "aws_key"]
    result.count → 3
    result.matches[0] → PIIMatch(type="email", value="...", start=12)
    result.redacted_text → "...send to [REDACTED_EMAIL]..."
    """
    from .features.pipeline import detect_pii as _scan

    findings = _scan(text)
    return findings
186
+
187
+
188
def redact_pii(text: str) -> str:
    """Return *text* with every detected PII value replaced by a [REDACTED_TYPE] tag."""
    from .features.pipeline import redact_pii as _redact

    scrubbed = _redact(text)
    return scrubbed
192
+
193
+
194
def content_hash(path: str | Path) -> "ContentHash":
    """
    Hash a file's content with SHA-256 and MD5.

    The file is read in 64KB chunks, so files of any size can be hashed
    without loading them into memory. Prefer SHA-256 for deduplication and
    integrity checks; MD5 is provided for legacy system compatibility.

    result.sha256 → "a750aec01847d06d..."
    result.md5 → "d7591a0ac484c964..."
    result == other_hash → True if same file content
    str(result) → "a750aec01847d06d..." (short display form)
    """
    from .features.pipeline import content_hash as _hash_file

    target = Path(path)
    return _hash_file(target)
209
+
210
+
211
def find_duplicates(paths: list, *, min_size: int = 1) -> list:
    """
    Group the given paths by identical file content.

    Comparison is by SHA-256, so exact binary duplicates are caught whatever
    their filenames. Only groups of 2+ files are returned; an empty list
    means no duplicates were found.

    group.count → 3 (how many copies)
    group.wasted_bytes → 40 (space wasted by copies)
    group.paths → [Path(...), Path(...), Path(...)]

    Example:
    groups = find_duplicates(list(Path("uploads").rglob("*")))
    for g in groups:
        # keep first, delete the rest
        for duplicate in g.paths[1:]:
            duplicate.unlink()
    """
    from .features.pipeline import find_duplicates as _group_duplicates

    # Accept str or Path entries; the pipeline is always handed Path objects.
    candidates = list(map(Path, paths))
    return _group_duplicates(candidates, min_size=min_size)
231
+
232
+
233
def diff_files(path_a: str | Path, path_b: str | Path) -> "FileDiff":
    """
    Compute a structural diff between two text files.

    Useful for comparing document versions, spotting what changed in a
    contract, or checking whether a config file was modified.

    diff.lines_added → 6
    diff.lines_removed → 3
    diff.diff_ratio → 0.72 (0.0=completely different, 1.0=identical)
    diff.identical → False
    diff.summary → "+6 lines added, -3 lines removed, 72% similar"
    diff.unified_diff → standard unified diff string (--- a/ +++ b/ format)
    """
    from .features.pipeline import diff_files as _diff

    left = Path(path_a)
    right = Path(path_b)
    return _diff(left, right)
249
+
250
+
251
def analyze_many(
    paths: list,
    *,
    max_workers: int = 8,
    skip_metadata: bool = False,
    on_progress: Any = None,
) -> "BatchResult":
    """
    Run the analysis over many files concurrently with a thread pool.

    The BatchResult combines aggregate stats with a per-file
    AnalysisResult dict.

    batch.total → 50
    batch.succeeded → 47
    batch.failed → 3
    batch.duration_seconds → 1.24
    batch.success_rate → 0.94
    batch.results["path"] → AnalysisResult
    batch.errors["path"] → "error message"

    on_progress callback: fn(completed: int, total: int, path: str)

    Example — find all high-risk files in an uploads folder:
    batch = analyze_many(list(Path("uploads").rglob("*")))
    risky = [p for p, r in batch.results.items() if r.risk_score > 50]
    """
    from .features.pipeline import analyze_many as _run_batch

    # All tuning knobs are forwarded unchanged to the pipeline.
    options = {
        "max_workers": max_workers,
        "skip_metadata": skip_metadata,
        "on_progress": on_progress,
    }
    return _run_batch(paths, **options)
280
+
281
+
282
+ # ---------------------------------------------------------------------------
283
+ # TYPE_CHECKING imports for IDE support (avoids circular imports at runtime)
284
+ # ---------------------------------------------------------------------------
285
+ from typing import TYPE_CHECKING
286
+ if TYPE_CHECKING:
287
+ from .extractors.exif_extractor import ExifData
288
+ from .features.pipeline import (
289
+ PIIResult, ContentHash, DuplicateGroup, FileDiff, BatchResult
290
+ )
@@ -0,0 +1,3 @@
1
+ from .commands import cli, main
2
+
3
+ __all__ = ["cli", "main"]
@@ -0,0 +1,182 @@
1
+ """
2
+ FileDNA CLI.
3
+
4
+ Usage:
5
+ filedna analyze <path> [--pretty] [--no-metadata]
6
+ filedna validate <path>
7
+ filedna type <path>
8
+ filedna tokens <path>
9
+ filedna url <url> [--pretty]
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import sys
15
+
16
+ import click
17
+
18
+
19
# Root command group: every sub-command in this module registers itself on it
# with @cli.command(). The docstring below is the text click shows as help.
@click.group()
@click.version_option(package_name="filedna")
def cli() -> None:
    """FileDNA – Discover a file's true identity."""
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # analyze
27
+ # ---------------------------------------------------------------------------
28
+
29
@cli.command()
@click.argument("path")
@click.option("--pretty", is_flag=True, default=False, help="Human-friendly output")
@click.option("--no-metadata", is_flag=True, default=False, help="Skip metadata extraction")
def analyze(path: str, pretty: bool, no_metadata: bool) -> None:
    """Analyze PATH and print a full identity report."""
    from .. import analyze as _analyze

    report = _analyze(path, skip_metadata=no_metadata)

    # Human-friendly rendering was requested explicitly.
    if pretty:
        _print_pretty(report)
        return

    # Default output is JSON; default=str stringifies values json cannot encode.
    payload = json.dumps(report.model_dump(), indent=2, default=str)
    click.echo(payload)
43
+
44
+
45
def _print_pretty(result) -> None:  # type: ignore[type-arg]
    """Print an AnalysisResult as a coloured, human-readable report."""
    from ..models.result import AnalysisResult
    r: AnalysisResult = result

    # Headline: validity marker plus the detected type.
    if r.valid:
        icon, color = "✓", "green"
    else:
        icon, color = "✗", "red"
    click.echo(click.style(f"{icon} {r.real_type.upper()}", fg=color, bold=True))
    click.echo()

    meta = r.metadata

    # Plain "Label: value" fields, printed only when the key is present.
    simple_fields = (
        ("pages", "Pages"),
        ("slides", "Slides"),
        ("paragraphs", "Paragraphs"),
        ("sheets", "Sheets"),
    )
    for key, label in simple_fields:
        if key in meta:
            click.echo(f"{label}: {meta[key]}")

    if "duration" in meta:
        click.echo(f"Duration: {meta['duration']}s")
    if "width" in meta and "height" in meta:
        click.echo(f"Dimensions: {meta['width']}×{meta['height']}")
    if "language" in meta:
        click.echo(f"Language: {meta['language']}")

    # Boolean content flags are shown only when present and truthy.
    for key, text in (
        ("contains_tables", "Contains tables"),
        ("contains_images", "Contains images"),
    ):
        if key in meta and meta[key]:
            click.echo(text)

    click.echo(f"Size: {r.size_human}")

    if "estimated_tokens" in meta:
        tok = meta["estimated_tokens"]
        # Abbreviate counts of a thousand or more as e.g. "1.5k".
        if tok >= 1000:
            tok_str = f"{tok / 1000:.1f}k"
        else:
            tok_str = str(tok)
        click.echo(f"Tokens: {tok_str}")

    # Risk colouring: 0 is green, below 50 is yellow, everything else is red.
    if r.risk_score == 0:
        risk_color = "green"
    elif r.risk_score < 50:
        risk_color = "yellow"
    else:
        risk_color = "red"
    click.echo(f"Risk Score: {click.style(str(r.risk_score), fg=risk_color)}")
    click.echo(f"MIME: {r.mime}")

    if r.extension_matches:
        ext_match = click.style("yes", fg="green")
    else:
        ext_match = click.style("no", fg="red")
    click.echo(f"Ext match: {ext_match} ({r.extension!r} declared)")

    # Warnings and errors each get a blank separator line when non-empty.
    if r.warnings:
        click.echo()
        for warning in r.warnings:
            click.echo(click.style(f"⚠ {warning}", fg="yellow"))

    if r.errors:
        click.echo()
        for error in r.errors:
            click.echo(click.style(f"✗ {error}", fg="red"))
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # validate
101
+ # ---------------------------------------------------------------------------
102
+
103
@cli.command()
@click.argument("path")
def validate(path: str) -> None:
    """Validate PATH and print result."""
    from .. import validate as _validate

    outcome = _validate(path)

    if outcome.valid:
        icon, color = "✓", "green"
    else:
        icon, color = "✗", "red"
    headline = f"{icon} {outcome.real_type.upper()}"
    click.echo(click.style(headline, fg=color, bold=True))

    # Errors are listed before warnings, each indented under the headline.
    for problem in outcome.errors or ():
        click.echo(click.style(f" ✗ {problem}", fg="red"))
    for notice in outcome.warnings or ():
        click.echo(click.style(f" ⚠ {notice}", fg="yellow"))

    # Exit status 0 for a valid file, 1 otherwise, so scripts can branch on it.
    exit_code = 0 if outcome.valid else 1
    sys.exit(exit_code)
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # type
124
+ # ---------------------------------------------------------------------------
125
+
126
@cli.command(name="type")
@click.argument("path")
def detect_type_cmd(path: str) -> None:
    """Print the detected real type of PATH."""
    from .. import detect_type as _detect_type

    # Registered as "type"; the function has its own name to avoid shadowing the builtin.
    kind = _detect_type(path)
    click.echo(kind)
133
+
134
+
135
+ # ---------------------------------------------------------------------------
136
+ # tokens
137
+ # ---------------------------------------------------------------------------
138
+
139
@cli.command()
@click.argument("path")
def tokens(path: str) -> None:
    """Estimate token count for PATH."""
    from .. import estimate_tokens as _estimate_tokens

    count = _estimate_tokens(path)
    click.echo(count)
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # url
150
+ # ---------------------------------------------------------------------------
151
+
152
@cli.command()
@click.argument("url")
@click.option("--pretty", is_flag=True, default=False, help="Human-friendly output")
def url(url: str, pretty: bool) -> None:
    """Inspect URL and print content type / metadata."""
    from .. import inspect_url as _inspect_url

    result = _inspect_url(url)

    # Default output is the raw result as JSON.
    if not pretty:
        click.echo(json.dumps(result, indent=2, default=str))
        return

    valid = result.get("valid", False)
    icon = "✓" if valid else "✗"
    color = "green" if valid else "red"

    # A key that is present but None must not crash .upper(); show "unknown".
    real_type = str(result.get("real_type") or "unknown")
    click.echo(click.style(f"{icon} {real_type.upper()}", fg=color, bold=True))

    # "url" is not among the keys inspect_url documents, so fall back to the
    # argument the user passed instead of raising KeyError.
    click.echo(f"URL: {result.get('url', url)}")
    click.echo(f"MIME: {result.get('mime') or 'unknown'}")

    # Optional fields are printed only when present and truthy.
    if result.get("size_human"):
        click.echo(f"Size: {result['size_human']}")
    if result.get("status_code"):
        click.echo(f"HTTP: {result['status_code']}")

    for e in result.get("errors") or []:
        click.echo(click.style(f"✗ {e}", fg="red"))
175
+
176
+
177
def main() -> None:
    """Entry point: invoke the click command group."""
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,5 @@
1
+ from .engine import analyze_file
2
+ from .risk import compute_risk
3
+ from .url_inspector import inspect_url
4
+
5
+ __all__ = ["analyze_file", "inspect_url", "compute_risk"]
filedna/core/engine.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ FileDNA – core analysis engine.
3
+
4
+ Orchestrates detection → validation → inspection → risk scoring.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from ..core.risk import compute_risk
13
+ from ..detectors.type_detector import detect, extension_matches, get_extension
14
+ from ..inspectors.metadata import human_size, inspect
15
+ from ..models.result import AnalysisResult
16
+ from ..validators.file_validators import validate
17
+
18
+
19
def _make_result(**kwargs: Any) -> AnalysisResult:
    """Build an AnalysisResult from keyword arguments (single construction point)."""
    result = AnalysisResult(**kwargs)
    return result
21
+
22
+
23
def analyze_file(path: str | Path, *, skip_metadata: bool = False) -> AnalysisResult:
    """
    Full analysis pipeline for a local file.

    Steps:
    1. Existence & readability check
    2. Type detection (magic bytes)
    3. Extension mismatch check
    4. Structural validation
    5. Metadata extraction
    6. Risk scoring

    Access problems (missing path, not a regular file, unreadable, empty)
    are reported through an AnalysisResult with valid=False and a message in
    errors; the later pipeline steps are skipped in those cases.

    Pass skip_metadata=True to skip step 5 and leave metadata empty.
    """
    p = Path(path)
    errors: list[str] = []
    warnings: list[str] = []

    # ------------------------------------------------------------------ #
    # 1. File existence / readability                                    #
    # ------------------------------------------------------------------ #
    if not p.exists():
        return _make_result(
            valid=False,
            errors=[f"File not found: {path}"],
            warnings=[],
        )
    if not p.is_file():
        return _make_result(
            valid=False,
            errors=[f"Path is not a file: {path}"],
            warnings=[],
        )
    if not os.access(p, os.R_OK):
        return _make_result(
            valid=False,
            errors=[f"File is not readable: {path}"],
            warnings=[],
        )

    try:
        size_bytes = p.stat().st_size
    except OSError:
        # The file can vanish or become inaccessible between the checks above
        # and this stat(); report it like every other access failure rather
        # than letting the exception escape.
        return _make_result(
            valid=False,
            errors=[f"File is not readable: {path}"],
            warnings=[],
        )
    size_human_str = human_size(size_bytes)

    if size_bytes == 0:
        return _make_result(
            valid=False,
            size_bytes=0,
            size_human="0 B",
            errors=["File is empty"],
            warnings=[],
            risk_score=30,
        )

    declared_ext = get_extension(p)

    # ------------------------------------------------------------------ #
    # 2. Type detection                                                  #
    # ------------------------------------------------------------------ #
    real_type, mime = detect(p)

    # ------------------------------------------------------------------ #
    # 3. Extension mismatch                                              #
    # ------------------------------------------------------------------ #
    ext_ok = extension_matches(real_type, declared_ext)
    # Only a declared extension can be reported as "not a valid X".
    if not ext_ok and declared_ext:
        errors.append(f"File is not a valid {declared_ext.upper()} (real type: {real_type})")

    # ------------------------------------------------------------------ #
    # 4. Structural validation                                           #
    # ------------------------------------------------------------------ #
    valid, val_errors, val_warnings = validate(p, real_type)
    errors.extend(val_errors)
    warnings.extend(val_warnings)

    # ------------------------------------------------------------------ #
    # 5. Metadata extraction                                             #
    # ------------------------------------------------------------------ #
    metadata: dict[str, Any] = {}
    if not skip_metadata:
        metadata = inspect(p, real_type)

    # ------------------------------------------------------------------ #
    # 6. Risk scoring                                                    #
    # ------------------------------------------------------------------ #
    risk_score, risk_warnings = compute_risk(
        valid=valid,
        extension_matches=ext_ok,
        errors=errors,
        warnings=warnings,
        metadata=metadata,
        real_type=real_type,
        path=p,
    )
    warnings.extend(risk_warnings)

    # Extension mismatch means the file is NOT what it claims to be
    # (even if the actual content is valid for its real type)
    final_valid = valid and not errors and ext_ok

    return _make_result(
        valid=final_valid,
        real_type=real_type,
        mime=mime,
        extension=declared_ext,
        extension_matches=ext_ok,
        size_bytes=size_bytes,
        size_human=size_human_str,
        risk_score=risk_score,
        warnings=warnings,
        errors=errors,
        metadata=metadata,
    )
filedna/core/risk.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ FileDNA – risk scoring engine.
3
+
4
+ Computes a 0-100 risk score based on validation results and metadata.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+
12
# Archive member suffixes (compared lower-cased) treated as executable payloads.
_EXECUTABLE_SUFFIXES = (
    ".exe", ".dll", ".bat", ".cmd", ".ps1", ".vbs",
    ".msi", ".scr", ".com", ".pif",
)

# Detected types that are ZIP containers and therefore get their members scanned.
_ZIP_BASED_TYPES = ("zip", "docx", "xlsx", "pptx", "epub")


def compute_risk(
    *,
    valid: bool,
    extension_matches: bool,
    errors: list[str],
    warnings: list[str],
    metadata: dict[str, Any],
    real_type: str,
    path: Path,
) -> tuple[int, list[str]]:
    """
    Return (risk_score, extra_warnings).

    Score contributions, summed and then capped at 100:
    +40 extension mismatch
    +50 file failed validation, or +30 if it is valid but errors were recorded
    +20 metadata contains an "inspection_error" key
    +30 file is empty
    +80 a ZIP-based file contains a member with an executable suffix

    extra_warnings holds the messages produced here; the caller decides how
    to merge them. The warnings argument is accepted for interface
    compatibility and is not consulted.

    A path that cannot be stat'ed is treated as not empty rather than raising,
    so the remaining signals are still scored.
    """
    import zipfile

    score = 0
    extra_warnings: list[str] = []

    # Extension mismatch
    if not extension_matches:
        score += 40
        extra_warnings.append("Extension mismatch")

    # Corrupted / unreadable
    if not valid:
        score += 50
    elif errors:
        score += 30

    # Metadata could not be read
    if "inspection_error" in metadata:
        score += 20
        extra_warnings.append("Metadata could not be fully extracted")

    # Empty file
    try:
        is_empty = path.stat().st_size == 0
    except OSError:
        # Missing or inaccessible path: emptiness is unknown, so do not score it.
        is_empty = False
    if is_empty:
        score += 30
        extra_warnings.append("File is empty")

    # Embedded executable heuristic for ZIP-based formats
    if real_type in _ZIP_BASED_TYPES:
        try:
            with zipfile.ZipFile(path) as archive:
                # str.endswith accepts a tuple; only the first hit is reported.
                offender = next(
                    (
                        name
                        for name in archive.namelist()
                        if name.lower().endswith(_EXECUTABLE_SUFFIXES)
                    ),
                    None,
                )
                if offender is not None:
                    score += 80
                    extra_warnings.append(
                        f"Embedded executable detected: {offender}"
                    )
        except Exception:
            # Deliberately best-effort: the archive is untrusted input and a
            # damaged or crafted one must not abort scoring. Structural
            # problems are reported by the validators, not by this heuristic.
            pass

    return min(score, 100), extra_warnings