PyPI - filedna - Versions diffs - 1.2.4__py3-none-any.whl - Mend

filedna 1.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

filedna/__init__.py +290 -0
filedna/cli/__init__.py +3 -0
filedna/cli/commands.py +182 -0
filedna/core/__init__.py +5 -0
filedna/core/engine.py +132 -0
filedna/core/risk.py +69 -0
filedna/core/url_inspector.py +129 -0
filedna/detectors/__init__.py +3 -0
filedna/detectors/type_detector.py +294 -0
filedna/extractors/__init__.py +5 -0
filedna/extractors/exif_extractor.py +346 -0
filedna/extractors/text_extractor.py +381 -0
filedna/extractors/url_extractor.py +256 -0
filedna/features/__init__.py +15 -0
filedna/features/ai_features.py +698 -0
filedna/features/pipeline.py +625 -0
filedna/inspectors/__init__.py +3 -0
filedna/inspectors/metadata.py +428 -0
filedna/models/__init__.py +3 -0
filedna/models/result.py +106 -0
filedna/utils/__init__.py +3 -0
filedna/utils/tokens.py +26 -0
filedna/validators/__init__.py +3 -0
filedna/validators/file_validators.py +356 -0
filedna-1.2.4.dist-info/METADATA +774 -0
filedna-1.2.4.dist-info/RECORD +29 -0
filedna-1.2.4.dist-info/WHEEL +4 -0
filedna-1.2.4.dist-info/entry_points.txt +2 -0
filedna-1.2.4.dist-info/licenses/LICENSE +21 -0

filedna/__init__.py ADDED Viewed

@@ -0,0 +1,290 @@
+"""
+FileDNA – Discover a file's true identity.
+FileDNA's core job: tell you what a file REALLY is, whether it's trustworthy,
+and surface every signal about it — without trusting extensions.
+What makes FileDNA different from content-core / LangChain / etc:
+  - content-core extracts TEXT from files (that's its whole job)
+  - LangChain chunks that text for RAG pipelines
+  - FileDNA answers: what IS this file? is it valid? is it what it claims?
+    is it a duplicate? does it contain PII? what are its hashes?
+These are the things nobody else does as a unified file-identity layer.
+Core API (no API key; only inspect_url touches the network):
+    analyze(path)              → AnalysisResult   — full identity report
+    validate(path)             → AnalysisResult   — is it structurally valid?
+    detect_type(path)          → str              — real type from magic bytes
+    inspect_file(path)         → dict             — metadata (pages, dims, etc)
+    inspect_url(url)           → dict             — HEAD request metadata
+    estimate_tokens(path)      → int              — token count estimate
+File identity utilities (no API key, no network):
+    extract_exif(path)         → ExifData         — GPS, camera, timestamps
+    detect_pii(text)           → PIIResult        — email, phone, card, SSN...
+    redact_pii(text)           → str              — replace PII with [REDACTED]
+    content_hash(path)         → ContentHash      — SHA-256 + MD5
+    find_duplicates(paths)     → list[DuplicateGroup]
+    diff_files(path_a, path_b) → FileDiff
+    analyze_many(paths)        → BatchResult      — concurrent batch analysis
+AI features (optional, requires API key via AIConfig):
+    from filedna.features.ai_features import (
+        AIConfig,
+        classify_content,      — "is this a legal contract or invoice?"
+        extract_structured,    — pull typed fields from unstructured text
+        clean_document,        — remove headers/footers/page numbers
+        semantic_similarity,   — are these two documents saying the same thing?
+    )
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+from .models.result import AnalysisResult
+# Must match the distribution version: this wheel and its dist-info are 1.2.4.
+__version__ = "1.2.4"
+# Names exported by `from filedna import *`; the inline comments group them by role.
+__all__ = [
+    # Core
+    "analyze", "validate", "detect_type", "inspect_file",
+    "inspect_url", "estimate_tokens",
+    # File identity
+    "extract_exif",
+    "detect_pii", "redact_pii",
+    "content_hash", "find_duplicates",
+    "diff_files", "analyze_many",
+    # Model
+    "AnalysisResult",
+]
+# ---------------------------------------------------------------------------
+# Core API
+# ---------------------------------------------------------------------------
+def analyze(path: str | Path, *, skip_metadata: bool = False) -> AnalysisResult:
+    """
+    Build the complete identity report for a single file.
+    The real type comes from the file's magic bytes, never from its extension;
+    the report also covers structural validation, metadata and a risk score.
+    Pass ``skip_metadata=True`` to leave metadata extraction out.
+    The returned AnalysisResult carries: valid, real_type, mime, extension,
+    extension_matches, size_human, risk_score, warnings, errors, metadata.
+    """
+    # Function-scope import: the engine is only loaded on the first call.
+    from .core.engine import analyze_file as _run_engine
+    report = _run_engine(path, skip_metadata=skip_metadata)
+    return report
+def validate(path: str | Path) -> AnalysisResult:
+    """
+    Check structural integrity only.
+    Same as ``analyze(path, skip_metadata=True)``, so it is quicker than a
+    full analyze() call. Meant for upload validation: inspect result.valid
+    and result.errors.
+    """
+    outcome = analyze(path, skip_metadata=True)
+    return outcome
+def detect_type(path: str | Path) -> str:
+    """
+    Return the file's real type as identified from its magic bytes.
+    The extension plays no part in the answer:
+    detect_type("photo.pdf")  →  "png"   (extension lied)
+    detect_type("data.zip")   →  "docx"  (actually a Word document)
+    """
+    from .detectors.type_detector import detect as _sniff
+    # detect() yields a two-item result; only the first item (the type) is returned.
+    kind, _unused = _sniff(Path(path))
+    return kind
+def inspect_file(path: str | Path) -> dict[str, Any]:
+    """
+    Return the metadata that applies to the file's detected type.
+    The type is detected first and then handed to the metadata inspector.
+    Examples of the keys reported per type:
+    PDF  → pages, language, contains_tables, estimated_tokens
+    DOCX → paragraphs, words, estimated_pages
+    XLSX → sheets, rows, columns
+    PNG  → width, height, mode, dpi, has_transparency
+    MP3  → duration, bitrate, sample_rate, channels
+    """
+    from .detectors.type_detector import detect as _sniff
+    from .inspectors.metadata import inspect as _read_metadata
+    target = Path(path)
+    kind, _unused = _sniff(target)
+    return _read_metadata(target, kind)
+def inspect_url(url: str, *, timeout: int = 10) -> dict[str, Any]:
+    """
+    Probe a URL with an HTTP HEAD request — content type and size, no download.
+    Result keys: valid, mime, real_type, size_bytes, size_human, status_code.
+    The page body is never fetched; use content-core for full URL extraction.
+    ``timeout`` is forwarded unchanged to the underlying inspector.
+    """
+    from .core.url_inspector import inspect_url as _head_probe
+    response_info = _head_probe(url, timeout=timeout)
+    return response_info
+def estimate_tokens(path: str | Path) -> int:
+    """
+    Approximate how many LLM tokens the file's text content amounts to.
+    tiktoken (cl100k_base) is used when available, otherwise a word-count
+    heuristic. Binary types (images, audio, video) give 0.
+    """
+    from .utils.tokens import estimate_tokens as _count_tokens
+    token_total = _count_tokens(path)
+    return token_total
+# ---------------------------------------------------------------------------
+# File identity utilities
+# ---------------------------------------------------------------------------
+def extract_exif(path: str | Path) -> "ExifData":
+    """
+    Read the EXIF metadata of an image and return it as typed ExifData.
+    Callers get ready-to-use values instead of raw IFD tags: GPS DMS→decimal
+    conversion, Rational parsing and missing-tag handling are done for them.
+    result.camera_make     → "Apple"
+    result.camera_model    → "iPhone 15 Pro"
+    result.focal_length    → 6.86   (mm)
+    result.aperture        → 1.78   (f-number)
+    result.iso             → 50
+    result.datetime_taken  → "2024:03:15 14:22:31"
+    result.gps.latitude    → 51.507351   (decimal degrees, ready to use)
+    result.gps.longitude   → -0.127758
+    result.gps.google_maps_url  → "https://www.google.com/maps?q=51.5,-0.12"
+    """
+    from .extractors.exif_extractor import extract_exif as _read_exif
+    image_path = Path(path)
+    return _read_exif(image_path)
+def detect_pii(text: str) -> "PIIResult":
+    """
+    Find Personally Identifiable Information in a piece of text.
+    Regex-based, fast and fully offline — no LLM, no API key.
+    Categories covered: email, phone (US + intl), credit card, SSN, IBAN,
+    IP address, API keys, AWS keys, bearer tokens.
+    result.has_pii         → True
+    result.types_found     → ["email", "credit_card", "aws_key"]
+    result.count           → 3
+    result.matches[0]      → PIIMatch(type="email", value="...", start=12)
+    result.redacted_text   → "...send to [REDACTED_EMAIL]..."
+    """
+    from .features.pipeline import detect_pii as _scan_text
+    findings = _scan_text(text)
+    return findings
+def redact_pii(text: str) -> str:
+    """Return ``text`` with every detected PII value swapped for a [REDACTED_TYPE] tag."""
+    from .features.pipeline import redact_pii as _scrub
+    scrubbed = _scrub(text)
+    return scrubbed
+def content_hash(path: str | Path) -> "ContentHash":
+    """
+    Hash a file's content with both SHA-256 and MD5.
+    The file is read in 64KB chunks, so size is not limited by memory.
+    SHA-256 is the one to use for deduplication and integrity checks;
+    MD5 is there for compatibility with legacy systems.
+    result.sha256  → "a750aec01847d06d..."
+    result.md5     → "d7591a0ac484c964..."
+    result == other_hash  → True if same file content
+    str(result)    → "a750aec01847d06d..."  (short display form)
+    """
+    from .features.pipeline import content_hash as _hash_file
+    file_path = Path(path)
+    return _hash_file(file_path)
+def find_duplicates(paths: list, *, min_size: int = 1) -> list:
+    """
+    Group the given paths by identical content.
+    Comparison is by SHA-256, so exact binary duplicates are caught whatever
+    their filenames. Only groups of 2+ files are returned; an empty list
+    means there are no duplicates. Every entry of ``paths`` is converted to
+    a Path, and ``min_size`` is passed through to the pipeline implementation.
+    group.count        → 3          (how many copies)
+    group.wasted_bytes → 40         (space wasted by copies)
+    group.paths        → [Path(...), Path(...), Path(...)]
+    Example:
+        groups = find_duplicates(list(Path("uploads").rglob("*")))
+        for g in groups:
+            # keep first, delete the rest
+            for duplicate in g.paths[1:]:
+                duplicate.unlink()
+    """
+    from .features.pipeline import find_duplicates as _group_by_content
+    candidates = list(map(Path, paths))
+    return _group_by_content(candidates, min_size=min_size)
+def diff_files(path_a: str | Path, path_b: str | Path) -> "FileDiff":
+    """
+    Compare two text files and describe how they differ.
+    Typical uses: comparing document versions, spotting what changed in a
+    contract, checking whether a config file was modified.
+    diff.lines_added    → 6
+    diff.lines_removed  → 3
+    diff.diff_ratio     → 0.72   (0.0=completely different, 1.0=identical)
+    diff.identical      → False
+    diff.summary        → "+6 lines added, -3 lines removed, 72% similar"
+    diff.unified_diff   → standard unified diff string (--- a/  +++ b/ format)
+    """
+    from .features.pipeline import diff_files as _compare
+    left = Path(path_a)
+    right = Path(path_b)
+    return _compare(left, right)
+def analyze_many(
+    paths: list,
+    *,
+    max_workers: int = 8,
+    skip_metadata: bool = False,
+    on_progress: Any = None,
+) -> "BatchResult":
+    """
+    Run the analysis over many files concurrently (thread pool).
+    The BatchResult combines aggregate statistics with a per-file dict of
+    AnalysisResult objects.
+    batch.total              → 50
+    batch.succeeded          → 47
+    batch.failed             → 3
+    batch.duration_seconds   → 1.24
+    batch.success_rate       → 0.94
+    batch.results["path"]    → AnalysisResult
+    batch.errors["path"]     → "error message"
+    on_progress callback: fn(completed: int, total: int, path: str)
+    Example — find all high-risk files in an uploads folder:
+        batch = analyze_many(list(Path("uploads").rglob("*")))
+        risky = [p for p, r in batch.results.items() if r.risk_score > 50]
+    """
+    from .features.pipeline import analyze_many as _run_batch
+    # All tuning knobs are forwarded untouched to the pipeline implementation.
+    options = {
+        "max_workers": max_workers,
+        "skip_metadata": skip_metadata,
+        "on_progress": on_progress,
+    }
+    return _run_batch(paths, **options)
+# ---------------------------------------------------------------------------
+# TYPE_CHECKING imports for IDE support (avoids circular imports at runtime)
+# ---------------------------------------------------------------------------
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from .extractors.exif_extractor import ExifData
+    from .features.pipeline import (
+        PIIResult, ContentHash, DuplicateGroup, FileDiff, BatchResult
+    )

filedna/cli/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .commands import cli, main
+__all__ = ["cli", "main"]

filedna/cli/commands.py ADDED Viewed

@@ -0,0 +1,182 @@
+"""
+FileDNA CLI.
+Usage:
+    filedna analyze <path> [--pretty] [--json] [--no-metadata]
+    filedna validate <path>
+    filedna type <path>
+    filedna tokens <path>
+    filedna url <url> [--pretty]
+"""
+from __future__ import annotations
+import json
+import sys
+import click
+# Root command group; the sub-commands in this module attach to it via @cli.command().
+# The version option is configured for the "filedna" package.
+@click.group()
+@click.version_option(package_name="filedna")
+def cli() -> None:
+    """FileDNA – Discover a file's true identity."""
+# ---------------------------------------------------------------------------
+# analyze
+# ---------------------------------------------------------------------------
+@cli.command()
+@click.argument("path")
+@click.option("--pretty", is_flag=True, default=False, help="Human-friendly output")
+@click.option("--json", "as_json", is_flag=True, default=False, help="JSON output (default)")
+@click.option("--no-metadata", is_flag=True, default=False, help="Skip metadata extraction")
+def analyze(path: str, pretty: bool, no_metadata: bool, as_json: bool = False) -> None:
+    """Analyze PATH and print a full identity report."""
+    # The module usage text advertises --json, so the flag is accepted here.
+    # JSON is already the default format; an explicit --json wins over --pretty.
+    # `as_json` is last and defaulted so existing positional callers keep working.
+    from .. import analyze as _analyze
+    result = _analyze(path, skip_metadata=no_metadata)
+    if pretty and not as_json:
+        _print_pretty(result)
+    else:
+        # default=str stringifies any value json cannot serialise natively.
+        click.echo(json.dumps(result.model_dump(), indent=2, default=str))
+def _print_pretty(result) -> None:  # type: ignore[type-arg]
+    """Print an AnalysisResult as a colourised, human-readable report."""
+    from ..models.result import AnalysisResult
+    report: AnalysisResult = result
+    def row(label: str, value: object) -> None:
+        # Every labelled line pads its label to a 14-character column.
+        click.echo(f"{label:<14}{value}")
+    # Headline: check mark or cross plus the detected type, coloured by validity.
+    if report.valid:
+        mark, tone = "✓", "green"
+    else:
+        mark, tone = "✗", "red"
+    click.echo(click.style(f"{mark} {report.real_type.upper()}", fg=tone, bold=True))
+    click.echo()
+    info = report.metadata
+    # Plain counters, shown only when the key is present in the metadata.
+    for key, label in (
+        ("pages", "Pages:"),
+        ("slides", "Slides:"),
+        ("paragraphs", "Paragraphs:"),
+        ("sheets", "Sheets:"),
+    ):
+        if key in info:
+            row(label, info[key])
+    if "duration" in info:
+        row("Duration:", f"{info['duration']}s")
+    if "width" in info and "height" in info:
+        row("Dimensions:", f"{info['width']}×{info['height']}")
+    if "language" in info:
+        row("Language:", info["language"])
+    # Boolean markers are printed only when present and truthy.
+    if info.get("contains_tables"):
+        click.echo("Contains tables")
+    if info.get("contains_images"):
+        click.echo("Contains images")
+    row("Size:", report.size_human)
+    if "estimated_tokens" in info:
+        token_count = info["estimated_tokens"]
+        if token_count >= 1000:
+            token_text = f"{token_count / 1000:.1f}k"
+        else:
+            token_text = str(token_count)
+        row("Tokens:", token_text)
+    # Risk colour bands: 0 is green, below 50 is yellow, everything else is red.
+    if report.risk_score == 0:
+        risk_tone = "green"
+    elif report.risk_score < 50:
+        risk_tone = "yellow"
+    else:
+        risk_tone = "red"
+    row("Risk Score:", click.style(str(report.risk_score), fg=risk_tone))
+    row("MIME:", report.mime)
+    if report.extension_matches:
+        match_text = click.style("yes", fg="green")
+    else:
+        match_text = click.style("no", fg="red")
+    row("Ext match:", f"{match_text}  ({report.extension!r} declared)")
+    # Warnings, then errors, each group preceded by a blank line.
+    if report.warnings:
+        click.echo()
+        for notice in report.warnings:
+            click.echo(click.style(f"⚠  {notice}", fg="yellow"))
+    if report.errors:
+        click.echo()
+        for problem in report.errors:
+            click.echo(click.style(f"✗  {problem}", fg="red"))
+# ---------------------------------------------------------------------------
+# validate
+# ---------------------------------------------------------------------------
+@cli.command()
+@click.argument("path")
+def validate(path: str) -> None:
+    """Validate PATH and print result."""
+    from .. import validate as _validate
+    result = _validate(path)
+    # Headline: check mark or cross plus the detected type, coloured by outcome.
+    if result.valid:
+        mark, tone = "✓", "green"
+    else:
+        mark, tone = "✗", "red"
+    headline = f"{mark} {result.real_type.upper()}"
+    click.echo(click.style(headline, fg=tone, bold=True))
+    # Errors first, then warnings, each indented beneath the headline.
+    for problem in result.errors or ():
+        click.echo(click.style(f"  ✗  {problem}", fg="red"))
+    for notice in result.warnings or ():
+        click.echo(click.style(f"  ⚠  {notice}", fg="yellow"))
+    # The exit status mirrors validity: 0 when valid, 1 otherwise.
+    exit_code = 0 if result.valid else 1
+    sys.exit(exit_code)
+# ---------------------------------------------------------------------------
+# type
+# ---------------------------------------------------------------------------
+@cli.command(name="type")
+@click.argument("path")
+def detect_type_cmd(path: str) -> None:
+    """Print the detected real type of PATH."""
+    # Registered as "type"; the function carries a different name than the command.
+    from .. import detect_type as _detect_type
+    real_type = _detect_type(path)
+    click.echo(real_type)
+# ---------------------------------------------------------------------------
+# tokens
+# ---------------------------------------------------------------------------
+@cli.command()
+@click.argument("path")
+def tokens(path: str) -> None:
+    """Estimate token count for PATH."""
+    from .. import estimate_tokens as _estimate_tokens
+    token_total = _estimate_tokens(path)
+    click.echo(token_total)
+# ---------------------------------------------------------------------------
+# url
+# ---------------------------------------------------------------------------
+@cli.command()
+@click.argument("url")
+@click.option("--pretty", is_flag=True, default=False, help="Human-friendly output")
+def url(url: str, pretty: bool) -> None:
+    """Inspect URL and print content type / metadata."""
+    from .. import inspect_url as _inspect_url
+    result = _inspect_url(url)
+    if not pretty:
+        # Default output: the raw result dict as JSON (default=str covers odd values).
+        click.echo(json.dumps(result, indent=2, default=str))
+        return
+    valid = result.get("valid", False)
+    icon = "✓" if valid else "✗"
+    color = "green" if valid else "red"
+    # Every field is read defensively: a missing key or a None value must not
+    # crash the pretty printer.
+    real_type = result.get("real_type") or "unknown"
+    click.echo(click.style(f"{icon} {str(real_type).upper()}", fg=color, bold=True))
+    # Fall back to the URL the user typed if the result does not echo one back.
+    click.echo(f"URL:    {result.get('url', url)}")
+    click.echo(f"MIME:   {result.get('mime') or 'unknown'}")
+    if result.get("size_human"):
+        click.echo(f"Size:   {result['size_human']}")
+    if result.get("status_code"):
+        click.echo(f"HTTP:   {result['status_code']}")
+    for problem in result.get("errors") or []:
+        click.echo(click.style(f"✗  {problem}", fg="red"))
+def main() -> None:
+    """Entry-point wrapper: invoke the click command group."""
+    cli()
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    main()

filedna/core/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .engine import analyze_file
+from .risk import compute_risk
+from .url_inspector import inspect_url
+__all__ = ["analyze_file", "inspect_url", "compute_risk"]

filedna/core/engine.py ADDED Viewed

@@ -0,0 +1,132 @@
+"""
+FileDNA – core analysis engine.
+Orchestrates detection → validation → inspection → risk scoring.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any
+from ..core.risk import compute_risk
+from ..detectors.type_detector import detect, extension_matches, get_extension
+from ..inspectors.metadata import human_size, inspect
+from ..models.result import AnalysisResult
+from ..validators.file_validators import validate
+def _make_result(**kwargs: Any) -> AnalysisResult:
+    """Build an AnalysisResult from the given keyword fields."""
+    result = AnalysisResult(**kwargs)
+    return result
+def analyze_file(path: str | Path, *, skip_metadata: bool = False) -> AnalysisResult:
+    """
+    Run the whole analysis pipeline on one local file.
+    Order of operations:
+      1. existence, regular-file and readability checks, then the empty-file check
+      2. type detection from content
+      3. comparison with the declared extension
+      4. structural validation
+      5. metadata extraction (left out when ``skip_metadata`` is true)
+      6. risk scoring
+    A failure in stage 1 returns an invalid result straight away.
+    """
+    target = Path(path)
+    def _rejected(message: str, **fields: Any) -> AnalysisResult:
+        # Early-exit result: invalid, exactly one error, no warnings.
+        return _make_result(valid=False, errors=[message], warnings=[], **fields)
+    # ---- 1. Preconditions ------------------------------------------------
+    if not target.exists():
+        return _rejected(f"File not found: {path}")
+    if not target.is_file():
+        return _rejected(f"Path is not a file: {path}")
+    if not os.access(target, os.R_OK):
+        return _rejected(f"File is not readable: {path}")
+    size_bytes = target.stat().st_size
+    size_label = human_size(size_bytes)
+    if size_bytes == 0:
+        return _rejected("File is empty", size_bytes=0, size_human="0 B", risk_score=30)
+    declared_ext = get_extension(target)
+    # ---- 2. Real type from content ---------------------------------------
+    real_type, mime = detect(target)
+    # ---- 3. Declared extension vs real type ------------------------------
+    problems: list[str] = []
+    notices: list[str] = []
+    ext_ok = extension_matches(real_type, declared_ext)
+    if declared_ext and not ext_ok:
+        problems.append(f"File is not a valid {declared_ext.upper()} (real type: {real_type})")
+    # ---- 4. Structural validation ----------------------------------------
+    structurally_valid, val_errors, val_warnings = validate(target, real_type)
+    problems += val_errors
+    notices += val_warnings
+    # ---- 5. Metadata (optional) ------------------------------------------
+    metadata: dict[str, Any] = {} if skip_metadata else inspect(target, real_type)
+    # ---- 6. Risk score ---------------------------------------------------
+    # Scoring sees the errors and warnings gathered so far; its own warnings
+    # are appended only afterwards.
+    risk_score, risk_warnings = compute_risk(
+        valid=structurally_valid,
+        extension_matches=ext_ok,
+        errors=problems,
+        warnings=notices,
+        metadata=metadata,
+        real_type=real_type,
+        path=target,
+    )
+    notices += risk_warnings
+    # A file that is sound for its real type is still reported invalid when
+    # its extension claims something else.
+    final_valid = structurally_valid and not problems and ext_ok
+    return _make_result(
+        valid=final_valid,
+        real_type=real_type,
+        mime=mime,
+        extension=declared_ext,
+        extension_matches=ext_ok,
+        size_bytes=size_bytes,
+        size_human=size_label,
+        risk_score=risk_score,
+        warnings=notices,
+        errors=problems,
+        metadata=metadata,
+    )

filedna/core/risk.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""
+FileDNA – risk scoring engine.
+Computes a 0-100 risk score based on validation results and metadata.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+# Archive member suffixes that the embedded-executable heuristic flags.
+_EXECUTABLE_SUFFIXES = (
+    ".exe", ".dll", ".bat", ".cmd", ".ps1", ".vbs",
+    ".msi", ".scr", ".com", ".pif",
+)
+# Detected types whose member list is scanned as a ZIP archive.
+_ZIP_BASED_TYPES = ("zip", "docx", "xlsx", "pptx", "epub")
+def compute_risk(
+    *,
+    valid: bool,
+    extension_matches: bool,
+    errors: list[str],
+    warnings: list[str],
+    metadata: dict[str, Any],
+    real_type: str,
+    path: Path,
+) -> tuple[int, list[str]]:
+    """
+    Return (risk_score, extra_warnings).
+    Points are added per rule and the total is capped at 100:
+      +40  extension does not match the real type
+      +50  file is not valid, or +30 when it is valid but errors were recorded
+      +20  metadata contains an "inspection_error" key
+      +30  file is empty
+      +80  a ZIP-based file contains a member with an executable suffix
+    ``warnings`` is accepted for interface symmetry but no rule reads it.
+    Filesystem problems never raise from here: a path that cannot be
+    stat()-ed or opened as an archive simply contributes no points.
+    """
+    score = 0
+    extra_warnings: list[str] = []
+    # Extension mismatch
+    if not extension_matches:
+        score += 40
+        extra_warnings.append("Extension mismatch")
+    # Corrupted / unreadable
+    if not valid:
+        score += 50
+    elif errors:
+        score += 30
+    # Metadata could not be read
+    if "inspection_error" in metadata:
+        score += 20
+        extra_warnings.append("Metadata could not be fully extracted")
+    # Empty file. stat() is guarded so that a missing or inaccessible path is
+    # tolerated the same way the archive scan below tolerates I/O failures.
+    try:
+        is_empty = path.stat().st_size == 0
+    except OSError:
+        is_empty = False
+    if is_empty:
+        score += 30
+        extra_warnings.append("File is empty")
+    # Embedded executable heuristic for ZIP-based formats
+    if real_type in _ZIP_BASED_TYPES:
+        import zipfile
+        try:
+            with zipfile.ZipFile(path) as archive:
+                member_names = archive.namelist()
+        except Exception:
+            # Deliberately best-effort: an archive that cannot be opened adds
+            # nothing to the score here.
+            member_names = []
+        # Only the first offending member is reported; the match ignores case.
+        flagged = next(
+            (name for name in member_names
+             if name.lower().endswith(_EXECUTABLE_SUFFIXES)),
+            None,
+        )
+        if flagged is not None:
+            score += 80
+            extra_warnings.append(f"Embedded executable detected: {flagged}")
+    return min(score, 100), extra_warnings