valscanner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valscanner/__init__.py +3 -0
- valscanner/cli.py +101 -0
- valscanner/core/__init__.py +0 -0
- valscanner/core/categories.py +50 -0
- valscanner/core/db.py +117 -0
- valscanner/core/export.py +32 -0
- valscanner/core/metadata.py +175 -0
- valscanner/core/scanner.py +238 -0
- valscanner/core/schema.py +94 -0
- valscanner/core/similarity.py +168 -0
- valscanner/core/tagging.py +102 -0
- valscanner/gui/__init__.py +0 -0
- valscanner/gui/constants.py +50 -0
- valscanner/gui/delegates.py +138 -0
- valscanner/gui/dialogs.py +470 -0
- valscanner/gui/models.py +190 -0
- valscanner/gui/panels/__init__.py +0 -0
- valscanner/gui/panels/console.py +87 -0
- valscanner/gui/panels/detail.py +271 -0
- valscanner/gui/panels/folders.py +275 -0
- valscanner/gui/panels/scans.py +141 -0
- valscanner/gui/panels/similar.py +564 -0
- valscanner/gui/preferences.py +400 -0
- valscanner/gui/theme.py +87 -0
- valscanner/gui/window.py +1673 -0
- valscanner/gui/workers.py +81 -0
- valscanner-0.1.0.dist-info/LICENSE +21 -0
- valscanner-0.1.0.dist-info/METADATA +248 -0
- valscanner-0.1.0.dist-info/RECORD +32 -0
- valscanner-0.1.0.dist-info/WHEEL +5 -0
- valscanner-0.1.0.dist-info/entry_points.txt +3 -0
- valscanner-0.1.0.dist-info/top_level.txt +1 -0
valscanner/__init__.py
ADDED
valscanner/cli.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
"""
|
|
4
|
+
ValScanner CLI — scan a directory and build a searchable file database.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
valscanner /path/to/scan [options]
|
|
8
|
+
valscanner --list-scans --db my.db
|
|
9
|
+
valscanner --delete-scan 3 --db my.db
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import sys
|
|
13
|
+
import time
|
|
14
|
+
import argparse
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from .core.metadata import PIL_AVAILABLE, MUTAGEN_AVAILABLE, PYPDF_AVAILABLE
|
|
18
|
+
from .core.scanner import scan
|
|
19
|
+
from .core.export import export_csv, export_json
|
|
20
|
+
from .core.db import query_db, print_summary, list_scans, delete_scan
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _export_target(db_path: str, suffix: str) -> str:
    """Return the export file path for *db_path* using *suffix* (e.g. ".csv")."""
    db_file = Path(db_path)
    target = db_file.with_suffix(suffix)
    if target == db_file:
        # The database already carries this suffix; never write over it.
        target = db_file.with_name(db_file.name + suffix)
    return str(target)


def main() -> None:
    """Run the ValScanner command-line interface.

    Parses the command line and then does exactly one of:

    * ``--list-scans``: print every scan returned by ``list_scans`` and exit 0.
    * ``--delete-scan ID``: delete that scan and exit 0.
    * otherwise: scan ``path`` (required in this mode), print a summary, and
      optionally export the new scan to CSV/JSON and run a ``--query``.

    Exits with status 1 when the given path does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Scan a directory and build a searchable file database.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("path", nargs="?", help="Root directory to scan")
    parser.add_argument("--db", default="file_index.db",
                        help="SQLite database path (default: file_index.db)")
    parser.add_argument("--label", metavar="NAME", default="",
                        help="Human-readable label for this scan")
    parser.add_argument("--export-csv", action="store_true", help="Export results to CSV")
    parser.add_argument("--export-json", action="store_true", help="Export results to JSON")
    parser.add_argument("--no-hash", action="store_true", help="Skip SHA-256 hashing")
    parser.add_argument("--verbose", action="store_true", help="Print each file as indexed")
    parser.add_argument("--query", metavar="TERM", help="Query the database after scanning")
    parser.add_argument("--list-scans", action="store_true", help="List all scans in the database")
    parser.add_argument("--delete-scan", type=int, metavar="ID", help="Delete a scan by ID")
    args = parser.parse_args()

    # Maintenance modes: handled before any path validation, then exit.
    if args.list_scans:
        scans = list_scans(args.db)
        if not scans:
            print("No scans in database.")
        for s in scans:
            print(f" [{s['id']:3d}] {s['label'] or s['root']:40s} "
                  f"{s['file_count']:>8,} files {s['total_human']:>10s} {s['scanned_at']}")
        sys.exit(0)

    if args.delete_scan is not None:
        delete_scan(args.db, args.delete_scan)
        print(f"ā Scan {args.delete_scan} deleted.")
        sys.exit(0)

    if not args.path:
        parser.error("path is required unless using --list-scans or --delete-scan")

    root = Path(args.path).expanduser().resolve()
    if not root.exists():
        print(f"Error: path does not exist: {root}")
        sys.exit(1)

    print(f"\nš Scanning: {root}")
    print(f" Database: {args.db}")
    if args.label:
        print(f" Label: {args.label}")
    # Warn about optional dependencies whose metadata extraction is disabled.
    if not PIL_AVAILABLE:
        print(" ā Pillow not installed ā image EXIF metadata skipped")
    if not MUTAGEN_AVAILABLE:
        print(" ā mutagen not installed ā audio metadata skipped")
    if not PYPDF_AVAILABLE:
        print(" ā PyPDF2 not installed ā PDF metadata skipped")
    print()

    t0 = time.time()
    stats = scan(root, args.db, compute_hash=not args.no_hash,
                 verbose=args.verbose, label=args.label)
    elapsed = time.time() - t0

    print(f"\nā Done in {elapsed:.1f}s ā "
          f"scan #{stats['scan_id']}, "
          f"{stats['scanned']:,} indexed, "
          f"{stats['errors']:,} errors, "
          f"{stats['skipped']:,} skipped")

    print_summary(args.db)

    # Export targets are derived from the database file name by swapping its
    # suffix. A plain str.replace(".db", ...) would leave names without ".db"
    # unchanged, making the export overwrite the database itself.
    if args.export_csv:
        export_csv(args.db, _export_target(args.db, ".csv"), scan_id=stats["scan_id"])
    if args.export_json:
        export_json(args.db, _export_target(args.db, ".json"), scan_id=stats["scan_id"])
    if args.query:
        query_db(args.db, args.query)

    print(f" š” Run with --query photos, --list-scans, or open the GUI:\n"
          f" valscanner-gui\n")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# Category label for each key below; the keys look like the major part of a
# MIME type ("image", "video", ...) -- NOTE(review): confirm against the caller.
MIME_CATEGORY: dict[str, str] = dict(
    image="photo",
    video="video",
    audio="audio",
    text="document",
    application="application",
)
|
|
10
|
+
|
|
11
|
+
# Category label for each lower-case file extension (leading dot included).
# Written grouped by category; the resulting dict has one entry per extension,
# in the order the groups and extensions are listed here.
EXT_CATEGORY: dict[str, str] = {
    ext: category
    for category, extensions in (
        # Documents
        ("document", (".pdf", ".doc", ".docx", ".odt", ".rtf", ".txt",
                      ".md", ".rst", ".tex")),
        # Spreadsheets
        ("spreadsheet", (".xls", ".xlsx", ".ods", ".csv")),
        # Presentations
        ("presentation", (".ppt", ".pptx", ".odp")),
        # Photos / images
        ("photo", (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
                   ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef")),
        # SVG is the only extension mapped to "image" rather than "photo".
        ("image", (".svg",)),
        # Video
        ("video", (".mp4", ".mov", ".avi", ".mkv", ".wmv", ".flv", ".webm",
                   ".m4v")),
        # Audio
        ("audio", (".mp3", ".flac", ".wav", ".aac", ".ogg", ".m4a", ".wma")),
        # Code
        ("code", (".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
                  ".go", ".rs", ".rb", ".php", ".sh", ".bat", ".ps1")),
        # Data
        ("data", (".json", ".xml", ".yaml", ".yml", ".toml", ".ini", ".sql")),
        # Archives
        ("archive", (".zip", ".tar", ".gz", ".bz2", ".7z", ".rar", ".xz")),
        # Executables / installers
        ("executable", (".exe", ".msi", ".dmg", ".deb", ".rpm", ".appimage")),
        # Fonts
        ("font", (".ttf", ".otf", ".woff", ".woff2")),
        # Ebooks
        ("ebook", (".epub", ".mobi", ".azw")),
    )
    for ext in extensions
}
|
valscanner/core/db.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import sqlite3
|
|
3
|
+
from collections import Counter
|
|
4
|
+
|
|
5
|
+
from .schema import human_size
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def list_scans(db_path: str) -> list[dict]:
    """Return every row of the ``scans`` table as a list of dicts, ordered by id.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        One dict per scan with the keys ``id``, ``label``, ``root``,
        ``scanned_at``, ``file_count``, ``total_bytes`` and ``total_human``.
        An empty list is returned when the query fails with
        ``sqlite3.OperationalError`` (for example, the table does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        try:
            rows = conn.execute(
                "SELECT id, label, root, scanned_at, file_count, total_bytes, total_human "
                "FROM scans ORDER BY id"
            ).fetchall()
        except sqlite3.OperationalError:
            rows = []
        return [dict(r) for r in rows]
    finally:
        # Release the connection even when an unexpected error propagates.
        conn.close()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def delete_scan(db_path: str, scan_id: int) -> None:
    """Delete the row with id *scan_id* from the ``scans`` table.

    Foreign-key enforcement is switched on for the connection before the
    delete; presumably the schema relies on this to remove dependent rows
    (the schema is not visible here -- confirm). Deleting an id that does not
    exist is not an error.

    Args:
        db_path: Path of the SQLite database file.
        scan_id: Primary key of the scan to remove.

    Raises:
        sqlite3.Error: If the statement cannot be executed.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("DELETE FROM scans WHERE id = ?", (scan_id,))
        conn.commit()
    finally:
        # Always release the connection, including when the delete fails.
        conn.close()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def query_db(db_path: str, term: str) -> None:
    """Search the ``files`` table for *term* and print the matches.

    Two lookups are combined: a full-text ``MATCH`` against ``files_fts``
    (skipped silently when it raises ``sqlite3.OperationalError``) and a
    substring ``LIKE`` over ``path``, ``category`` and ``tags``. Each lookup
    is capped at 50 rows; results are de-duplicated by path, full-text hits
    first.

    Args:
        db_path: Path of the SQLite database file.
        term: Search text, used as the FTS query and as the LIKE substring.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        print(f"\nš Searching for: '{term}'\n{'ā'*60}")

        try:
            fts_rows = conn.execute(
                "SELECT path, category, size_human, tags FROM files "
                "WHERE id IN (SELECT rowid FROM files_fts WHERE files_fts MATCH ?) "
                "ORDER BY path LIMIT 50",
                (term,),
            ).fetchall()
        except sqlite3.OperationalError:
            # No FTS table, or *term* is not valid FTS syntax: rely on LIKE.
            fts_rows = []

        like = f"%{term}%"
        like_rows = conn.execute(
            "SELECT path, category, size_human, tags FROM files "
            "WHERE path LIKE ? OR category LIKE ? OR tags LIKE ? ORDER BY path LIMIT 50",
            (like, like, like),
        ).fetchall()

        seen: set[str] = set()
        total = 0
        for row in list(fts_rows) + list(like_rows):
            p = row["path"]
            if p in seen:
                continue
            seen.add(p)
            total += 1
            # NULL columns come back as None, which rejects a string format
            # spec; print them as empty text instead of crashing.
            category = row["category"] or ""
            size_human = row["size_human"] or ""
            tags = row["tags"] or ""
            print(f" {category:14s} {size_human:>10s} {p}")
            print(f" {'':14s} tags: {tags}\n")

        print(f"{'ā'*60}\n Found {total} result(s).")
    finally:
        # Release the connection even if a query or print fails.
        conn.close()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def print_summary(db_path: str) -> None:
    """Print an overview of the whole database to stdout.

    The report covers: the list of scans (only when there is more than one),
    total file count and size, files per category, the ten most common
    extensions, the ten most common tags, and the ten largest folders by
    summed ``total_bytes``.

    Args:
        db_path: Path of the SQLite database file.

    Raises:
        sqlite3.Error: If one of the queried tables is missing or unreadable.
    """
    conn = sqlite3.connect(db_path)
    try:
        total, = conn.execute("SELECT COUNT(*) FROM files").fetchone()
        total_size, = conn.execute("SELECT SUM(size_bytes) FROM files").fetchone()
        total_size = total_size or 0  # SUM() is NULL when the table is empty

        scans = conn.execute(
            "SELECT id, label, root, scanned_at, file_count, total_human FROM scans ORDER BY id"
        ).fetchall()

        print(f"\n{'ā'*60}")
        if len(scans) > 1:
            print(f" š¦ Scans in database: {len(scans)}")
            for sid, slabel, sroot, sat, sfc, sth in scans:
                print(f" [{sid}] {slabel or sroot} ā {sfc or 0:,} files {sth} ({sat})")
            print()

        print(f" š Total files indexed : {total:,}")
        print(f" š¾ Total size : {human_size(total_size)}")
        print("\n Files by category:")
        for cat, cnt, sz in conn.execute(
            "SELECT category, COUNT(*), SUM(size_bytes) FROM files "
            "GROUP BY category ORDER BY 2 DESC"
        ):
            # A NULL category arrives as None and would reject the "20s" spec.
            print(f" {cat or '':20s} {cnt:>6,} files {human_size(sz or 0):>10s}")

        print("\n Top 10 most common extensions:")
        for ext, cnt in conn.execute(
            "SELECT extension, COUNT(*) AS cnt FROM files "
            "GROUP BY extension ORDER BY 2 DESC LIMIT 10"
        ):
            print(f" {ext or '':12s} {cnt:>6,}")

        print("\n Top 10 tags:")
        all_tags = conn.execute("SELECT tags FROM files WHERE tags != ''").fetchall()
        tag_counter: Counter = Counter()
        for (tag_str,) in all_tags:
            for t in tag_str.split(", "):
                tag = t.strip()
                if tag:  # ignore empty fragments such as a trailing ", "
                    tag_counter[tag] += 1
        for tag, cnt in tag_counter.most_common(10):
            print(f" {tag:30s} {cnt:>6,}")

        print("\n Top 10 largest folders (cumulative):")
        for fpath_str, tb, fc in conn.execute(
            "SELECT path, SUM(total_bytes), SUM(file_count) FROM folders "
            "GROUP BY path ORDER BY 2 DESC LIMIT 10"
        ):
            # SUM() yields NULL when every summed value is NULL.
            print(f" {human_size(tb or 0):>10s} ({fc or 0:,} files) {fpath_str}")

        print(f"{'ā'*60}\n")
    finally:
        # Release the connection even when a query or format step fails.
        conn.close()
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import csv
|
|
3
|
+
import json
|
|
4
|
+
import sqlite3
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def export_csv(db_path: str, out_path: str, scan_id: int | None = None) -> None:
    """Write rows of the ``files`` table to a CSV file, ordered by path.

    Args:
        db_path: Path of the SQLite database file.
        out_path: Destination CSV path; written as UTF-8 with a header row.
        scan_id: When truthy, only rows with this ``scan_id`` are exported;
            ``None`` (or 0) exports every row.

    When there are no matching rows, a notice is printed and no file is
    written.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        q = "SELECT * FROM files" + (" WHERE scan_id=?" if scan_id else "") + " ORDER BY path"
        rows = conn.execute(q, (scan_id,) if scan_id else ()).fetchall()
    finally:
        # The rows are fully fetched, so the connection can go right away --
        # and it is released even when the query raises.
        conn.close()

    if not rows:
        print("No rows to export.")
        return

    with open(out_path, "w", newline="", encoding="utf-8") as f:
        # Column names are taken from the first row, i.e. the table's columns.
        w = csv.DictWriter(f, fieldnames=rows[0].keys())
        w.writeheader()
        w.writerows([dict(r) for r in rows])
    print(f"ā CSV exported ā {out_path}")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def export_json(db_path: str, out_path: str, scan_id: int | None = None) -> None:
    """Write rows of the ``files`` table to a JSON file, ordered by path.

    The output is a JSON array with one object per row (an empty array when
    nothing matches), indented by two spaces, with non-ASCII text kept as-is.

    Args:
        db_path: Path of the SQLite database file.
        out_path: Destination JSON path; written as UTF-8.
        scan_id: When truthy, only rows with this ``scan_id`` are exported;
            ``None`` (or 0) exports every row.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        q = "SELECT * FROM files" + (" WHERE scan_id=?" if scan_id else "") + " ORDER BY path"
        rows = [dict(r) for r in conn.execute(q, (scan_id,) if scan_id else ())]
    finally:
        # Release the connection even when the query raises.
        conn.close()

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)
    print(f"ā JSON exported ā {out_path}")
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import io
|
|
3
|
+
import hashlib
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from PIL import Image
|
|
10
|
+
from PIL.ExifTags import TAGS
|
|
11
|
+
PIL_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
PIL_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import mutagen
|
|
17
|
+
MUTAGEN_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
MUTAGEN_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
import PyPDF2
|
|
23
|
+
PYPDF_AVAILABLE = True
|
|
24
|
+
except ImportError:
|
|
25
|
+
PYPDF_AVAILABLE = False
|
|
26
|
+
|
|
27
|
+
FFMPEG_AVAILABLE = bool(shutil.which("ffmpeg"))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_image_metadata(path: Path) -> dict:
    """Read basic image properties and selected EXIF fields from *path*.

    Args:
        path: Image file to inspect.

    Returns:
        A dict with ``img_width``, ``img_height``, ``img_mode`` and
        ``img_format``; plus ``exif_datetime``, ``exif_camera_make``,
        ``exif_camera_model`` and ``has_gps`` when the corresponding EXIF
        tags are present. An empty dict is returned when Pillow is not
        available or the file cannot be read.
    """
    if not PIL_AVAILABLE:
        return {}
    try:
        with Image.open(path) as img:
            meta = {
                "img_width": img.width,
                "img_height": img.height,
                "img_mode": img.mode,
                "img_format": img.format,
            }
            # Not every image class provides _getexif(), hence the guard.
            exif_data = img._getexif() if hasattr(img, "_getexif") else None
            if exif_data:
                readable = {}
                has_gps = False
                for tag_id, value in exif_data.items():
                    tag = TAGS.get(tag_id, str(tag_id))
                    # The EXIF dict is keyed by numeric tag id, so GPS data
                    # must be detected via the resolved tag name; testing the
                    # string "GPSInfo" against the raw keys never matches.
                    if tag == "GPSInfo":
                        has_gps = True
                    # Keep only plain scalar values.
                    if isinstance(value, (str, int, float)):
                        readable[tag] = value
                if "DateTime" in readable:
                    meta["exif_datetime"] = readable["DateTime"]
                if "Make" in readable:
                    meta["exif_camera_make"] = readable["Make"]
                if "Model" in readable:
                    meta["exif_camera_model"] = readable["Model"]
                if has_gps:
                    meta["has_gps"] = True
            return meta
    except Exception:
        # Best effort: an unreadable or unsupported file yields no metadata.
        return {}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def extract_audio_metadata(path: Path) -> dict:
    """Collect common tags and stream properties of an audio file.

    Args:
        path: Audio file to inspect.

    Returns:
        A dict holding ``audio_<tag>`` entries for whichever of title,
        artist, album, date, genre and tracknumber are present (multiple
        values joined with ", "), plus ``audio_duration_sec`` and
        ``audio_bitrate`` when the stream info provides them. Empty when
        mutagen is unavailable, the file is not recognised, or reading fails.
    """
    if not MUTAGEN_AVAILABLE:
        return {}
    wanted = ("title", "artist", "album", "date", "genre", "tracknumber")
    try:
        tagged = mutagen.File(path, easy=True)
        if tagged is None:
            return {}
        result = {
            f"audio_{field}": ", ".join(tagged[field])
            for field in wanted
            if field in tagged
        }
        if hasattr(tagged, "info"):
            stream = tagged.info
            if hasattr(stream, "length"):
                result["audio_duration_sec"] = round(stream.length, 2)
            if hasattr(stream, "bitrate"):
                result["audio_bitrate"] = stream.bitrate
        return result
    except Exception:
        # Best effort: any failure while parsing yields no metadata.
        return {}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def extract_pdf_metadata(path: Path) -> dict:
    """Collect the page count and document-info fields of a PDF.

    Args:
        path: PDF file to inspect.

    Returns:
        A dict with ``pdf_pages`` and, for each non-empty info entry among
        /Title, /Author, /Subject, /Creator and /CreationDate, a
        ``pdf_<lowercased name>`` string. Empty when PyPDF2 is unavailable
        or the file cannot be parsed.
    """
    if not PYPDF_AVAILABLE:
        return {}
    info_keys = ("/Title", "/Author", "/Subject", "/Creator", "/CreationDate")
    try:
        with open(path, "rb") as handle:
            pdf = PyPDF2.PdfReader(handle)
            result = {"pdf_pages": len(pdf.pages)}
            doc_info = pdf.metadata
            if not doc_info:
                return result
            for field in info_keys:
                value = doc_info.get(field)
                if value:
                    # "/Title" -> "pdf_title"
                    result[f"pdf_{field.strip('/').lower()}"] = str(value)
            return result
    except Exception:
        # Best effort: any failure while parsing yields no metadata.
        return {}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def file_sha256(path: Path, block: int = 65536) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    Args:
        path: File to hash.
        block: Number of bytes requested per read.

    Returns:
        The lowercase hex digest, or an empty string if the file cannot be
        opened or read.
    """
    digest = hashlib.sha256()
    try:
        with open(path, "rb") as stream:
            # Read fixed-size pieces until read() signals end of file.
            for piece in iter(lambda: stream.read(block), b""):
                digest.update(piece)
    except Exception:
        return ""
    return digest.hexdigest()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _thumb_image(fpath: Path, max_size: int, quality: int) -> bytes | None:
    """Render a JPEG thumbnail of an image file.

    Args:
        fpath: Image file to thumbnail.
        max_size: Upper bound, in pixels, for both width and height.
        quality: JPEG quality setting passed to the encoder.

    Returns:
        The encoded JPEG bytes, or ``None`` when Pillow is unavailable or
        the image cannot be processed.
    """
    if not PIL_AVAILABLE:
        return None
    try:
        # The context manager closes the underlying file once the thumbnail
        # is produced; a bare Image.open() would leave the handle open.
        with Image.open(fpath) as source:
            img = source
            # Palette images with transparency go through RGBA first.
            if img.mode == "P" and "transparency" in img.info:
                img = img.convert("RGBA")
            img.thumbnail((max_size, max_size), Image.LANCZOS)
            buf = io.BytesIO()
            # JPEG output: convert to RGB before saving.
            img.convert("RGB").save(buf, format="JPEG", quality=quality, optimize=True)
            return buf.getvalue()
    except Exception:
        # Best effort: an unreadable image simply has no thumbnail.
        return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _thumb_video(fpath: Path, max_size: int, quality: int) -> bytes | None:
    """Grab a single video frame as a JPEG thumbnail using ffmpeg.

    The frame is taken at the one-second mark and scaled to fit within
    ``max_size`` x ``max_size`` while keeping the aspect ratio.

    Args:
        fpath: Video file to thumbnail.
        max_size: Upper bound, in pixels, for both width and height.
        quality: 0-100 style quality; mapped onto ffmpeg's ``-q:v`` scale as
            ``max(1, (100 - quality) // 10)``, so higher means better.

    Returns:
        The JPEG bytes written by ffmpeg, or ``None`` when ffmpeg is not
        available, fails, times out (30 s), or produces no output.
    """
    if not FFMPEG_AVAILABLE:
        return None
    try:
        result = subprocess.run(
            [
                "ffmpeg", "-y", "-i", str(fpath),
                "-ss", "00:00:01",
                "-vframes", "1",
                "-vf", f"scale={max_size}:{max_size}:force_original_aspect_ratio=decrease",
                "-q:v", str(max(1, (100 - quality) // 10)),
                "-f", "image2pipe", "-vcodec", "mjpeg", "pipe:1",
            ],
            # Detach stdin so ffmpeg cannot read from (or wait on) the
            # terminal or whatever stream the parent process inherited.
            stdin=subprocess.DEVNULL,
            capture_output=True, timeout=30,
        )
        return result.stdout if result.returncode == 0 and result.stdout else None
    except Exception:
        # Best effort: includes the timeout and a missing executable.
        return None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _sample_media(fpath: Path, category: str, duration: int) -> tuple[bytes, str] | None:
    """Transcode the first *duration* seconds of a media file into a small preview.

    Args:
        fpath: Source media file.
        category: ``"video"`` produces a 320-pixel-wide fragmented MP4
            (H.264 video, AAC audio); any other value produces a 32 kbit/s MP3.
        duration: Length of the preview in seconds.

    Returns:
        ``(data, fmt)`` where *fmt* is ``"mp4"`` or ``"mp3"``, or ``None``
        when ffmpeg is unavailable, fails, times out (60 s), or writes
        nothing.
    """
    if not FFMPEG_AVAILABLE:
        return None
    try:
        if category == "video":
            cmd = [
                "ffmpeg", "-y", "-i", str(fpath),
                "-t", str(duration),
                "-vf", "scale=320:-2",
                "-c:v", "libx264", "-crf", "35", "-preset", "ultrafast",
                "-c:a", "aac", "-b:a", "32k",
                # Fragmented MP4, used here because the output is a pipe.
                "-movflags", "frag_keyframe+empty_moov",
                "-f", "mp4", "pipe:1",
            ]
            fmt = "mp4"
        else:
            cmd = [
                "ffmpeg", "-y", "-i", str(fpath),
                "-t", str(duration),
                "-c:a", "libmp3lame", "-b:a", "32k",
                "-f", "mp3", "pipe:1",
            ]
            fmt = "mp3"
        # Detach stdin so ffmpeg cannot read from (or wait on) the terminal
        # or whatever stream the parent process inherited.
        result = subprocess.run(
            cmd, stdin=subprocess.DEVNULL, capture_output=True, timeout=60,
        )
        if result.returncode == 0 and result.stdout:
            return result.stdout, fmt
    except Exception:
        # Best effort: includes the timeout and a missing executable.
        pass
    return None
|