valscanner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
valscanner/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """ValScanner — recursive file scanner with metadata, tagging, and duplicate detection."""
2
+
3
+ __version__ = "0.1.0"
valscanner/cli.py ADDED
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+ """
4
+ ValScanner CLI — scan a directory and build a searchable file database.
5
+
6
+ Usage:
7
+ valscanner /path/to/scan [options]
8
+ valscanner --list-scans --db my.db
9
+ valscanner --delete-scan 3 --db my.db
10
+ """
11
+
12
+ import sys
13
+ import time
14
+ import argparse
15
+ from pathlib import Path
16
+
17
+ from .core.metadata import PIL_AVAILABLE, MUTAGEN_AVAILABLE, PYPDF_AVAILABLE
18
+ from .core.scanner import scan
19
+ from .core.export import export_csv, export_json
20
+ from .core.db import query_db, print_summary, list_scans, delete_scan
21
+
22
+
23
def _export_path(db_path: str, suffix: str) -> str:
    """Return the export file path derived from *db_path* using *suffix*.

    Only the final extension of the database file name is swapped for
    *suffix* (``file_index.db`` -> ``file_index.csv``).  If the swap would
    produce the database path itself, *suffix* is appended instead, so an
    export can never be written over the database.
    """
    source = Path(db_path)
    target = source.with_suffix(suffix)
    if target == source:
        target = source.with_name(source.name + suffix)
    return str(target)


def main() -> None:
    """Parse the command line and run the requested action.

    ``--list-scans`` and ``--delete-scan`` act on the database and exit with
    status 0 without scanning.  Otherwise the positional ``path`` is required:
    it is scanned into the database, a summary is printed, and the optional
    CSV/JSON exports and ``--query`` search are run afterwards.  Exits with
    status 1 when the path does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Scan a directory and build a searchable file database.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("path", nargs="?", help="Root directory to scan")
    parser.add_argument("--db", default="file_index.db",
                        help="SQLite database path (default: file_index.db)")
    parser.add_argument("--label", metavar="NAME", default="",
                        help="Human-readable label for this scan")
    parser.add_argument("--export-csv", action="store_true", help="Export results to CSV")
    parser.add_argument("--export-json", action="store_true", help="Export results to JSON")
    parser.add_argument("--no-hash", action="store_true", help="Skip SHA-256 hashing")
    parser.add_argument("--verbose", action="store_true", help="Print each file as indexed")
    parser.add_argument("--query", metavar="TERM", help="Query the database after scanning")
    parser.add_argument("--list-scans", action="store_true", help="List all scans in the database")
    parser.add_argument("--delete-scan", type=int, metavar="ID", help="Delete a scan by ID")
    args = parser.parse_args()

    if args.list_scans:
        scans = list_scans(args.db)
        if not scans:
            print("No scans in database.")
        for s in scans:
            print(f" [{s['id']:3d}] {s['label'] or s['root']:40s} "
                  f"{s['file_count']:>8,} files {s['total_human']:>10s} {s['scanned_at']}")
        sys.exit(0)

    # "is not None" so that an explicit scan id of 0 is still honoured.
    if args.delete_scan is not None:
        delete_scan(args.db, args.delete_scan)
        print(f"āœ“ Scan {args.delete_scan} deleted.")
        sys.exit(0)

    if not args.path:
        parser.error("path is required unless using --list-scans or --delete-scan")

    root = Path(args.path).expanduser().resolve()
    if not root.exists():
        print(f"Error: path does not exist: {root}")
        sys.exit(1)

    print(f"\nšŸ”Ž Scanning: {root}")
    print(f" Database: {args.db}")
    if args.label:
        print(f" Label: {args.label}")
    # Tell the user up front which optional metadata extractors are inactive.
    if not PIL_AVAILABLE:
        print(" ⚠ Pillow not installed — image EXIF metadata skipped")
    if not MUTAGEN_AVAILABLE:
        print(" ⚠ mutagen not installed — audio metadata skipped")
    if not PYPDF_AVAILABLE:
        print(" ⚠ PyPDF2 not installed — PDF metadata skipped")
    print()

    t0 = time.time()
    stats = scan(root, args.db, compute_hash=not args.no_hash,
                 verbose=args.verbose, label=args.label)
    elapsed = time.time() - t0

    print(f"\nāœ… Done in {elapsed:.1f}s — "
          f"scan #{stats['scan_id']}, "
          f"{stats['scanned']:,} indexed, "
          f"{stats['errors']:,} errors, "
          f"{stats['skipped']:,} skipped")

    print_summary(args.db)

    # Export names are derived from the database file name.  A plain
    # str.replace(".db", ...) would return the database path unchanged for a
    # name without ".db" (and the export would then overwrite the database).
    if args.export_csv:
        export_csv(args.db, _export_path(args.db, ".csv"), scan_id=stats["scan_id"])
    if args.export_json:
        export_json(args.db, _export_path(args.db, ".json"), scan_id=stats["scan_id"])
    if args.query:
        query_db(args.db, args.query)

    print(f" šŸ’” Run with --query photos, --list-scans, or open the GUI:\n"
          f" valscanner-gui\n")


if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
# Category assigned per key; the keys look like MIME major types
# ("image", "text", ...) — NOTE(review): confirm against the code that
# performs the lookup, which is not part of this module.
MIME_CATEGORY: dict[str, str] = dict(
    image="photo",
    video="video",
    audio="audio",
    text="document",
    application="application",
)
10
+
11
# File extension (lower-case, with leading dot) -> category name.
# Built from (category, extensions) groups; the groups are listed in the
# order the entries should appear in the resulting dict.
EXT_CATEGORY: dict[str, str] = {
    extension: category
    for category, extensions in (
        # Documents
        ("document", (".pdf", ".doc", ".docx", ".odt", ".rtf", ".txt",
                      ".md", ".rst", ".tex")),
        # Spreadsheets
        ("spreadsheet", (".xls", ".xlsx", ".ods", ".csv")),
        # Presentations
        ("presentation", (".ppt", ".pptx", ".odp")),
        # Photos / images
        ("photo", (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif",
                   ".webp", ".heic", ".heif", ".raw", ".cr2", ".nef")),
        ("image", (".svg",)),
        # Video
        ("video", (".mp4", ".mov", ".avi", ".mkv", ".wmv", ".flv", ".webm",
                   ".m4v")),
        # Audio
        ("audio", (".mp3", ".flac", ".wav", ".aac", ".ogg", ".m4a", ".wma")),
        # Code
        ("code", (".py", ".js", ".ts", ".java", ".c", ".cpp", ".h", ".cs",
                  ".go", ".rs", ".rb", ".php", ".sh", ".bat", ".ps1")),
        # Data
        ("data", (".json", ".xml", ".yaml", ".yml", ".toml", ".ini", ".sql")),
        # Archives
        ("archive", (".zip", ".tar", ".gz", ".bz2", ".7z", ".rar", ".xz")),
        # Executables / installers
        ("executable", (".exe", ".msi", ".dmg", ".deb", ".rpm", ".appimage")),
        # Fonts
        ("font", (".ttf", ".otf", ".woff", ".woff2")),
        # Ebooks
        ("ebook", (".epub", ".mobi", ".azw")),
    )
    for extension in extensions
}
valscanner/core/db.py ADDED
@@ -0,0 +1,117 @@
1
+ from __future__ import annotations
2
+ import sqlite3
3
+ from collections import Counter
4
+
5
+ from .schema import human_size
6
+
7
+
8
def list_scans(db_path: str) -> list[dict]:
    """Return every row of the ``scans`` table as a dict, ordered by id.

    Each dict has the keys ``id``, ``label``, ``root``, ``scanned_at``,
    ``file_count``, ``total_bytes`` and ``total_human``.  An empty list is
    returned when the query fails with ``sqlite3.OperationalError`` (for
    example because the table does not exist yet).  Any other error
    propagates, but the connection is always closed.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        try:
            rows = conn.execute(
                "SELECT id, label, root, scanned_at, file_count, total_bytes, total_human "
                "FROM scans ORDER BY id"
            ).fetchall()
        except sqlite3.OperationalError:
            rows = []
        return [dict(r) for r in rows]
    finally:
        # Close even when an unexpected error (e.g. a corrupt file) is raised.
        conn.close()
20
+
21
+
22
def delete_scan(db_path: str, scan_id: int) -> None:
    """Delete the row with id *scan_id* from the ``scans`` table.

    Foreign-key enforcement is switched on for the connection before the
    delete, so any ``ON DELETE`` actions declared by the schema take effect
    (the schema itself is defined elsewhere).  Deleting an id that does not
    exist is a no-op.  The connection is closed even if the delete or the
    commit raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        # SQLite leaves foreign keys off per connection unless asked.
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("DELETE FROM scans WHERE id = ?", (scan_id,))
        conn.commit()
    finally:
        conn.close()
28
+
29
+
30
def query_db(db_path: str, term: str) -> None:
    """Search the ``files`` table for *term* and print the matches.

    Two lookups are combined: a full-text ``MATCH`` against ``files_fts``
    (skipped silently when it raises ``sqlite3.OperationalError``, e.g. a
    missing FTS table or a term that is not valid MATCH syntax) and a
    substring ``LIKE`` over ``path``, ``category`` and ``tags``.  Each lookup
    is limited to 50 rows; results are de-duplicated by path, with full-text
    hits listed first.  The connection is closed even when a query fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        print(f"\nšŸ” Searching for: '{term}'\n{'─'*60}")

        try:
            fts_rows = conn.execute(
                "SELECT path, category, size_human, tags FROM files "
                "WHERE id IN (SELECT rowid FROM files_fts WHERE files_fts MATCH ?) "
                "ORDER BY path LIMIT 50",
                (term,),
            ).fetchall()
        except sqlite3.OperationalError:
            fts_rows = []

        like = f"%{term}%"
        like_rows = conn.execute(
            "SELECT path, category, size_human, tags FROM files "
            "WHERE path LIKE ? OR category LIKE ? OR tags LIKE ? ORDER BY path LIMIT 50",
            (like, like, like),
        ).fetchall()

        seen: set[str] = set()
        total = 0
        for row in fts_rows + like_rows:
            p = row["path"]
            if p in seen:
                continue
            seen.add(p)
            total += 1
            # NULL columns come back as None, which the "s" format spec
            # rejects; show them as empty text instead of crashing.
            category = row["category"] or ""
            size_human = row["size_human"] or ""
            print(f" {category:14s} {size_human:>10s} {p}")
            print(f" {'':14s} tags: {row['tags']}\n")

        print(f"{'─'*60}\n Found {total} result(s).")
    finally:
        conn.close()
65
+
66
+
67
def print_summary(db_path: str) -> None:
    """Print an overview of the whole database to stdout.

    Sections, in order: the list of scans (only when there is more than
    one), total file count and size, files per category, the ten most common
    extensions, the ten most common tags and the ten largest folders.  Sizes
    are rendered with ``human_size`` from the sibling ``schema`` module.
    The connection is closed even when a query or a print fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        total, total_size = conn.execute(
            "SELECT COUNT(*), SUM(size_bytes) FROM files"
        ).fetchone()
        total_size = total_size or 0  # SUM() yields NULL for an empty table

        scans = conn.execute(
            "SELECT id, label, root, scanned_at, file_count, total_human FROM scans ORDER BY id"
        ).fetchall()

        print(f"\n{'═'*60}")
        if len(scans) > 1:
            print(f" šŸ“¦ Scans in database: {len(scans)}")
            for sid, slabel, sroot, sat, sfc, sth in scans:
                print(f" [{sid}] {slabel or sroot} — {sfc:,} files {sth} ({sat})")
            print()

        print(f" šŸ“ Total files indexed : {total:,}")
        print(f" šŸ’¾ Total size : {human_size(total_size)}")
        print("\n Files by category:")
        for cat, cnt, sz in conn.execute(
            "SELECT category, COUNT(*), SUM(size_bytes) FROM files "
            "GROUP BY category ORDER BY 2 DESC"
        ):
            print(f" {cat:20s} {cnt:>6,} files {human_size(sz or 0):>10s}")

        print("\n Top 10 most common extensions:")
        for ext, cnt in conn.execute(
            "SELECT extension, COUNT(*) AS cnt FROM files "
            "GROUP BY extension ORDER BY 2 DESC LIMIT 10"
        ):
            print(f" {ext:12s} {cnt:>6,}")

        print("\n Top 10 tags:")
        tag_counter: Counter = Counter()
        for (tag_str,) in conn.execute("SELECT tags FROM files WHERE tags != ''"):
            # Tags are stored as one ", "-separated string; ignore fragments
            # that are empty after stripping so they are not counted as a tag.
            tag_counter.update(
                tag for tag in (part.strip() for part in tag_str.split(", ")) if tag
            )
        for tag, cnt in tag_counter.most_common(10):
            print(f" {tag:30s} {cnt:>6,}")

        print("\n Top 10 largest folders (cumulative):")
        for fpath_str, tb, fc in conn.execute(
            "SELECT path, SUM(total_bytes), SUM(file_count) FROM folders "
            "GROUP BY path ORDER BY 2 DESC LIMIT 10"
        ):
            # Same NULL guard as the other SUM() columns above.
            print(f" {human_size(tb or 0):>10s} ({fc or 0:,} files) {fpath_str}")

        print(f"{'═'*60}\n")
    finally:
        conn.close()
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+ import csv
3
+ import json
4
+ import sqlite3
5
+
6
+
7
def export_csv(db_path: str, out_path: str, scan_id: int | None = None) -> None:
    """Write rows of the ``files`` table to *out_path* as UTF-8 CSV.

    Args:
        db_path: SQLite database to read.
        out_path: Destination file; overwritten if it exists.
        scan_id: Export only rows with this ``scan_id``; ``None`` exports
            every row.  ``0`` is treated as a real id, not as "no filter".

    Rows are ordered by ``path`` and the header is taken from the table's
    column names.  When nothing matches, a notice is printed and no file is
    written.  The database connection is closed before the file is written
    and also when the query fails.
    """
    query = "SELECT * FROM files"
    params: tuple = ()
    if scan_id is not None:
        query += " WHERE scan_id=?"
        params = (scan_id,)
    query += " ORDER BY path"

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(query, params).fetchall()
    finally:
        conn.close()

    if not rows:
        print("No rows to export.")
        return
    # newline="" lets the csv module control line endings itself.
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=rows[0].keys())
        w.writeheader()
        w.writerows(dict(r) for r in rows)
    print(f"āœ“ CSV exported → {out_path}")
22
+
23
+
24
def export_json(db_path: str, out_path: str, scan_id: int | None = None) -> None:
    """Write rows of the ``files`` table to *out_path* as a JSON array.

    Args:
        db_path: SQLite database to read.
        out_path: Destination file; overwritten if it exists.
        scan_id: Export only rows with this ``scan_id``; ``None`` exports
            every row.  ``0`` is treated as a real id, not as "no filter".

    Each row becomes one object keyed by column name, ordered by ``path``.
    An empty result is written as ``[]``.  Output is UTF-8, indented by two
    spaces, with non-ASCII characters left unescaped.  The database
    connection is closed even when the query fails.
    """
    query = "SELECT * FROM files"
    params: tuple = ()
    if scan_id is not None:
        query += " WHERE scan_id=?"
        params = (scan_id,)
    query += " ORDER BY path"

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = [dict(r) for r in conn.execute(query, params)]
    finally:
        conn.close()

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, indent=2, ensure_ascii=False)
    print(f"āœ“ JSON exported → {out_path}")
@@ -0,0 +1,175 @@
1
+ from __future__ import annotations
2
+ import io
3
+ import hashlib
4
+ import shutil
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ try:
9
+ from PIL import Image
10
+ from PIL.ExifTags import TAGS
11
+ PIL_AVAILABLE = True
12
+ except ImportError:
13
+ PIL_AVAILABLE = False
14
+
15
+ try:
16
+ import mutagen
17
+ MUTAGEN_AVAILABLE = True
18
+ except ImportError:
19
+ MUTAGEN_AVAILABLE = False
20
+
21
+ try:
22
+ import PyPDF2
23
+ PYPDF_AVAILABLE = True
24
+ except ImportError:
25
+ PYPDF_AVAILABLE = False
26
+
27
+ FFMPEG_AVAILABLE = bool(shutil.which("ffmpeg"))
28
+
29
+
30
def extract_image_metadata(path: Path) -> dict:
    """Return basic image properties and selected EXIF fields for *path*.

    The result always contains ``img_width``, ``img_height``, ``img_mode``
    and ``img_format`` when the image can be opened.  When EXIF data is
    present it may also contain ``exif_datetime``, ``exif_camera_make`` and
    ``exif_camera_model`` (only if the value is a str, int or float) and
    ``has_gps`` (``True`` when a GPSInfo tag exists).

    Best-effort: returns ``{}`` when Pillow is not installed or when anything
    goes wrong while reading the file.
    """
    if not PIL_AVAILABLE:
        return {}
    try:
        with Image.open(path) as img:
            meta = {
                "img_width": img.width,
                "img_height": img.height,
                "img_mode": img.mode,
                "img_format": img.format,
            }
            # Not every image class provides _getexif().
            exif_data = img._getexif() if hasattr(img, "_getexif") else None
            if exif_data:
                # exif_data is keyed by numeric tag id; translate the ids to
                # tag names first.  GPS presence has to be tested on the
                # translated names — looking up the string "GPSInfo" in the
                # id-keyed dict can never match.
                named = {
                    TAGS.get(tag_id, str(tag_id)): value
                    for tag_id, value in exif_data.items()
                }
                for tag_name, field in (
                    ("DateTime", "exif_datetime"),
                    ("Make", "exif_camera_make"),
                    ("Model", "exif_camera_model"),
                ):
                    value = named.get(tag_name)
                    # Keep scalar values only.
                    if isinstance(value, (str, int, float)):
                        meta[field] = value
                if "GPSInfo" in named:
                    meta["has_gps"] = True
            return meta
    except Exception:
        # Metadata is optional; unreadable or non-image files yield nothing.
        return {}
59
+
60
+
61
def extract_audio_metadata(path: Path) -> dict:
    """Return tag and stream information for the audio file at *path*.

    Tags ``title``, ``artist``, ``album``, ``date``, ``genre`` and
    ``tracknumber`` are copied to ``audio_<tag>`` keys, with multiple values
    joined by ``", "``.  ``audio_duration_sec`` (rounded to two decimals) and
    ``audio_bitrate`` are added when the stream info exposes them.

    Best-effort: returns ``{}`` when mutagen is not installed, when mutagen
    does not recognise the file, or when reading fails.
    """
    if not MUTAGEN_AVAILABLE:
        return {}
    wanted_tags = ("title", "artist", "album", "date", "genre", "tracknumber")
    try:
        audio = mutagen.File(path, easy=True)
        if audio is None:
            return {}
        meta = {
            f"audio_{name}": ", ".join(audio[name])
            for name in wanted_tags
            if name in audio
        }
        if hasattr(audio, "info"):
            stream = audio.info
            if hasattr(stream, "length"):
                meta["audio_duration_sec"] = round(stream.length, 2)
            if hasattr(stream, "bitrate"):
                meta["audio_bitrate"] = stream.bitrate
        return meta
    except Exception:
        return {}
81
+
82
+
83
def extract_pdf_metadata(path: Path) -> dict:
    """Return the page count and document-info fields of the PDF at *path*.

    ``pdf_pages`` is always present on success.  ``pdf_title``,
    ``pdf_author``, ``pdf_subject``, ``pdf_creator`` and ``pdf_creationdate``
    are added (as strings) for each corresponding info entry that is set and
    non-empty.

    Best-effort: returns ``{}`` when PyPDF2 is not installed or when the file
    cannot be read or parsed.
    """
    if not PYPDF_AVAILABLE:
        return {}
    # Info-dictionary key -> name used in the returned dict.
    info_fields = (
        ("/Title", "pdf_title"),
        ("/Author", "pdf_author"),
        ("/Subject", "pdf_subject"),
        ("/Creator", "pdf_creator"),
        ("/CreationDate", "pdf_creationdate"),
    )
    try:
        with open(path, "rb") as stream:
            reader = PyPDF2.PdfReader(stream)
            meta = {"pdf_pages": len(reader.pages)}
            doc_info = reader.metadata
            if doc_info:
                for pdf_key, field in info_fields:
                    value = doc_info.get(pdf_key)
                    if value:
                        meta[field] = str(value)
            return meta
    except Exception:
        return {}
99
+
100
+
101
def file_sha256(path: Path, block: int = 65536) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is read in chunks of *block* bytes so large files are never
    loaded into memory at once.  Best-effort: an empty string is returned
    when the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    try:
        with open(path, "rb") as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for piece in iter(lambda: stream.read(block), b""):
                digest.update(piece)
    except Exception:
        return ""
    return digest.hexdigest()
110
+
111
+
112
def _thumb_image(fpath: Path, max_size: int, quality: int) -> bytes | None:
    """Return a JPEG thumbnail of the image at *fpath*, or ``None``.

    Args:
        fpath: Image file to read.
        max_size: Maximum width and height of the thumbnail in pixels; the
            aspect ratio is kept.
        quality: JPEG quality passed to Pillow's encoder.

    Best-effort: ``None`` is returned when Pillow is not installed or when
    the image cannot be opened, resized or encoded.
    """
    if not PIL_AVAILABLE:
        return None
    try:
        # Image.open() keeps the file open until the image is closed; the
        # context manager releases the handle even if resizing fails.
        with Image.open(fpath) as source:
            img = source
            # Palette images with a transparency entry are expanded to RGBA
            # before resizing.
            if img.mode == "P" and "transparency" in img.info:
                img = img.convert("RGBA")
            img.thumbnail((max_size, max_size), Image.LANCZOS)
            buf = io.BytesIO()
            # JPEG has no alpha channel, so always encode from RGB.
            img.convert("RGB").save(buf, format="JPEG", quality=quality, optimize=True)
            return buf.getvalue()
    except Exception:
        return None
125
+
126
+
127
def _thumb_video(fpath: Path, max_size: int, quality: int) -> bytes | None:
    """Return a JPEG still taken from the video at *fpath*, or ``None``.

    Runs ``ffmpeg`` to grab a single frame at the one-second mark, scaled to
    fit inside *max_size* x *max_size*, and reads the MJPEG output from the
    process's stdout.  *quality* (0-100 style) is converted to ffmpeg's
    ``-q:v`` value as ``max(1, (100 - quality) // 10)``.

    Best-effort: ``None`` is returned when ffmpeg is unavailable, exits with
    a non-zero status, produces no output, or does not finish in 30 seconds.
    """
    if not FFMPEG_AVAILABLE:
        return None
    try:
        scale_filter = f"scale={max_size}:{max_size}:force_original_aspect_ratio=decrease"
        q_value = max(1, (100 - quality) // 10)
        command = [
            "ffmpeg", "-y", "-i", str(fpath),
            "-ss", "00:00:01",
            "-vframes", "1",
            "-vf", scale_filter,
            "-q:v", str(q_value),
            "-f", "image2pipe", "-vcodec", "mjpeg", "pipe:1",
        ]
        proc = subprocess.run(command, capture_output=True, timeout=30)
        if proc.returncode == 0 and proc.stdout:
            return proc.stdout
        return None
    except Exception:
        return None
145
+
146
+
147
def _sample_media(fpath: Path, category: str, duration: int) -> tuple[bytes, str] | None:
    """Return a short low-bitrate preview of a media file, or ``None``.

    Args:
        fpath: Source media file.
        category: ``"video"`` produces a 320-pixel-wide fragmented MP4
            (H.264 + AAC); any other value produces an MP3.
        duration: Length of the preview in seconds, taken from the start.

    Returns:
        ``(data, fmt)`` where *fmt* is ``"mp4"`` or ``"mp3"``, or ``None``
        when ffmpeg is unavailable, fails, produces no output, or does not
        finish in 60 seconds.
    """
    if not FFMPEG_AVAILABLE:
        return None
    try:
        source_args = ["ffmpeg", "-y", "-i", str(fpath), "-t", str(duration)]
        if category == "video":
            fmt = "mp4"
            codec_args = [
                "-vf", "scale=320:-2",
                "-c:v", "libx264", "-crf", "35", "-preset", "ultrafast",
                "-c:a", "aac", "-b:a", "32k",
                # Fragmented MP4 so the container can be written to a pipe.
                "-movflags", "frag_keyframe+empty_moov",
            ]
        else:
            fmt = "mp3"
            codec_args = ["-c:a", "libmp3lame", "-b:a", "32k"]
        command = source_args + codec_args + ["-f", fmt, "pipe:1"]
        proc = subprocess.run(command, capture_output=True, timeout=60)
        if proc.returncode == 0 and proc.stdout:
            return proc.stdout, fmt
    except Exception:
        pass
    return None