dupefinder 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. dupefinder-0.4.0/LICENSE +21 -0
  2. dupefinder-0.4.0/PKG-INFO +367 -0
  3. dupefinder-0.4.0/README.md +310 -0
  4. dupefinder-0.4.0/pyproject.toml +79 -0
  5. dupefinder-0.4.0/setup.cfg +4 -0
  6. dupefinder-0.4.0/src/dupefinder/__init__.py +21 -0
  7. dupefinder-0.4.0/src/dupefinder/__main__.py +6 -0
  8. dupefinder-0.4.0/src/dupefinder/api.py +23 -0
  9. dupefinder-0.4.0/src/dupefinder/cache.py +101 -0
  10. dupefinder-0.4.0/src/dupefinder/cli.py +192 -0
  11. dupefinder-0.4.0/src/dupefinder/constants.py +40 -0
  12. dupefinder-0.4.0/src/dupefinder/engine.py +267 -0
  13. dupefinder-0.4.0/src/dupefinder/errors.py +29 -0
  14. dupefinder-0.4.0/src/dupefinder/events.py +51 -0
  15. dupefinder-0.4.0/src/dupefinder/filters.py +42 -0
  16. dupefinder-0.4.0/src/dupefinder/grouping.py +64 -0
  17. dupefinder-0.4.0/src/dupefinder/hashing.py +147 -0
  18. dupefinder-0.4.0/src/dupefinder/models.py +125 -0
  19. dupefinder-0.4.0/src/dupefinder/py.typed +0 -0
  20. dupefinder-0.4.0/src/dupefinder/report.py +100 -0
  21. dupefinder-0.4.0/src/dupefinder/safety.py +80 -0
  22. dupefinder-0.4.0/src/dupefinder/scanner.py +114 -0
  23. dupefinder-0.4.0/src/dupefinder.egg-info/PKG-INFO +367 -0
  24. dupefinder-0.4.0/src/dupefinder.egg-info/SOURCES.txt +40 -0
  25. dupefinder-0.4.0/src/dupefinder.egg-info/dependency_links.txt +1 -0
  26. dupefinder-0.4.0/src/dupefinder.egg-info/entry_points.txt +2 -0
  27. dupefinder-0.4.0/src/dupefinder.egg-info/requires.txt +9 -0
  28. dupefinder-0.4.0/src/dupefinder.egg-info/top_level.txt +1 -0
  29. dupefinder-0.4.0/tests/test_cache.py +183 -0
  30. dupefinder-0.4.0/tests/test_cli.py +257 -0
  31. dupefinder-0.4.0/tests/test_engine.py +761 -0
  32. dupefinder-0.4.0/tests/test_filters.py +74 -0
  33. dupefinder-0.4.0/tests/test_grouping.py +87 -0
  34. dupefinder-0.4.0/tests/test_hardening.py +365 -0
  35. dupefinder-0.4.0/tests/test_hashing.py +164 -0
  36. dupefinder-0.4.0/tests/test_models.py +145 -0
  37. dupefinder-0.4.0/tests/test_packaging.py +152 -0
  38. dupefinder-0.4.0/tests/test_report.py +99 -0
  39. dupefinder-0.4.0/tests/test_safety.py +106 -0
  40. dupefinder-0.4.0/tests/test_scanner.py +91 -0
  41. dupefinder-0.4.0/tests/test_schema.py +56 -0
  42. dupefinder-0.4.0/tests/test_workflows.py +119 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Igor Souza
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,367 @@
1
+ Metadata-Version: 2.4
2
+ Name: dupefinder
3
+ Version: 0.4.0
4
+ Summary: A simple, safe, zero-dependency duplicate file finder for Python.
5
+ Author-email: Igor Souza <igor.souza.92@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Igor Souza
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/igors93/dupefinder
29
+ Project-URL: Repository, https://github.com/igors93/dupefinder
30
+ Project-URL: Issues, https://github.com/igors93/dupefinder/issues
31
+ Project-URL: Changelog, https://github.com/igors93/dupefinder/blob/main/CHANGELOG.md
32
+ Keywords: duplicates,files,hash,finder,zero-dependency,cli
33
+ Classifier: Development Status :: 4 - Beta
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Operating System :: OS Independent
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Topic :: Utilities
43
+ Classifier: Topic :: System :: Filesystems
44
+ Classifier: Typing :: Typed
45
+ Requires-Python: >=3.10
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Provides-Extra: dev
49
+ Requires-Dist: pytest>=8.0; extra == "dev"
50
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
51
+ Requires-Dist: ruff>=0.11; extra == "dev"
52
+ Requires-Dist: pyright>=1.1; extra == "dev"
53
+ Requires-Dist: build>=1.2; extra == "dev"
54
+ Requires-Dist: twine>=6.0; extra == "dev"
55
+ Requires-Dist: pyyaml>=6.0; extra == "dev"
56
+ Dynamic: license-file
57
+
58
+ # dupefinder
59
+
60
+ <p align="center">
61
+ <a href="https://pypi.org/project/dupefinder/"><img src="https://img.shields.io/pypi/v/dupefinder.svg" alt="PyPI version"></a>
62
+ <a href="https://pypi.org/project/dupefinder/"><img src="https://img.shields.io/pypi/pyversions/dupefinder.svg" alt="Python versions"></a>
63
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"></a>
64
+ <a href="https://github.com/igors93/dupefinder/actions/workflows/ci.yml"><img src="https://github.com/igors93/dupefinder/actions/workflows/ci.yml/badge.svg" alt="Tests"></a>
65
+ </p>
66
+
67
+ **dupefinder** is a small, zero-dependency Python library and CLI tool for finding duplicate files using content hashes.
68
+
69
+ **Requires Python 3.10 or later.** Detects exact duplicates (identical byte content) only.
70
+
71
+ ## Features
72
+
73
+ - **Simple**: one function for common use, a full report for advanced use.
74
+ - **Safe by default**: read-only. Never deletes, moves, or modifies files.
75
+ - **Zero dependency**: uses only the Python standard library.
76
+ - **Modular**: each responsibility lives in its own module.
77
+ - **Memory-friendly**: files are hashed in configurable chunks, not loaded fully into RAM.
78
+ - **Fast**: groups by file size before hashing — only candidates are hashed.
79
+ - **Typed**: ships with inline type annotations.
80
+ - **Observable**: typed event system for progress callbacks and integrations.
81
+ - **Cancellable**: abort scans via a callback or timeout.
82
+ - **Cached**: optional SQLite hash cache for repeated scans.
83
+
84
+ ## Installation
85
+
86
+ ```bash
87
+ pip install dupefinder
88
+ ```
89
+
90
+ For development:
91
+
92
+ ```bash
93
+ git clone https://github.com/igors93/dupefinder.git
94
+ cd dupefinder
95
+ pip install -e ".[dev]"
96
+ ```
97
+
98
+ ## Quick start
99
+
100
+ ### As a library
101
+
102
+ ```python
103
+ from dupefinder import find_duplicates
104
+
105
+ groups = find_duplicates("./Downloads")
106
+
107
+ for group in groups:
108
+     print(f"{group.count} duplicate files — {group.size} bytes each")
109
+     for path in group.files:
110
+         print(f" {path}")
111
+ ```
112
+
113
+ ### Full report
114
+
115
+ ```python
116
+ from dupefinder import scan
117
+ from dupefinder.models import ScanOptions
118
+ from dupefinder.report import format_report
119
+
120
+ report = scan(
121
+ "./Downloads",
122
+ options=ScanOptions(
123
+ min_size=1024, # ignore files smaller than 1 KB
124
+ ignore_hidden=True, # skip dotfiles and dotfolders
125
+ follow_symlinks=False, # safe default
126
+ ),
127
+ )
128
+
129
+ print(format_report(report))
130
+ print(f"Wasted space: {report.total_wasted_space} bytes")
131
+ ```
132
+
133
+ ### JSON output
134
+
135
+ ```python
136
+ from dupefinder import scan
137
+ from dupefinder.report import report_to_json
138
+
139
+ report = scan("./Downloads")
140
+ print(report_to_json(report))
141
+ ```
142
+
143
+ ### DupeFinder with events
144
+
145
+ ```python
146
+ from dupefinder import DupeFinder, ScanOptions
147
+
148
+ def on_event(event):
149
+     if event.type == "file_discovered":
150
+         print(f"\rFound {event.scanned_files} files...", end="", flush=True)
151
+     elif event.type == "issue":
152
+         print(f"\nWarning: {event.message}")
153
+     elif event.type == "scan_completed":
154
+         print(f"\nDone in {event.elapsed_seconds:.2f}s")
155
+
156
+ finder = DupeFinder(
157
+ options=ScanOptions(min_size=1024),
158
+ on_event=on_event,
159
+ )
160
+ report = finder.scan("./Downloads")
161
+ ```
162
+
163
+ ### Progress callback
164
+
165
+ ```python
166
+ from dupefinder import DupeFinder, ScanOptions
167
+
168
+ def on_progress(progress):
169
+     print(f"[{progress.phase}] {progress.scanned_files} files scanned, "
170
+           f"{progress.hashed_files}/{progress.total_candidates} hashed")
171
+
172
+ finder = DupeFinder(
173
+ options=ScanOptions(min_size=1024),
174
+ on_progress=on_progress,
175
+ )
176
+ report = finder.scan("./Downloads")
177
+ print(f"Total bytes read: {report.total_bytes_read:,}")
178
+ ```
179
+
180
+ ### Cancellation
181
+
182
+ ```python
183
+ import threading
184
+ from dupefinder import DupeFinder
185
+
186
+ cancel_flag = threading.Event()
187
+ threading.Timer(5.0, cancel_flag.set).start() # cancel after 5 seconds
188
+
189
+ finder = DupeFinder(should_cancel=cancel_flag.is_set)
190
+ report = finder.scan("./Downloads")
191
+
192
+ if report.cancelled:
193
+     print(f"Cancelled after {report.elapsed_seconds:.2f}s — partial results")
194
+ ```
195
+
196
+ ### SQLite cache
197
+
198
+ ```python
199
+ from dupefinder import DupeFinder
200
+ from dupefinder.cache import SQLiteHashCache
201
+
202
+ with SQLiteHashCache(".dupefinder-cache.sqlite") as cache:
203
+     finder = DupeFinder(cache=cache)
204
+     report = finder.scan("./media")  # second run will be much faster
205
+ ```
206
+
207
+ ### Scan limits
208
+
209
+ ```python
210
+ from dupefinder import DupeFinder, ScanOptions
211
+
212
+ finder = DupeFinder(options=ScanOptions(
213
+ max_files=1000, # stop after 1000 files
214
+ max_depth=3, # scan at most 3 levels deep
215
+ timeout_seconds=30.0, # stop after 30 seconds
216
+ ))
217
+ report = finder.scan("./data")
218
+ ```
219
+
220
+ ### CLI
221
+
222
+ ```bash
223
+ # Basic scan
224
+ dupefinder ./Downloads
225
+
226
+ # JSON output
227
+ dupefinder ./Downloads --json
228
+
229
+ # Ignore files smaller than 1 MB
230
+ dupefinder ./Downloads --min-size 1MB
231
+
232
+ # Only scan images
233
+ dupefinder ./Pictures --include-ext .jpg,.jpeg,.png,.webp
234
+
235
+ # Ignore temp and log files
236
+ dupefinder . --ignore-ext .tmp,.log
237
+
238
+ # Exit with code 2 if any duplicates are found (useful in scripts/CI)
239
+ dupefinder . --fail-on-duplicates
240
+
241
+ # Strict mode: raise errors instead of skipping inaccessible files
242
+ dupefinder . --strict
243
+
244
+ # Follow symbolic links (disabled by default)
245
+ dupefinder . --follow-symlinks
246
+ ```
247
+
248
+ Run `dupefinder --help` to see all options.
249
+
250
+ ### Exit codes
251
+
252
+ | Code | Meaning |
253
+ |------|---------|
254
+ | `0` | Scan completed. Non-fatal issues may still be present in the report. |
255
+ | `1` | Scan failed because of an invalid option, invalid path, cache error, or strict-mode error. |
256
+ | `2` | Scan completed and duplicates were found while `--fail-on-duplicates` was enabled. |
257
+ | `3` | Scan was cancelled or stopped by the configured timeout. |
258
+
259
+ Notes:
260
+ - `--strict` turns otherwise non-fatal file access errors into exit code `1`.
261
+ - JSON output can contain an `issues` list even when the exit code is `0`.
262
+ - Exit code `3` takes priority over `--fail-on-duplicates`.
263
+
264
+ ## CLI reference
265
+
266
+ | Flag | Description |
267
+ |---|---|
268
+ | `path` | File or directory to scan |
269
+ | `--algorithm` | Hash algorithm (default: `sha256`) |
270
+ | `--chunk-size` | Read chunk size, e.g. `1MB` (default: 1 MiB) |
271
+ | `--min-size` | Skip files smaller than this, e.g. `10KB` |
272
+ | `--max-size` | Skip files larger than this, e.g. `5GB` |
273
+ | `--include-ext` | Only scan these extensions, e.g. `.jpg,.png` |
274
+ | `--ignore-ext` | Skip these extensions, e.g. `.tmp,.log` |
275
+ | `--no-ignore-hidden` | Do not skip hidden dotfiles and dotfolders |
276
+ | `--follow-symlinks` | Follow symbolic links |
277
+ | `--max-files N` | Stop after discovering N files |
278
+ | `--max-depth N` | Maximum directory depth to scan |
279
+ | `--timeout SECONDS` | Stop scan after this many seconds |
280
+ | `--cache PATH` | SQLite cache file for file hashes |
281
+ | `--progress` | Print progress to stderr |
282
+ | `--strict` | Raise errors instead of skipping bad files |
283
+ | `--json` | Print JSON output |
284
+ | `--fail-on-duplicates` | Exit with code `2` when duplicates are found |
285
+ | `--version` | Show version and exit |
286
+
287
+ ## API summary
288
+
289
+ | Symbol | Description |
290
+ |---|---|
291
+ | `find_duplicates(path, options)` | Return a tuple of `DuplicateGroup` |
292
+ | `scan(path, options)` | Return a full `ScanReport` |
293
+ | `DupeFinder` | Engine with events, progress, cache, and cancellation |
294
+ | `ScanEvent` | Typed event emitted during scanning |
295
+ | `ScanProgress` | Simplified progress snapshot for the `on_progress` callback |
296
+ | `ScanOptions` | Frozen dataclass with all scan settings |
297
+ | `ScanReport` | Result of a scan — groups, counts, issues, bytes read |
298
+ | `DuplicateGroup` | One group of files with identical content |
299
+ | `FileInfo` | Path and size of a single file |
300
+ | `ScanIssue` | A non-fatal error recorded during a scan |
301
+ | `SQLiteHashCache` | SQLite-backed hash cache — import from `dupefinder.cache` |
302
+
303
+ ### JSON schema
304
+
305
+ All JSON output includes a `schema_version` field (currently `"1.1"`) for forward compatibility.
306
+
307
+ See [docs/api.md](docs/api.md) for the full reference.
308
+
309
+ ## Project structure
310
+
311
+ ```text
312
+ src/dupefinder/
313
+ ├── api.py public functions: scan, find_duplicates
314
+ ├── cli.py terminal command
315
+ ├── engine.py DupeFinder class: events, cancellation, cache
316
+ ├── events.py ScanEvent dataclass
317
+ ├── cache.py HashCache protocol and SQLiteHashCache
318
+ ├── scanner.py file discovery (os.scandir, loop detection, max_depth)
319
+ ├── hashing.py chunked file hashing with optional cache
320
+ ├── grouping.py group by size then by hash
321
+ ├── filters.py ignore/include rules
322
+ ├── models.py frozen dataclasses
323
+ ├── report.py text and JSON output
324
+ ├── safety.py path/options validation, helpers
325
+ ├── constants.py default values
326
+ └── errors.py custom exceptions
327
+ ```
328
+
329
+ ## Safety and design constraints
330
+
331
+ `dupefinder` is intentionally read-only:
332
+
333
+ - Does **not** delete, move, or rename files.
334
+ - Does **not** connect to the internet.
335
+ - Does **not** follow symbolic links by default. Pass `--follow-symlinks` or `ScanOptions(follow_symlinks=True)` to opt in.
336
+ - Reads files in chunks — no large allocations.
337
+ - Permission errors are recorded and skipped by default.
338
+ - SQLite cache writes occur **only** when the user explicitly passes `--cache PATH` or constructs `SQLiteHashCache`. The cache file is written to the path chosen by the user.
339
+ - Detects **exact duplicates only** — files with identical byte content. Near-duplicates and visually similar images are not detected. Matching is based on content alone, so file names are not taken into account.
340
+
341
+ See [SECURITY.md](SECURITY.md) for more details.
342
+
343
+ ## Running tests
344
+
345
+ ```bash
346
+ pytest
347
+ ```
348
+
349
+ With coverage:
350
+
351
+ ```bash
352
+ pytest --cov=dupefinder --cov-report=term-missing
353
+ ```
354
+
355
+ ## Contributing
356
+
357
+ Contributions are welcome. Please open an issue first to discuss what you want to change.
358
+
359
+ 1. Fork the repository.
360
+ 2. Create a branch: `git checkout -b feature/your-feature`.
361
+ 3. Make your changes and add tests.
362
+ 4. Run `pytest` and make sure all tests pass.
363
+ 5. Open a pull request.
364
+
365
+ ## License
366
+
367
+ [MIT](LICENSE) — Igor Souza
@@ -0,0 +1,310 @@
1
+ # dupefinder
2
+
3
+ <p align="center">
4
+ <a href="https://pypi.org/project/dupefinder/"><img src="https://img.shields.io/pypi/v/dupefinder.svg" alt="PyPI version"></a>
5
+ <a href="https://pypi.org/project/dupefinder/"><img src="https://img.shields.io/pypi/pyversions/dupefinder.svg" alt="Python versions"></a>
6
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"></a>
7
+ <a href="https://github.com/igors93/dupefinder/actions/workflows/ci.yml"><img src="https://github.com/igors93/dupefinder/actions/workflows/ci.yml/badge.svg" alt="Tests"></a>
8
+ </p>
9
+
10
+ **dupefinder** is a small, zero-dependency Python library and CLI tool for finding duplicate files using content hashes.
11
+
12
+ **Requires Python 3.10 or later.** Detects exact duplicates (identical byte content) only.
13
+
14
+ ## Features
15
+
16
+ - **Simple**: one function for common use, a full report for advanced use.
17
+ - **Safe by default**: read-only. Never deletes, moves, or modifies files.
18
+ - **Zero dependency**: uses only the Python standard library.
19
+ - **Modular**: each responsibility lives in its own module.
20
+ - **Memory-friendly**: files are hashed in configurable chunks, not loaded fully into RAM.
21
+ - **Fast**: groups by file size before hashing — only candidates are hashed.
22
+ - **Typed**: ships with inline type annotations.
23
+ - **Observable**: typed event system for progress callbacks and integrations.
24
+ - **Cancellable**: abort scans via a callback or timeout.
25
+ - **Cached**: optional SQLite hash cache for repeated scans.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install dupefinder
31
+ ```
32
+
33
+ For development:
34
+
35
+ ```bash
36
+ git clone https://github.com/igors93/dupefinder.git
37
+ cd dupefinder
38
+ pip install -e ".[dev]"
39
+ ```
40
+
41
+ ## Quick start
42
+
43
+ ### As a library
44
+
45
+ ```python
46
+ from dupefinder import find_duplicates
47
+
48
+ groups = find_duplicates("./Downloads")
49
+
50
+ for group in groups:
51
+     print(f"{group.count} duplicate files — {group.size} bytes each")
52
+     for path in group.files:
53
+         print(f" {path}")
54
+ ```
55
+
56
+ ### Full report
57
+
58
+ ```python
59
+ from dupefinder import scan
60
+ from dupefinder.models import ScanOptions
61
+ from dupefinder.report import format_report
62
+
63
+ report = scan(
64
+ "./Downloads",
65
+ options=ScanOptions(
66
+ min_size=1024, # ignore files smaller than 1 KB
67
+ ignore_hidden=True, # skip dotfiles and dotfolders
68
+ follow_symlinks=False, # safe default
69
+ ),
70
+ )
71
+
72
+ print(format_report(report))
73
+ print(f"Wasted space: {report.total_wasted_space} bytes")
74
+ ```
75
+
76
+ ### JSON output
77
+
78
+ ```python
79
+ from dupefinder import scan
80
+ from dupefinder.report import report_to_json
81
+
82
+ report = scan("./Downloads")
83
+ print(report_to_json(report))
84
+ ```
85
+
86
+ ### DupeFinder with events
87
+
88
+ ```python
89
+ from dupefinder import DupeFinder, ScanOptions
90
+
91
+ def on_event(event):
92
+     if event.type == "file_discovered":
93
+         print(f"\rFound {event.scanned_files} files...", end="", flush=True)
94
+     elif event.type == "issue":
95
+         print(f"\nWarning: {event.message}")
96
+     elif event.type == "scan_completed":
97
+         print(f"\nDone in {event.elapsed_seconds:.2f}s")
98
+
99
+ finder = DupeFinder(
100
+ options=ScanOptions(min_size=1024),
101
+ on_event=on_event,
102
+ )
103
+ report = finder.scan("./Downloads")
104
+ ```
105
+
106
+ ### Progress callback
107
+
108
+ ```python
109
+ from dupefinder import DupeFinder, ScanOptions
110
+
111
+ def on_progress(progress):
112
+     print(f"[{progress.phase}] {progress.scanned_files} files scanned, "
113
+           f"{progress.hashed_files}/{progress.total_candidates} hashed")
114
+
115
+ finder = DupeFinder(
116
+ options=ScanOptions(min_size=1024),
117
+ on_progress=on_progress,
118
+ )
119
+ report = finder.scan("./Downloads")
120
+ print(f"Total bytes read: {report.total_bytes_read:,}")
121
+ ```
122
+
123
+ ### Cancellation
124
+
125
+ ```python
126
+ import threading
127
+ from dupefinder import DupeFinder
128
+
129
+ cancel_flag = threading.Event()
130
+ threading.Timer(5.0, cancel_flag.set).start() # cancel after 5 seconds
131
+
132
+ finder = DupeFinder(should_cancel=cancel_flag.is_set)
133
+ report = finder.scan("./Downloads")
134
+
135
+ if report.cancelled:
136
+     print(f"Cancelled after {report.elapsed_seconds:.2f}s — partial results")
137
+ ```
138
+
139
+ ### SQLite cache
140
+
141
+ ```python
142
+ from dupefinder import DupeFinder
143
+ from dupefinder.cache import SQLiteHashCache
144
+
145
+ with SQLiteHashCache(".dupefinder-cache.sqlite") as cache:
146
+     finder = DupeFinder(cache=cache)
147
+     report = finder.scan("./media")  # second run will be much faster
148
+ ```
149
+
150
+ ### Scan limits
151
+
152
+ ```python
153
+ from dupefinder import DupeFinder, ScanOptions
154
+
155
+ finder = DupeFinder(options=ScanOptions(
156
+ max_files=1000, # stop after 1000 files
157
+ max_depth=3, # scan at most 3 levels deep
158
+ timeout_seconds=30.0, # stop after 30 seconds
159
+ ))
160
+ report = finder.scan("./data")
161
+ ```
162
+
163
+ ### CLI
164
+
165
+ ```bash
166
+ # Basic scan
167
+ dupefinder ./Downloads
168
+
169
+ # JSON output
170
+ dupefinder ./Downloads --json
171
+
172
+ # Ignore files smaller than 1 MB
173
+ dupefinder ./Downloads --min-size 1MB
174
+
175
+ # Only scan images
176
+ dupefinder ./Pictures --include-ext .jpg,.jpeg,.png,.webp
177
+
178
+ # Ignore temp and log files
179
+ dupefinder . --ignore-ext .tmp,.log
180
+
181
+ # Exit with code 2 if any duplicates are found (useful in scripts/CI)
182
+ dupefinder . --fail-on-duplicates
183
+
184
+ # Strict mode: raise errors instead of skipping inaccessible files
185
+ dupefinder . --strict
186
+
187
+ # Follow symbolic links (disabled by default)
188
+ dupefinder . --follow-symlinks
189
+ ```
190
+
191
+ Run `dupefinder --help` to see all options.
192
+
193
+ ### Exit codes
194
+
195
+ | Code | Meaning |
196
+ |------|---------|
197
+ | `0` | Scan completed. Non-fatal issues may still be present in the report. |
198
+ | `1` | Scan failed because of an invalid option, invalid path, cache error, or strict-mode error. |
199
+ | `2` | Scan completed and duplicates were found while `--fail-on-duplicates` was enabled. |
200
+ | `3` | Scan was cancelled or stopped by the configured timeout. |
201
+
202
+ Notes:
203
+ - `--strict` turns otherwise non-fatal file access errors into exit code `1`.
204
+ - JSON output can contain an `issues` list even when the exit code is `0`.
205
+ - Exit code `3` takes priority over `--fail-on-duplicates`.
206
+
207
+ ## CLI reference
208
+
209
+ | Flag | Description |
210
+ |---|---|
211
+ | `path` | File or directory to scan |
212
+ | `--algorithm` | Hash algorithm (default: `sha256`) |
213
+ | `--chunk-size` | Read chunk size, e.g. `1MB` (default: 1 MiB) |
214
+ | `--min-size` | Skip files smaller than this, e.g. `10KB` |
215
+ | `--max-size` | Skip files larger than this, e.g. `5GB` |
216
+ | `--include-ext` | Only scan these extensions, e.g. `.jpg,.png` |
217
+ | `--ignore-ext` | Skip these extensions, e.g. `.tmp,.log` |
218
+ | `--no-ignore-hidden` | Do not skip hidden dotfiles and dotfolders |
219
+ | `--follow-symlinks` | Follow symbolic links |
220
+ | `--max-files N` | Stop after discovering N files |
221
+ | `--max-depth N` | Maximum directory depth to scan |
222
+ | `--timeout SECONDS` | Stop scan after this many seconds |
223
+ | `--cache PATH` | SQLite cache file for file hashes |
224
+ | `--progress` | Print progress to stderr |
225
+ | `--strict` | Raise errors instead of skipping bad files |
226
+ | `--json` | Print JSON output |
227
+ | `--fail-on-duplicates` | Exit with code `2` when duplicates are found |
228
+ | `--version` | Show version and exit |
229
+
230
+ ## API summary
231
+
232
+ | Symbol | Description |
233
+ |---|---|
234
+ | `find_duplicates(path, options)` | Return a tuple of `DuplicateGroup` |
235
+ | `scan(path, options)` | Return a full `ScanReport` |
236
+ | `DupeFinder` | Engine with events, progress, cache, and cancellation |
237
+ | `ScanEvent` | Typed event emitted during scanning |
238
+ | `ScanProgress` | Simplified progress snapshot for the `on_progress` callback |
239
+ | `ScanOptions` | Frozen dataclass with all scan settings |
240
+ | `ScanReport` | Result of a scan — groups, counts, issues, bytes read |
241
+ | `DuplicateGroup` | One group of files with identical content |
242
+ | `FileInfo` | Path and size of a single file |
243
+ | `ScanIssue` | A non-fatal error recorded during a scan |
244
+ | `SQLiteHashCache` | SQLite-backed hash cache — import from `dupefinder.cache` |
245
+
246
+ ### JSON schema
247
+
248
+ All JSON output includes a `schema_version` field (currently `"1.1"`) for forward compatibility.
249
+
250
+ See [docs/api.md](docs/api.md) for the full reference.
251
+
252
+ ## Project structure
253
+
254
+ ```text
255
+ src/dupefinder/
256
+ ├── api.py public functions: scan, find_duplicates
257
+ ├── cli.py terminal command
258
+ ├── engine.py DupeFinder class: events, cancellation, cache
259
+ ├── events.py ScanEvent dataclass
260
+ ├── cache.py HashCache protocol and SQLiteHashCache
261
+ ├── scanner.py file discovery (os.scandir, loop detection, max_depth)
262
+ ├── hashing.py chunked file hashing with optional cache
263
+ ├── grouping.py group by size then by hash
264
+ ├── filters.py ignore/include rules
265
+ ├── models.py frozen dataclasses
266
+ ├── report.py text and JSON output
267
+ ├── safety.py path/options validation, helpers
268
+ ├── constants.py default values
269
+ └── errors.py custom exceptions
270
+ ```
271
+
272
+ ## Safety and design constraints
273
+
274
+ `dupefinder` is intentionally read-only:
275
+
276
+ - Does **not** delete, move, or rename files.
277
+ - Does **not** connect to the internet.
278
+ - Does **not** follow symbolic links by default. Pass `--follow-symlinks` or `ScanOptions(follow_symlinks=True)` to opt in.
279
+ - Reads files in chunks — no large allocations.
280
+ - Permission errors are recorded and skipped by default.
281
+ - SQLite cache writes occur **only** when the user explicitly passes `--cache PATH` or constructs `SQLiteHashCache`. The cache file is written to the path chosen by the user.
282
+ - Detects **exact duplicates only** — files with identical byte content. Near-duplicates and visually similar images are not detected. Matching is based on content alone, so file names are not taken into account.
283
+
284
+ See [SECURITY.md](SECURITY.md) for more details.
285
+
286
+ ## Running tests
287
+
288
+ ```bash
289
+ pytest
290
+ ```
291
+
292
+ With coverage:
293
+
294
+ ```bash
295
+ pytest --cov=dupefinder --cov-report=term-missing
296
+ ```
297
+
298
+ ## Contributing
299
+
300
+ Contributions are welcome. Please open an issue first to discuss what you want to change.
301
+
302
+ 1. Fork the repository.
303
+ 2. Create a branch: `git checkout -b feature/your-feature`.
304
+ 3. Make your changes and add tests.
305
+ 4. Run `pytest` and make sure all tests pass.
306
+ 5. Open a pull request.
307
+
308
+ ## License
309
+
310
+ [MIT](LICENSE) — Igor Souza