dscanpy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dscanpy-0.1.0/PKG-INFO +203 -0
- dscanpy-0.1.0/README.md +190 -0
- dscanpy-0.1.0/pyproject.toml +30 -0
- dscanpy-0.1.0/src/dscan/__init__.py +5 -0
- dscanpy-0.1.0/src/dscan/core.py +220 -0
- dscanpy-0.1.0/src/dscan/crawler.py +275 -0
- dscanpy-0.1.0/src/dscan/filter.py +140 -0
- dscanpy-0.1.0/src/dscan/models.py +50 -0
dscanpy-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dscanpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Concurrent directory tree scanner for Python 3.12+
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: System :: Filesystems
|
|
11
|
+
Project-URL: Repository, https://github.com/yourusername/dscanpy
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# dscan
|
|
15
|
+
|
|
16
|
+
`dscan` is a concurrent directory scanner for Python 3.12+. It wraps `os.scandir` in a thread pool with a work-stealing queue, exposing a filtering API that covers most of what you'd otherwise implement by hand on top of `os.walk`.
|
|
17
|
+
|
|
18
|
+
Two modes: `scan_entries` yields raw `os.DirEntry` objects with minimal overhead; `scan` yields dataclass models with pre-computed metadata.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Why concurrent scanning?
|
|
23
|
+
|
|
24
|
+
On a local SSD, directory traversal is fast enough that threading adds more overhead than it saves. `scan_entries` still matches or edges out `os.walk`, but the real case for concurrency is **network-attached storage**.
|
|
25
|
+
|
|
26
|
+
On SMB shares, NFS mounts, or any high-latency filesystem, each `scandir` call blocks waiting for a server response. `os.walk` does this serially — one directory at a time. dscan keeps multiple directories in-flight simultaneously, so workers aren't sitting idle while the network responds. On deep trees with many subdirectories, this compounds significantly.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Benchmarks
|
|
31
|
+
|
|
32
|
+
### Local SSD (~4M entries, MacBook)
|
|
33
|
+
|
|
34
|
+
| | entries | time |
|
|
35
|
+
|---|---|---|
|
|
36
|
+
| `os.walk` (no stat) | 4,046,505 | 33.30s |
|
|
37
|
+
| `os.walk` (+ stat) | 4,039,313 | 85.24s |
|
|
38
|
+
| `dscan.scan_entries` | 4,046,502 | **31.90s** |
|
|
39
|
+
| `dscan.scan` (models) | 4,014,758 | 140.15s |
|
|
40
|
+
|
|
41
|
+
`scan_entries` is on par with bare `os.walk`. `scan` is slower because stat calls happen on the main thread serially — the workers parallelise `scandir`, not `stat`. Use `scan` when you want the structured output; use `scan_entries` when throughput matters.
|
|
42
|
+
|
|
43
|
+
### Simulated network latency (5ms per directory)
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
# rough simulation
|
|
47
|
+
import time, os
|
|
48
|
+
_real = os.scandir
|
|
49
|
+
os.scandir = lambda p: (time.sleep(0.005), _real(p))[1]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
| | time |
|
|
53
|
+
|---|---|
|
|
54
|
+
| `os.walk` | ~linear with directory count |
|
|
55
|
+
| `dscan.scan_entries` | scales with `max_workers` |
|
|
56
|
+
|
|
57
|
+
At 5ms latency per directory, a tree with 10,000 directories takes ~50s serially. With 16 workers dscan brings that to ~4s. The deeper and wider the tree, the bigger the difference.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install dscanpy
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Requires Python 3.12+. No other dependencies.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Usage
|
|
72
|
+
|
|
73
|
+
### Basic scan
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from dscan import scan
|
|
77
|
+
|
|
78
|
+
for entry in scan("."):
|
|
79
|
+
print(f"{entry.name} - {entry.path}")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Raw entries (lower overhead)
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from dscan import scan_entries
|
|
86
|
+
|
|
87
|
+
for entry in scan_entries("~/Documents", max_depth=2):
|
|
88
|
+
if entry.is_file():
|
|
89
|
+
print(entry.name)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Filtering
|
|
95
|
+
|
|
96
|
+
### Extensions
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
# Only Python and Markdown files
|
|
100
|
+
for file in scan(".", extensions={".py", ".md"}):
|
|
101
|
+
print(file.path)
|
|
102
|
+
|
|
103
|
+
# Skip compiled files
|
|
104
|
+
for file in scan(".", ignore_extensions={".bin", ".exe"}):
|
|
105
|
+
print(file.path)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Glob patterns
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
# Only test files
|
|
112
|
+
for entry in scan(".", match="test_*"):
|
|
113
|
+
print(entry.name)
|
|
114
|
+
|
|
115
|
+
# Skip hidden files and directories
|
|
116
|
+
for entry in scan(".", ignore_pattern=".*"):
|
|
117
|
+
print(entry.name)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Directory traversal
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
# Immediate children only
|
|
124
|
+
for entry in scan(".", max_depth=0):
|
|
125
|
+
print(entry.name)
|
|
126
|
+
|
|
127
|
+
# Only descend into src/ and lib/
|
|
128
|
+
for entry in scan(".", only_dirs=["src", "lib"]):
|
|
129
|
+
print(entry.path)
|
|
130
|
+
|
|
131
|
+
# Skip specific directories
|
|
132
|
+
# .git, .idea, .venv, __pycache__ are skipped by default
|
|
133
|
+
for entry in scan(".", ignore_dirs=["node_modules", "dist"]):
|
|
134
|
+
print(entry.path)
|
|
135
|
+
|
|
136
|
+
# Disable all default ignores
|
|
137
|
+
for entry in scan(".", ignore_dirs=[]):
|
|
138
|
+
print(entry.path)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Custom filter
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
def is_large_file(entry):
|
|
145
|
+
return entry.is_file() and entry.stat().st_size > 1_000_000
|
|
146
|
+
|
|
147
|
+
for entry in scan(".", custom_filter=is_large_file):
|
|
148
|
+
print(entry.name)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Tuning workers
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# default is min(32, cpu_count * 2)
|
|
155
|
+
# increase on high-latency mounts
|
|
156
|
+
for entry in scan_entries("/mnt/nas", max_workers=32):
|
|
157
|
+
print(entry.path)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Data Models
|
|
163
|
+
|
|
164
|
+
`scan()` returns `FileEntry` or `DirectoryEntry` dataclasses.
|
|
165
|
+
|
|
166
|
+
### `FileEntry`
|
|
167
|
+
|
|
168
|
+
| field | description |
|
|
169
|
+
|---|---|
|
|
170
|
+
| `name` | filename without extension |
|
|
171
|
+
| `extension` | lowercase extension, no leading dot |
|
|
172
|
+
| `path` | full path |
|
|
173
|
+
| `dir_path` | containing directory |
|
|
174
|
+
| `size` | bytes |
|
|
175
|
+
| `created_at` | `datetime` |
|
|
176
|
+
| `modified_at` | `datetime` |
|
|
177
|
+
|
|
178
|
+
### `DirectoryEntry`
|
|
179
|
+
|
|
180
|
+
| field | description |
|
|
181
|
+
|---|---|
|
|
182
|
+
| `name` | directory name |
|
|
183
|
+
| `path` | full path |
|
|
184
|
+
| `parent_path` | parent directory |
|
|
185
|
+
| `created_at` | `datetime` |
|
|
186
|
+
| `modified_at` | `datetime` |
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## vs the stdlib
|
|
191
|
+
|
|
192
|
+
| | `os.walk` | `pathlib.rglob` | `dscan` |
|
|
193
|
+
|---|:---:|:---:|:---:|
|
|
194
|
+
| Concurrent traversal | No | No | Yes |
|
|
195
|
+
| Built-in models | No | No | Yes |
|
|
196
|
+
| Depth limit | Manual | No | Yes |
|
|
197
|
+
| Directory exclusions | Manual | No | Yes |
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
MIT
|
dscanpy-0.1.0/README.md
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# dscan
|
|
2
|
+
|
|
3
|
+
`dscan` is a concurrent directory scanner for Python 3.12+. It wraps `os.scandir` in a thread pool with a work-stealing queue, exposing a filtering API that covers most of what you'd otherwise implement by hand on top of `os.walk`.
|
|
4
|
+
|
|
5
|
+
Two modes: `scan_entries` yields raw `os.DirEntry` objects with minimal overhead; `scan` yields dataclass models with pre-computed metadata.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Why concurrent scanning?
|
|
10
|
+
|
|
11
|
+
On a local SSD, directory traversal is fast enough that threading adds more overhead than it saves. `scan_entries` still matches or edges out `os.walk`, but the real case for concurrency is **network-attached storage**.
|
|
12
|
+
|
|
13
|
+
On SMB shares, NFS mounts, or any high-latency filesystem, each `scandir` call blocks waiting for a server response. `os.walk` does this serially — one directory at a time. dscan keeps multiple directories in-flight simultaneously, so workers aren't sitting idle while the network responds. On deep trees with many subdirectories, this compounds significantly.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Benchmarks
|
|
18
|
+
|
|
19
|
+
### Local SSD (~4M entries, MacBook)
|
|
20
|
+
|
|
21
|
+
| | entries | time |
|
|
22
|
+
|---|---|---|
|
|
23
|
+
| `os.walk` (no stat) | 4,046,505 | 33.30s |
|
|
24
|
+
| `os.walk` (+ stat) | 4,039,313 | 85.24s |
|
|
25
|
+
| `dscan.scan_entries` | 4,046,502 | **31.90s** |
|
|
26
|
+
| `dscan.scan` (models) | 4,014,758 | 140.15s |
|
|
27
|
+
|
|
28
|
+
`scan_entries` is on par with bare `os.walk`. `scan` is slower because stat calls happen on the main thread serially — the workers parallelise `scandir`, not `stat`. Use `scan` when you want the structured output; use `scan_entries` when throughput matters.
|
|
29
|
+
|
|
30
|
+
### Simulated network latency (5ms per directory)
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
# rough simulation
|
|
34
|
+
import time, os
|
|
35
|
+
_real = os.scandir
|
|
36
|
+
os.scandir = lambda p: (time.sleep(0.005), _real(p))[1]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
| | time |
|
|
40
|
+
|---|---|
|
|
41
|
+
| `os.walk` | ~linear with directory count |
|
|
42
|
+
| `dscan.scan_entries` | scales with `max_workers` |
|
|
43
|
+
|
|
44
|
+
At 5ms latency per directory, a tree with 10,000 directories takes ~50s serially. With 16 workers dscan brings that to ~4s. The deeper and wider the tree, the bigger the difference.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install dscanpy
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Requires Python 3.12+. No other dependencies.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Usage
|
|
59
|
+
|
|
60
|
+
### Basic scan
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from dscan import scan
|
|
64
|
+
|
|
65
|
+
for entry in scan("."):
|
|
66
|
+
print(f"{entry.name} - {entry.path}")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Raw entries (lower overhead)
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from dscan import scan_entries
|
|
73
|
+
|
|
74
|
+
for entry in scan_entries("~/Documents", max_depth=2):
|
|
75
|
+
if entry.is_file():
|
|
76
|
+
print(entry.name)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Filtering
|
|
82
|
+
|
|
83
|
+
### Extensions
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
# Only Python and Markdown files
|
|
87
|
+
for file in scan(".", extensions={".py", ".md"}):
|
|
88
|
+
print(file.path)
|
|
89
|
+
|
|
90
|
+
# Skip compiled files
|
|
91
|
+
for file in scan(".", ignore_extensions={".bin", ".exe"}):
|
|
92
|
+
print(file.path)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Glob patterns
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
# Only test files
|
|
99
|
+
for entry in scan(".", match="test_*"):
|
|
100
|
+
print(entry.name)
|
|
101
|
+
|
|
102
|
+
# Skip hidden files and directories
|
|
103
|
+
for entry in scan(".", ignore_pattern=".*"):
|
|
104
|
+
print(entry.name)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Directory traversal
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
# Immediate children only
|
|
111
|
+
for entry in scan(".", max_depth=0):
|
|
112
|
+
print(entry.name)
|
|
113
|
+
|
|
114
|
+
# Only descend into src/ and lib/
|
|
115
|
+
for entry in scan(".", only_dirs=["src", "lib"]):
|
|
116
|
+
print(entry.path)
|
|
117
|
+
|
|
118
|
+
# Skip specific directories
|
|
119
|
+
# .git, .idea, .venv, __pycache__ are skipped by default
|
|
120
|
+
for entry in scan(".", ignore_dirs=["node_modules", "dist"]):
|
|
121
|
+
print(entry.path)
|
|
122
|
+
|
|
123
|
+
# Disable all default ignores
|
|
124
|
+
for entry in scan(".", ignore_dirs=[]):
|
|
125
|
+
print(entry.path)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Custom filter
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def is_large_file(entry):
|
|
132
|
+
return entry.is_file() and entry.stat().st_size > 1_000_000
|
|
133
|
+
|
|
134
|
+
for entry in scan(".", custom_filter=is_large_file):
|
|
135
|
+
print(entry.name)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Tuning workers
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
# default is min(32, cpu_count * 2)
|
|
142
|
+
# increase on high-latency mounts
|
|
143
|
+
for entry in scan_entries("/mnt/nas", max_workers=32):
|
|
144
|
+
print(entry.path)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Data Models
|
|
150
|
+
|
|
151
|
+
`scan()` returns `FileEntry` or `DirectoryEntry` dataclasses.
|
|
152
|
+
|
|
153
|
+
### `FileEntry`
|
|
154
|
+
|
|
155
|
+
| field | description |
|
|
156
|
+
|---|---|
|
|
157
|
+
| `name` | filename without extension |
|
|
158
|
+
| `extension` | lowercase extension, no leading dot |
|
|
159
|
+
| `path` | full path |
|
|
160
|
+
| `dir_path` | containing directory |
|
|
161
|
+
| `size` | bytes |
|
|
162
|
+
| `created_at` | `datetime` |
|
|
163
|
+
| `modified_at` | `datetime` |
|
|
164
|
+
|
|
165
|
+
### `DirectoryEntry`
|
|
166
|
+
|
|
167
|
+
| field | description |
|
|
168
|
+
|---|---|
|
|
169
|
+
| `name` | directory name |
|
|
170
|
+
| `path` | full path |
|
|
171
|
+
| `parent_path` | parent directory |
|
|
172
|
+
| `created_at` | `datetime` |
|
|
173
|
+
| `modified_at` | `datetime` |
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## vs the stdlib
|
|
178
|
+
|
|
179
|
+
| | `os.walk` | `pathlib.rglob` | `dscan` |
|
|
180
|
+
|---|:---:|:---:|:---:|
|
|
181
|
+
| Concurrent traversal | No | No | Yes |
|
|
182
|
+
| Built-in models | No | No | Yes |
|
|
183
|
+
| Depth limit | Manual | No | Yes |
|
|
184
|
+
| Directory exclusions | Manual | No | Yes |
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## License
|
|
189
|
+
|
|
190
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dscanpy"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Concurrent directory tree scanner for Python 3.12+"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
classifiers = [
|
|
9
|
+
"Programming Language :: Python :: 3.12",
|
|
10
|
+
"License :: OSI Approved :: MIT License",
|
|
11
|
+
"Operating System :: OS Independent",
|
|
12
|
+
"Topic :: System :: Filesystems",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.urls]
|
|
16
|
+
Repository = "https://github.com/yourusername/dscanpy"
|
|
17
|
+
|
|
18
|
+
[tool.poetry]
|
|
19
|
+
packages = [{include = "dscan", from = "src"}]
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
23
|
+
build-backend = "poetry.core.masonry.api"
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"ruff (>=0.15.5,<0.16.0)",
|
|
28
|
+
"ipykernel (>=7.2.0,<8.0.0)",
|
|
29
|
+
"jupyterlab (>=4.5.5,<5.0.0)"
|
|
30
|
+
]
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from collections.abc import Callable, Iterator
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from dscan.models import FileEntry, DirectoryEntry
|
|
6
|
+
from dscan.crawler import TreeCrawler
|
|
7
|
+
from dscan.filter import FilterMode, ScanFilter
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def scan_entries(
    path: str | Path,
    *,
    # ── What to yield ────────────────────────────────────────────────────────
    files_only: bool = False,
    dirs_only: bool = False,
    # ── Extension filtering (mutually exclusive) ──────────────────────────────
    extensions: set[str] | list[str] | None = None,
    ignore_extensions: set[str] | list[str] | None = None,
    # ── Name-pattern filtering (mutually exclusive) ───────────────────────────
    match: str | None = None,
    ignore_pattern: str | None = None,
    # ── Traversal ────────────────────────────────────────────────────────────
    ignore_dirs: list[str] | None = None,
    only_dirs: list[str] | None = None,
    max_depth: int | None = None,
    max_workers: int | None = None,
    # ── Escape hatch ─────────────────────────────────────────────────────────
    custom_filter: Callable[[os.DirEntry], bool] | None = None,
) -> Iterator[os.DirEntry]:
    """Scan a directory tree and yield matching entries.

    Combines traversal and filtering into a single call. All keyword
    arguments are optional — calling ``scan_entries(path)`` alone yields every
    entry in the tree with sensible defaults.

    Args:
        path: Root directory to scan. Strings are accepted and resolved
            automatically; non-existent or non-directory paths raise
            ``ValueError``.

        files_only: Yield only files (no directories).
        dirs_only: Yield only directories (no files).
            ``files_only`` and ``dirs_only`` are mutually exclusive.

        extensions: Allowlist of file extensions to yield, e.g.
            ``{".py", ".md"}``. Leading dots are optional and matching
            is case-insensitive. Has no effect on directory entries.
        ignore_extensions: Denylist of extensions to suppress.
            Mutually exclusive with ``extensions``.

        match: Glob pattern — only yield entries whose name matches,
            e.g. ``"test_*"``.
        ignore_pattern: Glob pattern — suppress entries whose name
            matches, e.g. ``".*"``.
            Mutually exclusive with ``match``.

        ignore_dirs: Directory names to skip when descending, e.g.
            ``["node_modules", "dist"]``. A non-empty list is merged with
            the built-in defaults (``.git``, ``.idea``, ``.venv``,
            ``__pycache__``). Pass an empty list to disable all ignores,
            including the defaults.
        only_dirs: When provided, descend *only* into directories whose
            name appears in this list. Mutually exclusive with
            ``ignore_dirs``.
        max_depth: How many levels deep to descend. ``0`` scans only the
            root itself; ``None`` (default) is unlimited.
        max_workers: Worker thread count. Defaults to
            ``min(32, cpu_count * 2)``.

        custom_filter: Optional callable ``(DirEntry) -> bool``. Return
            ``False`` to drop an entry. Applied after all other filters.

    Yields:
        ``os.DirEntry`` for every entry that passes all active filters.

    Raises:
        ValueError: If ``path`` does not exist or is not a directory, or
            if any pair of mutually-exclusive arguments are both supplied.

    Examples:
        Yield all Python files up to 3 levels deep::

            for entry in scan_entries("~/projects", extensions={".py"}, max_depth=3):
                print(entry.path)

        Yield everything, skipping hidden files and ``dist`` folders::

            for entry in scan_entries(
                "/srv/app",
                ignore_pattern=".*",
                ignore_dirs=["dist"],
            ):
                print(entry.path)

        Only descend into ``src`` and ``tests`` directories::

            for entry in scan_entries(".", only_dirs=["src", "tests"], files_only=True):
                print(entry.path)
    """
    # ── Validate mutually exclusive pairs ────────────────────────────────────
    if files_only and dirs_only:
        raise ValueError("files_only and dirs_only are mutually exclusive")
    if extensions and ignore_extensions:
        raise ValueError("extensions and ignore_extensions are mutually exclusive")
    if match and ignore_pattern:
        raise ValueError("match and ignore_pattern are mutually exclusive")
    if ignore_dirs is not None and only_dirs is not None:
        raise ValueError("ignore_dirs and only_dirs are mutually exclusive")

    # ── Resolve path ─────────────────────────────────────────────────────────
    root = _resolve_path(path)

    # ── Build TreeCrawler ─────────────────────────────────────────────────────
    if only_dirs is not None:
        crawler = TreeCrawler(
            dirs=only_dirs,
            filter_mode=FilterMode.INCLUDE,
            max_workers=max_workers,
            max_depth=max_depth,
        )
    else:
        # Documented contract: caller-supplied names are merged with the
        # built-in defaults; an explicit empty list opts out of everything.
        # (Previously a non-empty list silently *replaced* the defaults,
        # so ignore_dirs=["dist"] stopped skipping .git et al.)
        default_ignores = {".git", ".idea", ".venv", "__pycache__"}
        if ignore_dirs is None:
            merged = default_ignores
        elif not ignore_dirs:
            merged = set()  # explicit opt-out of all ignores
        else:
            merged = default_ignores | set(ignore_dirs)
        crawler = TreeCrawler(
            dirs=list(merged),
            filter_mode=FilterMode.IGNORE,
            max_workers=max_workers,
            max_depth=max_depth,
        )

    # ── Build ScanFilter ──────────────────────────────────────────────────────
    # Exactly one of extensions / ignore_extensions can be set (validated
    # above), so a single normalised set plus a mode flag covers both.
    ext_set = extensions or ignore_extensions
    scan_filter = ScanFilter(
        only_files=files_only,
        only_dirs=dirs_only,
        extensions=_normalise_exts(ext_set) if ext_set else None,
        extensions_mode=(
            FilterMode.IGNORE if ignore_extensions else FilterMode.INCLUDE
        ),
        name_pattern=match or ignore_pattern,
        name_pattern_mode=(FilterMode.IGNORE if ignore_pattern else FilterMode.INCLUDE),
        custom=custom_filter,
    )

    return scan_filter.apply(crawler.scan(root))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def scan(
    path: str | Path,
    *,
    # ── What to yield ────────────────────────────────────────────────────────
    files_only: bool = False,
    dirs_only: bool = False,
    # ── Extension filtering (mutually exclusive) ──────────────────────────────
    extensions: set[str] | list[str] | None = None,
    ignore_extensions: set[str] | list[str] | None = None,
    # ── Name-pattern filtering (mutually exclusive) ───────────────────────────
    match: str | None = None,
    ignore_pattern: str | None = None,
    # ── Traversal ────────────────────────────────────────────────────────────
    ignore_dirs: list[str] | None = None,
    only_dirs: list[str] | None = None,
    max_depth: int | None = None,
    max_workers: int | None = None,
    # ── Escape hatch ─────────────────────────────────────────────────────────
    custom_filter: Callable[[os.DirEntry], bool] | None = None,
) -> Iterator[FileEntry | DirectoryEntry]:
    """Like ``scan_entries``, but yields rich metadata models instead of raw
    ``DirEntry`` objects.

    See ``scan_entries`` for argument details and examples.

    Yields:
        FileEntry for regular files, DirectoryEntry for directories.
        Entries that are neither a file nor a directory when checked with
        ``follow_symlinks=False`` (e.g. symlinks) are skipped.
    """
    raw_entries = scan_entries(
        path,
        files_only=files_only,
        dirs_only=dirs_only,
        extensions=extensions,
        ignore_extensions=ignore_extensions,
        match=match,
        ignore_pattern=ignore_pattern,
        ignore_dirs=ignore_dirs,
        only_dirs=only_dirs,
        max_depth=max_depth,
        max_workers=max_workers,
        custom_filter=custom_filter,
    )
    for raw in raw_entries:
        # A DirEntry is at most one of file/dir with follow_symlinks=False,
        # so the check order does not matter.
        if raw.is_dir(follow_symlinks=False):
            yield DirectoryEntry.from_dir_entry(raw)
        elif raw.is_file(follow_symlinks=False):
            yield FileEntry.from_dir_entry(raw)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ── Internal helpers ──────────────────────────────────────────────────────────
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _resolve_path(path: str | Path) -> Path:
|
|
204
|
+
"""Resolve *path* to an absolute ``Path``, raising ``ValueError`` on failure."""
|
|
205
|
+
try:
|
|
206
|
+
p = Path(path).expanduser().resolve()
|
|
207
|
+
except Exception as e:
|
|
208
|
+
raise ValueError(f"Invalid path {path!r}: {e}") from e
|
|
209
|
+
|
|
210
|
+
if not p.exists():
|
|
211
|
+
raise ValueError(f"Path does not exist: {p}")
|
|
212
|
+
if not p.is_dir():
|
|
213
|
+
raise ValueError(f"Path is not a directory: {p}")
|
|
214
|
+
|
|
215
|
+
return p
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ── Normalise extensions (ensure leading dot) ─────────────────────────────
|
|
219
|
+
def _normalise_exts(exts: set[str] | list[str]) -> set[str]:
|
|
220
|
+
return {e if e.startswith(".") else f".{e}" for e in exts}
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from queue import Queue, Empty
|
|
5
|
+
from threading import Lock, Event
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from typing import Iterator, Literal
|
|
8
|
+
from dscan.filter import FilterMode
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _ScanState:
|
|
14
|
+
"""Thread-safe pending work counter for coordinating concurrent directory scanning.
|
|
15
|
+
|
|
16
|
+
Tracks the number of directories that are either queued or actively being
|
|
17
|
+
processed by a worker. Signals completion once all work is exhausted,
|
|
18
|
+
allowing the main thread to stop draining results.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self) -> None:
|
|
22
|
+
self._lock = Lock()
|
|
23
|
+
self._pending = 0
|
|
24
|
+
self._done = Event()
|
|
25
|
+
|
|
26
|
+
def add(self, count: int) -> None:
|
|
27
|
+
"""Register new units of work.
|
|
28
|
+
|
|
29
|
+
Must be called before putting new directories into the queue to avoid
|
|
30
|
+
a race condition where workers falsely signal completion.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
count: Number of new directories to register as pending.
|
|
34
|
+
"""
|
|
35
|
+
with self._lock:
|
|
36
|
+
self._pending += count
|
|
37
|
+
|
|
38
|
+
def complete(self) -> None:
|
|
39
|
+
"""Mark one unit of work as finished.
|
|
40
|
+
|
|
41
|
+
Decrements the pending counter. If no work remains, sets the done
|
|
42
|
+
event so the main thread and all workers can exit cleanly.
|
|
43
|
+
"""
|
|
44
|
+
with self._lock:
|
|
45
|
+
self._pending -= 1
|
|
46
|
+
if self._pending == 0:
|
|
47
|
+
logger.debug("All pending work exhausted — signalling done")
|
|
48
|
+
self._done.set()
|
|
49
|
+
|
|
50
|
+
@property
|
|
51
|
+
def is_done(self) -> bool:
|
|
52
|
+
"""Returns True if all pending work has been completed."""
|
|
53
|
+
return self._done.is_set()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TreeCrawler:
|
|
57
|
+
"""Crawls a directory tree concurrently using a shared work-stealing queue.
|
|
58
|
+
|
|
59
|
+
Workers pull directories from a shared queue. Newly discovered subdirectories
|
|
60
|
+
are pushed back into the queue for any idle worker to pick up, keeping all
|
|
61
|
+
threads busy regardless of tree depth or structure.
|
|
62
|
+
|
|
63
|
+
Example:
|
|
64
|
+
# Ignore specific directories (default mode):
|
|
65
|
+
crawler = TreeCrawler(dirs={".git", "node_modules"}, max_depth=3)
|
|
66
|
+
|
|
67
|
+
# Only descend into specific directories:
|
|
68
|
+
crawler = TreeCrawler(dirs={"src", "lib"}, filter_mode=FilterMode.INCLUDE)
|
|
69
|
+
|
|
70
|
+
for entry in crawler.scan(Path("/home/user")):
|
|
71
|
+
print(entry.path)
|
|
72
|
+
|
|
73
|
+
Attributes:
|
|
74
|
+
_dirs: Set of directory names used for filtering during traversal.
|
|
75
|
+
_filter_mode: Whether _dirs is treated as a block list or include list.
|
|
76
|
+
_max_workers: Number of worker threads used during scanning.
|
|
77
|
+
_max_depth: Maximum directory depth to scan. None means unlimited.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
def __init__(
|
|
81
|
+
self,
|
|
82
|
+
dirs: list[str] | None = None,
|
|
83
|
+
filter_mode: FilterMode = FilterMode.IGNORE,
|
|
84
|
+
max_workers: int | None = None,
|
|
85
|
+
max_depth: int | None = None,
|
|
86
|
+
) -> None:
|
|
87
|
+
"""Initialises the TreeCrawler.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
dirs: Directory names to filter during traversal. Behaviour depends
|
|
91
|
+
on filter_mode. Defaults to {".git", ".idea", ".venv",
|
|
92
|
+
"__pycache__"} when mode is FilterMode.IGNORE, or an empty set otherwise.
|
|
93
|
+
filter_mode: Controls how dirs is applied during descent:
|
|
94
|
+
- filter_mode.IGNORE — skip directories whose name is in dirs.
|
|
95
|
+
- filter_mode.INCLUDE — only descend into directories whose name is in dirs.
|
|
96
|
+
Entries are still yielded regardless of this filter; it only
|
|
97
|
+
controls whether a directory is recursed into.
|
|
98
|
+
max_workers: Number of worker threads. Defaults to min(32, cpu_count * 2).
|
|
99
|
+
max_depth: Maximum depth to descend relative to root_path. Depth 0
|
|
100
|
+
scans only the root itself, depth 1 includes its immediate children,
|
|
101
|
+
and so on. None (default) means unlimited depth.
|
|
102
|
+
"""
|
|
103
|
+
self._filter_mode = filter_mode
|
|
104
|
+
self._dirs = (
|
|
105
|
+
set(dirs)
|
|
106
|
+
if dirs is not None
|
|
107
|
+
else (
|
|
108
|
+
{".git", ".idea", ".venv", "__pycache__"}
|
|
109
|
+
if filter_mode == FilterMode.IGNORE
|
|
110
|
+
else set()
|
|
111
|
+
)
|
|
112
|
+
)
|
|
113
|
+
self._max_workers = max_workers or min(32, max(1, (os.cpu_count() or 4) * 2))
|
|
114
|
+
self._max_depth = max_depth
|
|
115
|
+
logger.debug(
|
|
116
|
+
f"TreeCrawler initialised — "
|
|
117
|
+
f"filter_mode={self._filter_mode!r}, "
|
|
118
|
+
f"dirs={self._dirs}, "
|
|
119
|
+
f"max_workers={self._max_workers}, "
|
|
120
|
+
f"max_depth={self._max_depth!r}"
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
    def scan(self, root_path: Path) -> Iterator[os.DirEntry]:
        """Scan a directory tree and yield all entries in completion order.

        Spawns worker threads that pull directories from a shared queue.
        Results are yielded as they become available. Once all workers finish,
        any remaining buffered results are drained and yielded.

        Args:
            root_path: Root directory to begin scanning from.

        Yields:
            os.DirEntry: Every entry found in the tree that was not pruned
                by dirs or max_depth. Entries are yielded in worker
                completion order, not filesystem order.
        """
        # Queue holds (path, depth) tuples. Root starts at depth 0.
        dir_queue: Queue[tuple[Path, int]] = Queue()
        result_queue: Queue[os.DirEntry] = Queue()
        state = _ScanState()

        # Register the root as pending work BEFORE enqueueing it, so no worker
        # can observe a zero pending-count and report the scan done early.
        state.add(1)
        dir_queue.put((root_path, 0))

        logger.info(
            f"Starting scan: {root_path} "
            f"({self._max_workers} workers, max_depth={self._max_depth!r})"
        )

        with ThreadPoolExecutor(max_workers=self._max_workers) as executor:
            for _ in range(self._max_workers):
                executor.submit(self._worker, dir_queue, result_queue, state)

            # Stream results while workers are still producing. The short
            # timeout keeps this loop responsive to the done signal without
            # busy-waiting.
            # NOTE(review): closing this generator early triggers executor
            # shutdown inside the with-block, which blocks until workers finish
            # the whole pending tree — confirm that is acceptable for callers.
            while not state.is_done:
                try:
                    yield result_queue.get(timeout=0.05)
                except Empty:
                    continue

        # Executor has joined — all workers are done. Drain any remaining results.
        # Only this thread touches result_queue now, so empty()/get_nowait()
        # cannot race; the except branch is pure belt-and-braces.
        drained = 0
        while not result_queue.empty():
            try:
                yield result_queue.get_nowait()
                drained += 1
            except Empty:
                break

        if drained:
            logger.debug(f"Drained {drained} remaining entries after workers finished")

        logger.info(f"Scan complete: {root_path}")
|
|
174
|
+
|
|
175
|
+
def _worker(
|
|
176
|
+
self,
|
|
177
|
+
dir_queue: Queue,
|
|
178
|
+
result_queue: Queue,
|
|
179
|
+
state: _ScanState,
|
|
180
|
+
) -> None:
|
|
181
|
+
"""Worker loop that pulls directories from the queue and scans them.
|
|
182
|
+
|
|
183
|
+
Runs until the scan state signals completion. For each directory
|
|
184
|
+
dequeued, scans its contents and re-enqueues any discovered
|
|
185
|
+
subdirectories for other workers to pick up, unless the depth limit
|
|
186
|
+
has been reached.
|
|
187
|
+
|
|
188
|
+
New subdirectories are registered with state before being enqueued
|
|
189
|
+
to prevent a race condition where all workers complete before new
|
|
190
|
+
work is visible to the state counter.
|
|
191
|
+
|
|
192
|
+
Args:
|
|
193
|
+
dir_queue: Shared queue of (directory, depth) tuples to scan.
|
|
194
|
+
result_queue: Shared queue to put discovered entries into.
|
|
195
|
+
state: Shared scan state used to track pending work.
|
|
196
|
+
"""
|
|
197
|
+
while not state.is_done:
|
|
198
|
+
try:
|
|
199
|
+
current_dir, depth = dir_queue.get(timeout=0.05)
|
|
200
|
+
except Empty:
|
|
201
|
+
continue
|
|
202
|
+
|
|
203
|
+
logger.debug(f"Worker picked up: {current_dir} (depth={depth})")
|
|
204
|
+
new_dirs = self._scan_dir(current_dir, depth, result_queue)
|
|
205
|
+
|
|
206
|
+
# Register new work BEFORE completing current — prevents false done signal.
|
|
207
|
+
state.add(len(new_dirs))
|
|
208
|
+
for d, child_depth in new_dirs:
|
|
209
|
+
dir_queue.put((d, child_depth))
|
|
210
|
+
state.complete()
|
|
211
|
+
|
|
212
|
+
def _scan_dir(
|
|
213
|
+
self, dir_path: Path, depth: int, result_queue: Queue
|
|
214
|
+
) -> list[tuple[Path, int]]:
|
|
215
|
+
"""Scan a single directory and collect results and subdirectories.
|
|
216
|
+
|
|
217
|
+
All entries are placed into result_queue. Subdirectories that are not
|
|
218
|
+
filtered by dirs are returned for re-enqueueing, unless max_depth has
|
|
219
|
+
been reached.
|
|
220
|
+
|
|
221
|
+
Args:
|
|
222
|
+
dir_path: Path to the directory to scan.
|
|
223
|
+
depth: Depth of dir_path relative to the scan root (root = 0).
|
|
224
|
+
result_queue: Queue to put discovered DirEntry objects into.
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
List of (subdirectory path, child depth) tuples to enqueue for
|
|
228
|
+
further scanning. Empty when at or beyond max_depth.
|
|
229
|
+
"""
|
|
230
|
+
new_dirs: list[tuple[Path, int]] = []
|
|
231
|
+
at_depth_limit = self._max_depth is not None and depth >= self._max_depth
|
|
232
|
+
logger.debug(f"Scanning: {dir_path} (depth={depth}, limit={self._max_depth!r})")
|
|
233
|
+
|
|
234
|
+
try:
|
|
235
|
+
with os.scandir(dir_path) as it:
|
|
236
|
+
for entry in it:
|
|
237
|
+
try:
|
|
238
|
+
is_dir = entry.is_dir(follow_symlinks=False)
|
|
239
|
+
except OSError as e:
|
|
240
|
+
logger.warning(f"Skipping entry {entry.path}: {e}")
|
|
241
|
+
continue
|
|
242
|
+
|
|
243
|
+
result_queue.put(entry)
|
|
244
|
+
|
|
245
|
+
if is_dir:
|
|
246
|
+
if (
|
|
247
|
+
self._filter_mode == FilterMode.IGNORE
|
|
248
|
+
and entry.name in self._dirs
|
|
249
|
+
):
|
|
250
|
+
logger.debug(f"Skipping ignored directory: {entry.path}")
|
|
251
|
+
elif (
|
|
252
|
+
self._filter_mode == FilterMode.INCLUDE
|
|
253
|
+
and entry.name not in self._dirs
|
|
254
|
+
):
|
|
255
|
+
logger.debug(
|
|
256
|
+
f"Skipping directory not in include list: {entry.path}"
|
|
257
|
+
)
|
|
258
|
+
elif at_depth_limit:
|
|
259
|
+
logger.debug(
|
|
260
|
+
f"Depth limit reached ({depth}/{self._max_depth}), "
|
|
261
|
+
f"not descending into: {entry.path}"
|
|
262
|
+
)
|
|
263
|
+
else:
|
|
264
|
+
logger.debug(f"Queuing subdirectory: {entry.path}")
|
|
265
|
+
new_dirs.append((Path(entry.path), depth + 1))
|
|
266
|
+
|
|
267
|
+
except PermissionError:
|
|
268
|
+
logger.warning(f"Permission denied: {dir_path}")
|
|
269
|
+
except OSError as e:
|
|
270
|
+
logger.warning(f"Cannot scan {dir_path}: {e}")
|
|
271
|
+
|
|
272
|
+
logger.debug(
|
|
273
|
+
f"Finished scanning: {dir_path} — found {len(new_dirs)} subdirectories"
|
|
274
|
+
)
|
|
275
|
+
return new_dirs
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Callable, Iterator
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from enum import Enum
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FilterMode(Enum):
    """How a name set or pattern is applied by the crawler and ScanFilter.

    IGNORE: items matching the set/pattern are excluded; everything else passes.
    INCLUDE: only items matching the set/pattern pass.
    """

    IGNORE = "ignore"
    INCLUDE = "include"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ScanFilter:
    """Post-scan filter wrapped around a scanner's entry iterator.

    Operates purely on the entries a scanner yields — it has no influence on
    traversal (depth limits and which directories are recursed into are the
    scanner's job).

    Example:
        # Only yield .pdf and .pptx files:
        f = ScanFilter(extensions={".pdf", ".pptx"})

        # Yield everything except .pdf:
        f = ScanFilter(extensions={".pdf"}, extensions_mode="ignore")

        # Yield entries whose name does NOT match a pattern:
        f = ScanFilter(name_pattern=".*", name_pattern_mode="ignore")

        for entry in f.apply(scanner.scan(root)):
            print(entry.path)
    """

    def __init__(
        self,
        only_files: bool = False,
        only_dirs: bool = False,
        extensions: set[str] | None = None,
        extensions_mode: FilterMode = FilterMode.INCLUDE,
        name_pattern: str | None = None,
        name_pattern_mode: FilterMode = FilterMode.INCLUDE,
        custom: Callable[[os.DirEntry], bool] | None = None,
    ) -> None:
        """Initialises the ScanFilter.

        Args:
            only_files: If True, only yield file entries.
            only_dirs: If True, only yield directory entries. Mutually
                exclusive with only_files.
            extensions: File extensions to filter on (e.g. {".pdf", ".py"}),
                compared case-insensitively. Directory entries are exempt.
            extensions_mode: How extensions is applied:
                - FilterMode.INCLUDE — keep only files whose extension is in extensions.
                - FilterMode.IGNORE — drop files whose extension is in extensions.
            name_pattern: Glob pattern matched against entry.name
                (e.g. ".*", "*.min.*").
            name_pattern_mode: How name_pattern is applied:
                - FilterMode.INCLUDE — keep only entries whose name matches.
                - FilterMode.IGNORE — drop entries whose name matches.
            custom: Optional predicate receiving a DirEntry; return True to
                keep, False to drop. Applied last, after all other filters.

        Raises:
            ValueError: If only_files and only_dirs are both True.
        """
        if only_files and only_dirs:
            raise ValueError("only_files and only_dirs are mutually exclusive")

        self._only_files = only_files
        self._only_dirs = only_dirs
        # Normalise extensions once, up front; an empty/None set disables the check.
        self._extensions = {ext.lower() for ext in extensions} if extensions else None
        self._extensions_mode = extensions_mode
        self._name_pattern = name_pattern
        self._name_pattern_mode = name_pattern_mode
        self._custom = custom

        logger.debug(
            f"ScanFilter initialised — "
            f"only_files={self._only_files}, "
            f"only_dirs={self._only_dirs}, "
            f"extensions={self._extensions}, "
            f"extensions_mode={self._extensions_mode!r}, "
            f"name_pattern={self._name_pattern!r}, "
            f"name_pattern_mode={self._name_pattern_mode!r}, "
            f"custom={'provided' if custom else 'None'}"
        )

    def apply(self, entries: Iterator[os.DirEntry]) -> Iterator[os.DirEntry]:
        """Apply all configured filters to an entry iterator.

        Args:
            entries: Iterator of DirEntry objects, typically from a scanner.

        Yields:
            os.DirEntry: Entries that pass all active filters.
        """
        for entry in entries:
            if self._passes(entry):
                yield entry

    def _passes(self, entry: os.DirEntry) -> bool:
        """Return True when entry survives every active filter (logs rejections)."""
        if self._only_files and not entry.is_file(follow_symlinks=False):
            logger.debug(f"ScanFilter: skipping non-file: {entry.path}")
            return False
        if self._only_dirs and not entry.is_dir(follow_symlinks=False):
            logger.debug(f"ScanFilter: skipping non-directory: {entry.path}")
            return False

        # Extension checks never apply to directories.
        if self._extensions and not entry.is_dir(follow_symlinks=False):
            ext = os.path.splitext(entry.name)[1].lower()
            hit = ext in self._extensions
            if self._extensions_mode == FilterMode.INCLUDE and not hit:
                logger.debug(
                    f"ScanFilter: extension {ext!r} not in include list: {entry.path}"
                )
                return False
            if self._extensions_mode == FilterMode.IGNORE and hit:
                logger.debug(
                    f"ScanFilter: extension {ext!r} in ignore list: {entry.path}"
                )
                return False

        if self._name_pattern:
            hit = fnmatch.fnmatch(entry.name, self._name_pattern)
            if self._name_pattern_mode == FilterMode.INCLUDE and not hit:
                logger.debug(
                    f"ScanFilter: name {entry.name!r} doesn't match include "
                    f"pattern {self._name_pattern!r}: {entry.path}"
                )
                return False
            if self._name_pattern_mode == FilterMode.IGNORE and hit:
                logger.debug(
                    f"ScanFilter: name {entry.name!r} matches ignore "
                    f"pattern {self._name_pattern!r}: {entry.path}"
                )
                return False

        if self._custom and not self._custom(entry):
            logger.debug(f"ScanFilter: rejected by custom filter: {entry.path}")
            return False

        return True
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(slots=True, frozen=True)
|
|
9
|
+
class FileEntry:
|
|
10
|
+
name: str
|
|
11
|
+
extension: str | None
|
|
12
|
+
path: str
|
|
13
|
+
dir_path: str
|
|
14
|
+
size: int
|
|
15
|
+
created_at: datetime
|
|
16
|
+
modified_at: datetime
|
|
17
|
+
|
|
18
|
+
@classmethod
|
|
19
|
+
def from_dir_entry(cls, entry: os.DirEntry) -> "FileEntry":
|
|
20
|
+
stat = entry.stat(follow_symlinks=False)
|
|
21
|
+
name, ext = os.path.splitext(entry.name)
|
|
22
|
+
return cls(
|
|
23
|
+
name=name,
|
|
24
|
+
extension=ext[1:].lower() if ext else None,
|
|
25
|
+
path=entry.path,
|
|
26
|
+
dir_path=os.path.dirname(entry.path),
|
|
27
|
+
size=stat.st_size,
|
|
28
|
+
created_at=datetime.fromtimestamp(stat.st_ctime),
|
|
29
|
+
modified_at=datetime.fromtimestamp(stat.st_mtime),
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(slots=True, frozen=True)
|
|
34
|
+
class DirectoryEntry:
|
|
35
|
+
name: str
|
|
36
|
+
path: str
|
|
37
|
+
parent_path: str
|
|
38
|
+
created_at: datetime
|
|
39
|
+
modified_at: datetime
|
|
40
|
+
|
|
41
|
+
@classmethod
|
|
42
|
+
def from_dir_entry(cls, entry: os.DirEntry) -> "DirectoryEntry":
|
|
43
|
+
stat = entry.stat(follow_symlinks=False)
|
|
44
|
+
return cls(
|
|
45
|
+
name=entry.name,
|
|
46
|
+
path=entry.path,
|
|
47
|
+
parent_path=os.path.dirname(entry.path),
|
|
48
|
+
created_at=datetime.fromtimestamp(stat.st_ctime),
|
|
49
|
+
modified_at=datetime.fromtimestamp(stat.st_mtime),
|
|
50
|
+
)
|