vcti-path-format 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vcti_path_format-1.2.0/LICENSE +8 -0
- vcti_path_format-1.2.0/PKG-INFO +274 -0
- vcti_path_format-1.2.0/README.md +257 -0
- vcti_path_format-1.2.0/pyproject.toml +41 -0
- vcti_path_format-1.2.0/setup.cfg +4 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/__init__.py +29 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/descriptor.py +94 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/evaluator/__init__.py +29 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/evaluator/base.py +87 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/evaluator/heuristic.py +367 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/feature_validator/__init__.py +23 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/feature_validator/base.py +88 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/feature_validator/extension.py +55 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/feature_validator/magic_bytes.py +62 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/feature_validator/registry.py +53 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/identifier.py +178 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/py.typed +0 -0
- vcti_path_format-1.2.0/src/vcti/pathformat/registry.py +30 -0
- vcti_path_format-1.2.0/src/vcti_path_format.egg-info/PKG-INFO +274 -0
- vcti_path_format-1.2.0/src/vcti_path_format.egg-info/SOURCES.txt +33 -0
- vcti_path_format-1.2.0/src/vcti_path_format.egg-info/dependency_links.txt +1 -0
- vcti_path_format-1.2.0/src/vcti_path_format.egg-info/requires.txt +9 -0
- vcti_path_format-1.2.0/src/vcti_path_format.egg-info/top_level.txt +1 -0
- vcti_path_format-1.2.0/src/vcti_path_format.egg-info/zip-safe +1 -0
- vcti_path_format-1.2.0/tests/test_descriptor.py +64 -0
- vcti_path_format-1.2.0/tests/test_evaluator_base.py +63 -0
- vcti_path_format-1.2.0/tests/test_extension_validator.py +62 -0
- vcti_path_format-1.2.0/tests/test_feature_validator_base.py +103 -0
- vcti_path_format-1.2.0/tests/test_heuristic_evaluator.py +291 -0
- vcti_path_format-1.2.0/tests/test_identifier.py +90 -0
- vcti_path_format-1.2.0/tests/test_integration.py +156 -0
- vcti_path_format-1.2.0/tests/test_magic_bytes_validator.py +76 -0
- vcti_path_format-1.2.0/tests/test_registry.py +44 -0
- vcti_path_format-1.2.0/tests/test_validator_registry.py +45 -0
- vcti_path_format-1.2.0/tests/test_version.py +15 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
Copyright (c) 2018-2026 Visual Collaboration Technologies Inc.
|
|
2
|
+
All Rights Reserved.
|
|
3
|
+
|
|
4
|
+
This software is proprietary and confidential. Unauthorized copying,
|
|
5
|
+
distribution, or use of this software, via any medium, is strictly
|
|
6
|
+
prohibited. Access is granted only to authorized VCollab developers
|
|
7
|
+
and individuals explicitly authorized by Visual Collaboration
|
|
8
|
+
Technologies Inc.
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vcti-path-format
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: File format identification framework with heuristic evaluators and feature validators for Python
|
|
5
|
+
Author: Visual Collaboration Technologies Inc.
|
|
6
|
+
Requires-Python: <3.15,>=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: vcti-plugin-catalog>=1.0.1
|
|
10
|
+
Requires-Dist: vcti-lookup>=1.0.1
|
|
11
|
+
Provides-Extra: test
|
|
12
|
+
Requires-Dist: pytest; extra == "test"
|
|
13
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
14
|
+
Provides-Extra: lint
|
|
15
|
+
Requires-Dist: ruff; extra == "lint"
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# Path Format
|
|
19
|
+
|
|
20
|
+
File format identification framework with heuristic evaluators and feature validators for Python.
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install "vcti-path-format>=1.2.0"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### In `pyproject.toml` dependencies
|
|
29
|
+
|
|
30
|
+
```toml
|
|
31
|
+
dependencies = [
|
|
32
|
+
"vcti-path-format>=1.2.0",
|
|
33
|
+
]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from vcti.pathformat import (
|
|
43
|
+
FormatDescriptor,
|
|
44
|
+
FormatIdentifier,
|
|
45
|
+
FormatRegistry,
|
|
46
|
+
MatchConfidence,
|
|
47
|
+
)
|
|
48
|
+
from vcti.pathformat.evaluator import HeuristicEvaluator
|
|
49
|
+
|
|
50
|
+
# Define a format descriptor with validators
|
|
51
|
+
hdf5_descriptor = FormatDescriptor(
|
|
52
|
+
id="hdf5-file",
|
|
53
|
+
name="HDF5 File",
|
|
54
|
+
evaluator=(
|
|
55
|
+
HeuristicEvaluator()
|
|
56
|
+
.check_magic_bytes(b"\x89HDF\r\n\x1a\n") # GATE
|
|
57
|
+
.check_extension([".h5", ".hdf5", ".he5"]) # EVIDENCE
|
|
58
|
+
),
|
|
59
|
+
attributes={"path_type": "file", "structure": "hdf5"},
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Register in a format registry
|
|
63
|
+
registry = FormatRegistry()
|
|
64
|
+
registry.register(hdf5_descriptor)
|
|
65
|
+
|
|
66
|
+
# Identify a file
|
|
67
|
+
identifier = FormatIdentifier(registry)
|
|
68
|
+
results = identifier.identify_file_format(Path("data.h5"))
|
|
69
|
+
|
|
70
|
+
for result in results:
|
|
71
|
+
print(f"{result.descriptor.name}: {result.confidence.name}")
|
|
72
|
+
|
|
73
|
+
# Get best match above a confidence threshold
|
|
74
|
+
best = identifier.get_best_match(
|
|
75
|
+
Path("data.h5"),
|
|
76
|
+
min_confidence=MatchConfidence.LIKELY,
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Core Concepts
|
|
83
|
+
|
|
84
|
+
### FormatDescriptor
|
|
85
|
+
|
|
86
|
+
Extends `Descriptor[Evaluator]` from vcti-plugin-catalog. Wraps an evaluator
|
|
87
|
+
with format metadata and attributes.
|
|
88
|
+
|
|
89
|
+
### FormatRegistry
|
|
90
|
+
|
|
91
|
+
Extends `Registry[FormatDescriptor]`. Central catalog of known formats with
|
|
92
|
+
attribute-based filtering via `registry.lookup`.
|
|
93
|
+
|
|
94
|
+
### FormatIdentifier
|
|
95
|
+
|
|
96
|
+
Evaluates a path against all (or filtered) registered formats and returns
|
|
97
|
+
results sorted by confidence.
|
|
98
|
+
|
|
99
|
+
### HeuristicEvaluator
|
|
100
|
+
|
|
101
|
+
Builder-pattern evaluator that aggregates validation evidence:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
evaluator = (
|
|
105
|
+
HeuristicEvaluator()
|
|
106
|
+
.check_magic_bytes(b"\x89PNG\r\n\x1a\n") # GATE
|
|
107
|
+
.check_extension([".png"]) # EVIDENCE
|
|
108
|
+
.add_validator(custom_validator) # Custom
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Heuristic rules:**
|
|
113
|
+
- Failed GATE -> `CERTAINLY_NOT`
|
|
114
|
+
- All passed + GATE present -> `DEFINITE`
|
|
115
|
+
- All passed + no GATE -> `LIKELY`
|
|
116
|
+
- Some EVIDENCE failed -> `UNLIKELY`
|
|
117
|
+
- No validators -> `CANT_EVALUATE`
|
|
118
|
+
|
|
119
|
+
### Feature Validators
|
|
120
|
+
|
|
121
|
+
| Validator | Role | Tier | Checks |
|
|
122
|
+
|-----------|------|------|--------|
|
|
123
|
+
| `MagicBytesValidator` | GATE | IDENTIFICATION | File signature bytes |
|
|
124
|
+
| `ExtensionValidator` | EVIDENCE | IDENTIFICATION | File extension |
|
|
125
|
+
|
|
126
|
+
Custom validators implement the `FeatureValidator` protocol.
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Validation Tiers
|
|
131
|
+
|
|
132
|
+
Control evaluation depth with `max_tier`:
|
|
133
|
+
|
|
134
|
+
| Tier | Cost | Examples |
|
|
135
|
+
|------|------|---------|
|
|
136
|
+
| `IDENTIFICATION` | Cheap | Magic bytes, file extension |
|
|
137
|
+
| `STRUCTURE` | Medium | Schema validation, header parsing |
|
|
138
|
+
| `SEMANTIC` | Expensive | Content analysis, business logic |
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from vcti.pathformat import ValidationTier
|
|
142
|
+
|
|
143
|
+
# Only run cheap checks
|
|
144
|
+
results = identifier.identify_file_format(path, max_tier=ValidationTier.IDENTIFICATION)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Custom Validators
|
|
150
|
+
|
|
151
|
+
Implement the `FeatureValidator` protocol to add domain-specific checks:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from pathlib import Path
|
|
155
|
+
from vcti.pathformat.feature_validator import (
|
|
156
|
+
FeatureValidator,
|
|
157
|
+
ValidationResult,
|
|
158
|
+
ValidationTier,
|
|
159
|
+
ValidatorRole,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
class HeaderValidator:
    """Checks that the first line of a text file equals an expected header.

    Only the first line is read, so the check stays cheap on large files.
    """

    id = "header-check"
    description = "Header line validator"
    role = ValidatorRole.EVIDENCE
    tier = ValidationTier.STRUCTURE

    def __init__(self, expected_header: str):
        # Header text compared against the stripped first line of the file.
        self.expected_header = expected_header

    def validate(self, path: Path) -> ValidationResult:
        """Compare the first line of ``path`` with the expected header.

        Args:
            path: File whose first line is inspected.

        Returns:
            A ValidationResult whose ``is_passed`` is True only when the
            stripped first line equals ``expected_header``. Unreadable or
            non-UTF-8 input is reported as a failed check, not raised.
        """
        try:
            # Read just one line instead of loading the whole file into memory.
            with path.open(encoding="utf-8") as handle:
                first_line = handle.readline()
        except (OSError, UnicodeDecodeError):
            is_passed = False
        else:
            is_passed = first_line.strip() == self.expected_header
        return ValidationResult(
            validator_id=self.id,
            role=self.role,
            is_passed=is_passed,
            details=f"Header {'matches' if is_passed else 'mismatch'}",
        )
|
|
185
|
+
|
|
186
|
+
# Use with the builder pattern
|
|
187
|
+
evaluator = (
|
|
188
|
+
HeuristicEvaluator()
|
|
189
|
+
.check_extension([".csv"])
|
|
190
|
+
.add_validator(HeaderValidator("id,name,value"))
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Evaluator Caching
|
|
197
|
+
|
|
198
|
+
`HeuristicEvaluator` includes an LRU cache keyed by `(path, max_tier)`:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
# Default: 128 entries
|
|
202
|
+
evaluator = HeuristicEvaluator(cache_size=128)
|
|
203
|
+
|
|
204
|
+
# Disable caching
|
|
205
|
+
evaluator = HeuristicEvaluator(cache_size=0)
|
|
206
|
+
|
|
207
|
+
# Bypass cache for a single call
|
|
208
|
+
report = descriptor.evaluate(path, use_cache=False)
|
|
209
|
+
|
|
210
|
+
# Inspect and manage
|
|
211
|
+
info = evaluator.cache_info() # (hits, misses, maxsize, currsize) or None
|
|
212
|
+
evaluator.clear_cache()
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Cache entries become stale if file contents change. Call `clear_cache()` after
|
|
216
|
+
known file modifications, or pass `use_cache=False` for one-off re-evaluation.
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Pre-filtering with Rules
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from vcti.lookup import Rule
|
|
224
|
+
|
|
225
|
+
# Only evaluate formats with structure="hdf5"
|
|
226
|
+
results = identifier.identify_file_format(
|
|
227
|
+
path,
|
|
228
|
+
rules=[Rule("structure", "==", "hdf5")],
|
|
229
|
+
)
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Error Handling
|
|
235
|
+
|
|
236
|
+
The framework raises typed exceptions:
|
|
237
|
+
|
|
238
|
+
| Exception | When |
|
|
239
|
+
|-----------|------|
|
|
240
|
+
| `FileNotFoundError` | Path does not exist |
|
|
241
|
+
| `PathAccessError` | Path is not a file or directory, or cannot be read |
|
|
242
|
+
| `EvaluatorError` | Base class for evaluator errors |
|
|
243
|
+
| `ValidationError` | A validator raised an unexpected exception |
|
|
244
|
+
| `InvalidValidatorError` | Invalid validator passed to builder |
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from vcti.pathformat import PathAccessError
|
|
248
|
+
|
|
249
|
+
try:
|
|
250
|
+
results = identifier.identify_file_format(path)
|
|
251
|
+
except FileNotFoundError:
|
|
252
|
+
print("File not found")
|
|
253
|
+
except PathAccessError as e:
|
|
254
|
+
print(f"Cannot access path: {e}")
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Ecosystem
|
|
260
|
+
|
|
261
|
+
This package is the identification engine in a three-repo system:
|
|
262
|
+
|
|
263
|
+
| Package | Role |
|
|
264
|
+
|---------|------|
|
|
265
|
+
| **vcti-path-format** | Framework: evaluators, validators, registry, identifier |
|
|
266
|
+
| [vcti-path-format-attributes](https://pypi.org/project/vcti-path-format-attributes/) | Vocabulary: standardized attribute enums |
|
|
267
|
+
| [vcti-path-format-descriptors](https://pypi.org/project/vcti-path-format-descriptors/) | Built-in format definitions (HDF5, CAX, etc.) |
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Dependencies
|
|
272
|
+
|
|
273
|
+
- [vcti-plugin-catalog](https://pypi.org/project/vcti-plugin-catalog/) (>=1.0.1) — descriptor/registry framework
|
|
274
|
+
- [vcti-lookup](https://pypi.org/project/vcti-lookup/) (>=1.0.1) — attribute-based filtering
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# Path Format
|
|
2
|
+
|
|
3
|
+
File format identification framework with heuristic evaluators and feature validators for Python.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install "vcti-path-format>=1.2.0"
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
### In `pyproject.toml` dependencies
|
|
12
|
+
|
|
13
|
+
```toml
|
|
14
|
+
dependencies = [
|
|
15
|
+
"vcti-path-format>=1.2.0",
|
|
16
|
+
]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from vcti.pathformat import (
|
|
26
|
+
FormatDescriptor,
|
|
27
|
+
FormatIdentifier,
|
|
28
|
+
FormatRegistry,
|
|
29
|
+
MatchConfidence,
|
|
30
|
+
)
|
|
31
|
+
from vcti.pathformat.evaluator import HeuristicEvaluator
|
|
32
|
+
|
|
33
|
+
# Define a format descriptor with validators
|
|
34
|
+
hdf5_descriptor = FormatDescriptor(
|
|
35
|
+
id="hdf5-file",
|
|
36
|
+
name="HDF5 File",
|
|
37
|
+
evaluator=(
|
|
38
|
+
HeuristicEvaluator()
|
|
39
|
+
.check_magic_bytes(b"\x89HDF\r\n\x1a\n") # GATE
|
|
40
|
+
.check_extension([".h5", ".hdf5", ".he5"]) # EVIDENCE
|
|
41
|
+
),
|
|
42
|
+
attributes={"path_type": "file", "structure": "hdf5"},
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# Register in a format registry
|
|
46
|
+
registry = FormatRegistry()
|
|
47
|
+
registry.register(hdf5_descriptor)
|
|
48
|
+
|
|
49
|
+
# Identify a file
|
|
50
|
+
identifier = FormatIdentifier(registry)
|
|
51
|
+
results = identifier.identify_file_format(Path("data.h5"))
|
|
52
|
+
|
|
53
|
+
for result in results:
|
|
54
|
+
print(f"{result.descriptor.name}: {result.confidence.name}")
|
|
55
|
+
|
|
56
|
+
# Get best match above a confidence threshold
|
|
57
|
+
best = identifier.get_best_match(
|
|
58
|
+
Path("data.h5"),
|
|
59
|
+
min_confidence=MatchConfidence.LIKELY,
|
|
60
|
+
)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Core Concepts
|
|
66
|
+
|
|
67
|
+
### FormatDescriptor
|
|
68
|
+
|
|
69
|
+
Extends `Descriptor[Evaluator]` from vcti-plugin-catalog. Wraps an evaluator
|
|
70
|
+
with format metadata and attributes.
|
|
71
|
+
|
|
72
|
+
### FormatRegistry
|
|
73
|
+
|
|
74
|
+
Extends `Registry[FormatDescriptor]`. Central catalog of known formats with
|
|
75
|
+
attribute-based filtering via `registry.lookup`.
|
|
76
|
+
|
|
77
|
+
### FormatIdentifier
|
|
78
|
+
|
|
79
|
+
Evaluates a path against all (or filtered) registered formats and returns
|
|
80
|
+
results sorted by confidence.
|
|
81
|
+
|
|
82
|
+
### HeuristicEvaluator
|
|
83
|
+
|
|
84
|
+
Builder-pattern evaluator that aggregates validation evidence:
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
evaluator = (
|
|
88
|
+
HeuristicEvaluator()
|
|
89
|
+
.check_magic_bytes(b"\x89PNG\r\n\x1a\n") # GATE
|
|
90
|
+
.check_extension([".png"]) # EVIDENCE
|
|
91
|
+
.add_validator(custom_validator) # Custom
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**Heuristic rules:**
|
|
96
|
+
- Failed GATE -> `CERTAINLY_NOT`
|
|
97
|
+
- All passed + GATE present -> `DEFINITE`
|
|
98
|
+
- All passed + no GATE -> `LIKELY`
|
|
99
|
+
- Some EVIDENCE failed -> `UNLIKELY`
|
|
100
|
+
- No validators -> `CANT_EVALUATE`
|
|
101
|
+
|
|
102
|
+
### Feature Validators
|
|
103
|
+
|
|
104
|
+
| Validator | Role | Tier | Checks |
|
|
105
|
+
|-----------|------|------|--------|
|
|
106
|
+
| `MagicBytesValidator` | GATE | IDENTIFICATION | File signature bytes |
|
|
107
|
+
| `ExtensionValidator` | EVIDENCE | IDENTIFICATION | File extension |
|
|
108
|
+
|
|
109
|
+
Custom validators implement the `FeatureValidator` protocol.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Validation Tiers
|
|
114
|
+
|
|
115
|
+
Control evaluation depth with `max_tier`:
|
|
116
|
+
|
|
117
|
+
| Tier | Cost | Examples |
|
|
118
|
+
|------|------|---------|
|
|
119
|
+
| `IDENTIFICATION` | Cheap | Magic bytes, file extension |
|
|
120
|
+
| `STRUCTURE` | Medium | Schema validation, header parsing |
|
|
121
|
+
| `SEMANTIC` | Expensive | Content analysis, business logic |
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from vcti.pathformat import ValidationTier
|
|
125
|
+
|
|
126
|
+
# Only run cheap checks
|
|
127
|
+
results = identifier.identify_file_format(path, max_tier=ValidationTier.IDENTIFICATION)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Custom Validators
|
|
133
|
+
|
|
134
|
+
Implement the `FeatureValidator` protocol to add domain-specific checks:
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from pathlib import Path
|
|
138
|
+
from vcti.pathformat.feature_validator import (
|
|
139
|
+
FeatureValidator,
|
|
140
|
+
ValidationResult,
|
|
141
|
+
ValidationTier,
|
|
142
|
+
ValidatorRole,
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
class HeaderValidator:
    """Checks that the first line of a text file equals an expected header.

    Only the first line is read, so the check stays cheap on large files.
    """

    id = "header-check"
    description = "Header line validator"
    role = ValidatorRole.EVIDENCE
    tier = ValidationTier.STRUCTURE

    def __init__(self, expected_header: str):
        # Header text compared against the stripped first line of the file.
        self.expected_header = expected_header

    def validate(self, path: Path) -> ValidationResult:
        """Compare the first line of ``path`` with the expected header.

        Args:
            path: File whose first line is inspected.

        Returns:
            A ValidationResult whose ``is_passed`` is True only when the
            stripped first line equals ``expected_header``. Unreadable or
            non-UTF-8 input is reported as a failed check, not raised.
        """
        try:
            # Read just one line instead of loading the whole file into memory.
            with path.open(encoding="utf-8") as handle:
                first_line = handle.readline()
        except (OSError, UnicodeDecodeError):
            is_passed = False
        else:
            is_passed = first_line.strip() == self.expected_header
        return ValidationResult(
            validator_id=self.id,
            role=self.role,
            is_passed=is_passed,
            details=f"Header {'matches' if is_passed else 'mismatch'}",
        )
|
|
168
|
+
|
|
169
|
+
# Use with the builder pattern
|
|
170
|
+
evaluator = (
|
|
171
|
+
HeuristicEvaluator()
|
|
172
|
+
.check_extension([".csv"])
|
|
173
|
+
.add_validator(HeaderValidator("id,name,value"))
|
|
174
|
+
)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Evaluator Caching
|
|
180
|
+
|
|
181
|
+
`HeuristicEvaluator` includes an LRU cache keyed by `(path, max_tier)`:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
# Default: 128 entries
|
|
185
|
+
evaluator = HeuristicEvaluator(cache_size=128)
|
|
186
|
+
|
|
187
|
+
# Disable caching
|
|
188
|
+
evaluator = HeuristicEvaluator(cache_size=0)
|
|
189
|
+
|
|
190
|
+
# Bypass cache for a single call
|
|
191
|
+
report = descriptor.evaluate(path, use_cache=False)
|
|
192
|
+
|
|
193
|
+
# Inspect and manage
|
|
194
|
+
info = evaluator.cache_info() # (hits, misses, maxsize, currsize) or None
|
|
195
|
+
evaluator.clear_cache()
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Cache entries become stale if file contents change. Call `clear_cache()` after
|
|
199
|
+
known file modifications, or pass `use_cache=False` for one-off re-evaluation.
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Pre-filtering with Rules
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from vcti.lookup import Rule
|
|
207
|
+
|
|
208
|
+
# Only evaluate formats with structure="hdf5"
|
|
209
|
+
results = identifier.identify_file_format(
|
|
210
|
+
path,
|
|
211
|
+
rules=[Rule("structure", "==", "hdf5")],
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Error Handling
|
|
218
|
+
|
|
219
|
+
The framework raises typed exceptions:
|
|
220
|
+
|
|
221
|
+
| Exception | When |
|
|
222
|
+
|-----------|------|
|
|
223
|
+
| `FileNotFoundError` | Path does not exist |
|
|
224
|
+
| `PathAccessError` | Path is not a file or directory, or cannot be read |
|
|
225
|
+
| `EvaluatorError` | Base class for evaluator errors |
|
|
226
|
+
| `ValidationError` | A validator raised an unexpected exception |
|
|
227
|
+
| `InvalidValidatorError` | Invalid validator passed to builder |
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
from vcti.pathformat import PathAccessError
|
|
231
|
+
|
|
232
|
+
try:
|
|
233
|
+
results = identifier.identify_file_format(path)
|
|
234
|
+
except FileNotFoundError:
|
|
235
|
+
print("File not found")
|
|
236
|
+
except PathAccessError as e:
|
|
237
|
+
print(f"Cannot access path: {e}")
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## Ecosystem
|
|
243
|
+
|
|
244
|
+
This package is the identification engine in a three-repo system:
|
|
245
|
+
|
|
246
|
+
| Package | Role |
|
|
247
|
+
|---------|------|
|
|
248
|
+
| **vcti-path-format** | Framework: evaluators, validators, registry, identifier |
|
|
249
|
+
| [vcti-path-format-attributes](https://pypi.org/project/vcti-path-format-attributes/) | Vocabulary: standardized attribute enums |
|
|
250
|
+
| [vcti-path-format-descriptors](https://pypi.org/project/vcti-path-format-descriptors/) | Built-in format definitions (HDF5, CAX, etc.) |
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Dependencies
|
|
255
|
+
|
|
256
|
+
- [vcti-plugin-catalog](https://pypi.org/project/vcti-plugin-catalog/) (>=1.0.1) — descriptor/registry framework
|
|
257
|
+
- [vcti-lookup](https://pypi.org/project/vcti-lookup/) (>=1.0.1) — attribute-based filtering
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vcti-path-format"
|
|
7
|
+
version = "1.2.0"
|
|
8
|
+
description = "File format identification framework with heuristic evaluators and feature validators for Python"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{name = "Visual Collaboration Technologies Inc."}
|
|
12
|
+
]
|
|
13
|
+
requires-python = ">=3.12,<3.15"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"vcti-plugin-catalog>=1.0.1",
|
|
16
|
+
"vcti-lookup>=1.0.1",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.optional-dependencies]
|
|
20
|
+
test = ["pytest", "pytest-cov"]
|
|
21
|
+
lint = ["ruff"]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
where = ["src"]
|
|
25
|
+
include = ["vcti.pathformat", "vcti.pathformat.*"]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.package-data]
|
|
28
|
+
"vcti.pathformat" = ["py.typed"]
|
|
29
|
+
|
|
30
|
+
[tool.setuptools]
|
|
31
|
+
zip-safe = true
|
|
32
|
+
|
|
33
|
+
[tool.pytest.ini_options]
|
|
34
|
+
addopts = "--cov=vcti.pathformat --cov-report=term-missing --cov-fail-under=95"
|
|
35
|
+
|
|
36
|
+
[tool.ruff]
|
|
37
|
+
target-version = "py312"
|
|
38
|
+
line-length = 99
|
|
39
|
+
|
|
40
|
+
[tool.ruff.lint]
|
|
41
|
+
select = ["E", "F", "W", "I", "UP"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""vcti.pathformat — File format identification framework with heuristic evaluators."""
|
|
4
|
+
|
|
5
|
+
from importlib.metadata import version
|
|
6
|
+
|
|
7
|
+
from .descriptor import FormatDescriptor
|
|
8
|
+
from .evaluator.base import EvaluationReport, Evaluator, MatchConfidence
|
|
9
|
+
from .evaluator.heuristic import PathAccessError
|
|
10
|
+
from .feature_validator.base import ValidationResult, ValidationTier
|
|
11
|
+
from .identifier import FormatIdentifier, IdentificationResult, identify_file_format
|
|
12
|
+
from .registry import FormatRegistry
|
|
13
|
+
|
|
14
|
+
__version__ = version("vcti-path-format")
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"__version__",
|
|
18
|
+
"EvaluationReport",
|
|
19
|
+
"Evaluator",
|
|
20
|
+
"FormatDescriptor",
|
|
21
|
+
"FormatIdentifier",
|
|
22
|
+
"FormatRegistry",
|
|
23
|
+
"IdentificationResult",
|
|
24
|
+
"MatchConfidence",
|
|
25
|
+
"PathAccessError",
|
|
26
|
+
"ValidationResult",
|
|
27
|
+
"ValidationTier",
|
|
28
|
+
"identify_file_format",
|
|
29
|
+
]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
|
|
2
|
+
# See LICENSE for details.
|
|
3
|
+
"""Format descriptor for supported file or folder formats.
|
|
4
|
+
|
|
5
|
+
Defines the FormatDescriptor class, which encapsulates metadata, validation logic,
|
|
6
|
+
and attributes for a specific data format.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from vcti.plugincatalog import Descriptor
|
|
13
|
+
|
|
14
|
+
from .evaluator.base import EvaluationReport, Evaluator, MatchConfidence
|
|
15
|
+
from .feature_validator.base import ValidationTier
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FormatDescriptor(Descriptor[Evaluator]):
    """Catalog entry for one supported data format (file or folder).

    Specializes the generic ``Descriptor`` so that the wrapped instance is
    always an ``Evaluator``, and exposes shortcuts for running that evaluator
    against a path.

    Args:
        id: Unique identifier for the format (e.g., 'csv', 'hdf5-file').
        name: Human-readable name for the format.
        evaluator: Evaluator instance used to determine match confidence.
        description: Optional description of the format.
        attributes: Optional format-specific attributes as key-value pairs.

    Raises:
        TypeError: If ``evaluator`` is not an ``Evaluator`` instance.
    """

    def __init__(
        self,
        id: str,
        name: str,
        evaluator: Evaluator,
        description: str | None = None,
        attributes: dict[str, Any] | None = None,
    ):
        # Reject anything that is not an Evaluator before touching the base class.
        evaluator_ok = isinstance(evaluator, Evaluator)
        if not evaluator_ok:
            message = f"evaluator must be an Evaluator instance, got {type(evaluator).__name__}"
            raise TypeError(message)

        # The evaluator is stored by the base class as the descriptor's ``instance``.
        super().__init__(
            id=id,
            name=name,
            instance=evaluator,
            description=description,
            attributes=attributes,
        )

    @property
    def evaluator(self) -> Evaluator:
        """The evaluator wrapped by this descriptor (the base-class ``instance``)."""
        wrapped = self.instance
        return wrapped

    def evaluate(
        self,
        path: Path,
        max_tier: ValidationTier = ValidationTier.SEMANTIC,
        use_cache: bool = True,
    ) -> EvaluationReport:
        """Run the wrapped evaluator against ``path``.

        Args:
            path: Path to the file or folder to evaluate.
            max_tier: Maximum validation tier to execute (inclusive).
            use_cache: If True and the evaluator supports caching, use cached results.

        Returns:
            The evaluation report produced by the evaluator.
        """
        delegate = self.evaluator
        report = delegate.evaluate(path, max_tier, use_cache=use_cache)
        return report

    def evaluate_confidence(
        self,
        path: Path,
        max_tier: ValidationTier = ValidationTier.SEMANTIC,
        use_cache: bool = True,
    ) -> MatchConfidence:
        """Evaluate ``path`` and return only the confidence from the report.

        Args:
            path: Path to the file or folder to evaluate.
            max_tier: Maximum validation tier to execute (inclusive).
            use_cache: If True and the evaluator supports caching, use cached results.

        Returns:
            The confidence level of the match.
        """
        report = self.evaluate(path, max_tier, use_cache)
        return report.confidence

    def __repr__(self) -> str:
        """Return a short debug representation showing the id and name."""
        return f"FormatDescriptor(id={self.id!r}, name={self.name!r})"
|