vcti-path-format 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """vcti.pathformat — File format identification framework with heuristic evaluators."""
4
+
5
+ from importlib.metadata import version
6
+
7
+ from .descriptor import FormatDescriptor
8
+ from .evaluator.base import EvaluationReport, Evaluator, MatchConfidence
9
+ from .evaluator.heuristic import PathAccessError
10
+ from .feature_validator.base import ValidationResult, ValidationTier
11
+ from .identifier import FormatIdentifier, IdentificationResult, identify_file_format
12
+ from .registry import FormatRegistry
13
+
14
# Version string of the installed "vcti-path-format" distribution, looked up
# from package metadata at import time.
__version__ = version("vcti-path-format")

# Public API re-exported from the package root.
__all__ = [
    "__version__",
    "EvaluationReport",
    "Evaluator",
    "FormatDescriptor",
    "FormatIdentifier",
    "FormatRegistry",
    "IdentificationResult",
    "MatchConfidence",
    "PathAccessError",
    "ValidationResult",
    "ValidationTier",
    "identify_file_format",
]
@@ -0,0 +1,94 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Format descriptor for supported file or folder formats.
4
+
5
+ Defines the FormatDescriptor class, which encapsulates metadata, validation logic,
6
+ and attributes for a specific data format.
7
+ """
8
+
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from vcti.plugincatalog import Descriptor
13
+
14
+ from .evaluator.base import EvaluationReport, Evaluator, MatchConfidence
15
+ from .feature_validator.base import ValidationTier
16
+
17
+
18
class FormatDescriptor(Descriptor[Evaluator]):
    """Descriptor for one supported data format (file or folder).

    Bundles the format's metadata with the Evaluator used to decide whether a
    path matches it. The evaluator is stored as the generic Descriptor's
    ``instance``.

    Args:
        id: Unique identifier for the format (e.g., 'csv', 'hdf5-file').
        name: Human-readable name for the format.
        evaluator: Evaluator instance for determining match confidence.
        description: Optional description of the format.
        attributes: Optional format-specific attributes as key-value pairs.

    Raises:
        TypeError: If ``evaluator`` is not an Evaluator instance.
    """

    def __init__(
        self,
        id: str,
        name: str,
        evaluator: Evaluator,
        description: str | None = None,
        attributes: dict[str, Any] | None = None,
    ):
        # Fail early with a clear message rather than storing an unusable instance.
        if not isinstance(evaluator, Evaluator):
            received = type(evaluator).__name__
            raise TypeError(f"evaluator must be an Evaluator instance, got {received}")

        super().__init__(
            id=id,
            name=name,
            instance=evaluator,
            description=description,
            attributes=attributes,
        )

    @property
    def evaluator(self) -> Evaluator:
        """The evaluator held by this descriptor (alias for ``instance``)."""
        return self.instance

    def evaluate(
        self,
        path: Path,
        max_tier: ValidationTier = ValidationTier.SEMANTIC,
        use_cache: bool = True,
    ) -> EvaluationReport:
        """Run this format's evaluator against ``path``.

        Args:
            path: Path to the file or folder to evaluate.
            max_tier: Maximum validation tier to execute (inclusive).
            use_cache: If True and evaluator supports caching, use cached results.

        Returns:
            The evaluation report for format matching.
        """
        report = self.evaluator.evaluate(path, max_tier, use_cache=use_cache)
        return report

    def evaluate_confidence(
        self,
        path: Path,
        max_tier: ValidationTier = ValidationTier.SEMANTIC,
        use_cache: bool = True,
    ) -> MatchConfidence:
        """Evaluate ``path`` and return only the confidence of the report.

        Args:
            path: Path to the file or folder to evaluate.
            max_tier: Maximum validation tier to execute (inclusive).
            use_cache: If True and evaluator supports caching, use cached results.

        Returns:
            The confidence level of the match.
        """
        report = self.evaluate(path, max_tier, use_cache)
        return report.confidence

    def __repr__(self) -> str:
        return "FormatDescriptor(id={!r}, name={!r})".format(self.id, self.name)
@@ -0,0 +1,29 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Evaluator package for file format matching.
4
+
5
+ Provides base evaluator interface and the heuristic evaluator implementation
6
+ for determining format match confidence from validation evidence.
7
+ """
8
+
9
+ from .base import EvaluationReport, Evaluator, MatchConfidence
10
+ from .heuristic import (
11
+ HEURISTIC_EVALUATOR_ID,
12
+ EvaluatorError,
13
+ HeuristicEvaluator,
14
+ InvalidValidatorError,
15
+ PathAccessError,
16
+ ValidationError,
17
+ )
18
+
19
# Public API of the evaluator package: the base types plus the heuristic
# evaluator and its exception hierarchy.
__all__ = [
    "EvaluationReport",
    "Evaluator",
    "EvaluatorError",
    "HEURISTIC_EVALUATOR_ID",
    "HeuristicEvaluator",
    "InvalidValidatorError",
    "MatchConfidence",
    "PathAccessError",
    "ValidationError",
]
@@ -0,0 +1,87 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Base classes for file format evaluators.
4
+
5
+ Defines the MatchConfidence enum, EvaluationReport dataclass, and Evaluator
6
+ abstract base class for assessing format match confidence.
7
+ """
8
+
9
+ from abc import ABC, abstractmethod
10
+ from dataclasses import dataclass, field
11
+ from enum import Enum
12
+ from pathlib import Path
13
+ from typing import ClassVar
14
+
15
+ from ..feature_validator.base import ValidationResult, ValidationTier
16
+
17
+
18
class MatchConfidence(Enum):
    """Confidence scale for a format match.

    Every member carries an explicit integer so the ordering stays stable;
    sorting and threshold checks compare members by ``.value``.

    Attributes:
        CERTAINLY_NOT: Format is definitely not a match.
        UNLIKELY: Format is unlikely to be a match.
        CANT_EVALUATE: Cannot evaluate with available evidence.
        LIKELY: Format is likely a match.
        DEFINITE: Format is definitely a match.
    """

    CERTAINLY_NOT = 1  # lowest value
    UNLIKELY = 2
    CANT_EVALUATE = 3  # sits between the negative and positive verdicts
    LIKELY = 4
    DEFINITE = 5  # highest value
37
+
38
+
39
@dataclass(frozen=True, slots=True)
class EvaluationReport:
    """Immutable outcome of a single evaluation.

    Attributes:
        confidence: The confidence level of the format match.
        details: Additional details about the evaluation.
        validator_results: Individual results from each validator that executed.
            Empty tuple when no validators ran or when produced by a non-heuristic
            evaluator.
    """

    confidence: MatchConfidence
    details: str
    # Defaults to an empty tuple when the producer supplies no per-validator results.
    validator_results: tuple[ValidationResult, ...] = field(default_factory=tuple)
54
+
55
+
56
class Evaluator(ABC):
    """Interface every evaluator implementation must provide.

    An evaluator weighs validation evidence for a path and reports the
    overall match confidence for a format.

    Attributes:
        id: Unique identifier of the evaluator.
        description: Description of the evaluator.
    """

    # Declared only; concrete subclasses are expected to assign these.
    id: ClassVar[str]
    description: ClassVar[str]

    @abstractmethod
    def evaluate(
        self,
        path: Path,
        max_tier: ValidationTier = ValidationTier.SEMANTIC,
        use_cache: bool = True,
    ) -> EvaluationReport:
        """Assess ``path`` and report how confidently it matches the format.

        Args:
            path: Path to the file or folder to evaluate.
            max_tier: Maximum validation tier to execute (inclusive).
            use_cache: If True and caching is supported, use cached results.

        Returns:
            Evaluation report with confidence and details.
        """
@@ -0,0 +1,367 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Heuristic evidence-based evaluator implementation.
4
+
5
+ Defines HeuristicEvaluator with rules for determining match confidence
6
+ based on validation evidence. Provides a builder pattern for constructing
7
+ evaluators with common validators.
8
+ """
9
+
10
+ import logging
11
+ import sys
12
+ from collections.abc import Sequence
13
+ from functools import lru_cache
14
+ from pathlib import Path
15
+ from typing import ClassVar, NamedTuple, Protocol
16
+
17
+ from ..feature_validator.base import (
18
+ FeatureValidator,
19
+ ValidationResult,
20
+ ValidationTier,
21
+ ValidatorRole,
22
+ )
23
+ from ..feature_validator.extension import ExtensionValidator
24
+ from ..feature_validator.magic_bytes import MagicBytesValidator
25
+ from .base import EvaluationReport, Evaluator, MatchConfidence
26
+
27
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# True on Windows; evaluate() lower-cases resolved paths when this is set.
_NORMALIZE_CASE = (sys.platform == "win32")

# Identifier assigned to HeuristicEvaluator.id.
HEURISTIC_EVALUATOR_ID = "heuristic"
32
+
33
+
34
class CacheInfo(NamedTuple):
    """Cache statistics in the same field order as ``lru_cache``'s cache_info()."""

    hits: int  # lookups answered from the cache
    misses: int  # lookups that had to call the wrapped function
    maxsize: int  # configured capacity
    currsize: int  # entries currently stored
41
+
42
+
43
class CachedEvaluateFunction(Protocol):
    """Structural type of ``_evaluate_impl`` once wrapped by ``lru_cache``.

    Describes the call signature plus the two cache-management methods that
    the wrapper adds.
    """

    def __call__(self, path_str: str, max_tier: ValidationTier) -> EvaluationReport:
        ...

    def cache_clear(self) -> None:
        ...

    def cache_info(self) -> CacheInfo:
        ...
49
+
50
+
51
class EvaluatorError(Exception):
    """Root of the evaluator exception hierarchy."""
53
+
54
+
55
class ValidationError(EvaluatorError):
    """Signals that validation failed in an unexpected way.

    Args:
        message: Human-readable description of the failure.
        validator_id: Identifier of the validator that failed, if available.
        tier: Validation tier at which the failure occurred, if available.

    Attributes:
        validator_id: Identifier of the validator that failed, if available.
        tier: Validation tier at which the failure occurred, if available.
    """

    def __init__(
        self,
        message: str,
        *,
        validator_id: str | None = None,
        tier: ValidationTier | None = None,
    ):
        # Only the message goes into Exception.args; the context lives on attributes.
        super().__init__(message)
        self.validator_id, self.tier = validator_id, tier
73
+
74
+
75
class InvalidValidatorError(EvaluatorError):
    """Signals that a supplied validator (or its configuration) is not acceptable."""
77
+
78
+
79
class PathAccessError(EvaluatorError):
    """Signals that a path could not be accessed for validation.

    Args:
        message: Human-readable description of the failure.
        path: The path that could not be accessed, if available.

    Attributes:
        path: The path that could not be accessed, if available.
    """

    def __init__(self, message: str, *, path: str | None = None):
        # Only the message goes into Exception.args; the path is kept as an attribute.
        super().__init__(message)
        self.path = path
89
+
90
+
91
class HeuristicEvaluator(Evaluator):
    """Heuristic-based evaluator with builder pattern support.

    Aggregates validation evidence and determines match confidence
    using heuristic rules.

    Flow Control:
        - Validators above ``max_tier`` are skipped.
        - Remaining validators are sorted by ``(tier.value, role.value)``:
          IDENTIFICATION -> STRUCTURE -> SEMANTIC, and within a tier GATE
          before EVIDENCE (this relies on the enum values being ordered that
          way in ``feature_validator.base`` — verify there).
        - Stops immediately on first GATE validator failure (fail-fast).

    Heuristic Rules:
        - Any failed GATE validator -> CERTAINLY_NOT
        - All validators passed and at least one GATE -> DEFINITE
        - All validators passed but no GATE -> LIKELY
        - Some EVIDENCE validators failed (no GATE failed) -> UNLIKELY
        - No evidence -> CANT_EVALUATE

    Caching:
        When ``cache_size > 0`` reports are memoised per
        ``(resolved path, max_tier)``. The builder methods
        (``check_magic_bytes``, ``check_extension``, ``add_validator``) clear
        the cache, because a report computed with the previous validator set
        is no longer valid. Mutating ``validators`` directly bypasses this;
        call ``clear_cache()`` afterwards in that case.

    Example:
        >>> evaluator = (HeuristicEvaluator()
        ...     .check_magic_bytes(b"\\x50\\x4B\\x03\\x04")
        ...     .check_extension([".zip", ".jar"])
        ... )
    """

    id: ClassVar[str] = HEURISTIC_EVALUATOR_ID
    description: ClassVar[str] = "Heuristic Evaluator"

    def __init__(
        self,
        validators: Sequence[FeatureValidator] | None = None,
        cache_size: int = 128,
    ):
        """Initialize the HeuristicEvaluator.

        Args:
            validators: Optional sequence of feature validators.
            cache_size: Maximum cached results (default: 128, 0 to disable).
        """
        # Copy so later builder calls never mutate the caller's sequence.
        self.validators: list[FeatureValidator] = list(validators) if validators else []
        self._cache_size = cache_size

        # The cache wraps the bound method, so it is private to this instance.
        self._evaluate_cached: CachedEvaluateFunction
        if cache_size > 0:
            self._evaluate_cached = lru_cache(maxsize=cache_size)(self._evaluate_impl)  # type: ignore[assignment]
        else:
            self._evaluate_cached = self._evaluate_impl  # type: ignore[assignment]

    def check_magic_bytes(self, signature: bytes, position: int = 0) -> "HeuristicEvaluator":
        """Add a magic bytes validator to check file signature.

        Args:
            signature: The expected magic byte signature.
            position: Byte position where the signature should be found (default: 0).

        Returns:
            Self for method chaining.

        Raises:
            InvalidValidatorError: If signature is empty, position is negative,
                or the validator cannot be constructed.
        """
        if not signature:
            raise InvalidValidatorError("Magic byte signature cannot be empty")
        if position < 0:
            raise InvalidValidatorError(f"Position must be non-negative, got {position}")

        try:
            validator = MagicBytesValidator(signature=signature, position=position)
        except Exception as e:
            raise InvalidValidatorError(f"Failed to create MagicBytesValidator: {e}") from e

        self.validators.append(validator)
        # Reports cached before this validator existed are now stale.
        self.clear_cache()
        return self

    def check_extension(self, extensions: Sequence[str]) -> "HeuristicEvaluator":
        """Add an extension validator to check file extension.

        Args:
            extensions: List of allowed file extensions (with dot, e.g., [".csv"]).

        Returns:
            Self for method chaining.

        Raises:
            InvalidValidatorError: If extensions list is empty or the validator
                cannot be constructed.
        """
        if not extensions:
            raise InvalidValidatorError("Extensions list cannot be empty")

        try:
            validator = ExtensionValidator(extensions=extensions)
        except Exception as e:
            raise InvalidValidatorError(f"Failed to create ExtensionValidator: {e}") from e

        self.validators.append(validator)
        # Reports cached before this validator existed are now stale.
        self.clear_cache()
        return self

    def add_validator(self, validator: FeatureValidator) -> "HeuristicEvaluator":
        """Add a custom validator to the evaluator.

        Args:
            validator: A custom feature validator instance.

        Returns:
            Self for method chaining.

        Raises:
            InvalidValidatorError: If validator is None or lacks required interface.
        """
        if validator is None:
            raise InvalidValidatorError("Validator cannot be None")

        if not isinstance(validator, FeatureValidator):
            raise InvalidValidatorError(
                f"Validator must satisfy the FeatureValidator protocol, "
                f"got {type(validator).__name__}"
            )

        self.validators.append(validator)
        # Reports cached before this validator existed are now stale.
        self.clear_cache()
        return self

    def evaluate(
        self,
        path: Path,
        max_tier: ValidationTier = ValidationTier.SEMANTIC,
        use_cache: bool = True,
    ) -> EvaluationReport:
        """Evaluate validation results using heuristic rules.

        Args:
            path: Path to validate and evaluate.
            max_tier: Maximum validation tier to execute (inclusive).
            use_cache: If True and caching is enabled, use cached results.

        Returns:
            Evaluation report with confidence and details.

        Raises:
            PathAccessError: If the path does not exist.
            ValidationError: If a validator raises or validators cannot be ordered.
        """
        # Resolve to an absolute string so equivalent spellings share a cache key.
        resolved = str(path.resolve())
        if _NORMALIZE_CASE:
            resolved = resolved.lower()

        if use_cache and self._cache_size > 0:
            return self._evaluate_cached(resolved, max_tier)
        return self._evaluate_impl(resolved, max_tier)

    def _evaluate_impl(
        self,
        path_str: str,
        max_tier: ValidationTier,
    ) -> EvaluationReport:
        """Internal evaluation logic (wrapped by lru_cache if enabled).

        Args:
            path_str: String path for hashability in cache.
            max_tier: Maximum validation tier to execute.

        Returns:
            Evaluation report derived from the heuristic rules.

        Raises:
            PathAccessError: If path cannot be accessed.
            ValidationError: If validation fails unexpectedly.
        """
        try:
            path = Path(path_str)
        except Exception as e:
            raise PathAccessError(f"Invalid path string '{path_str}': {e}", path=path_str) from e

        if not path.exists():
            raise PathAccessError(f"Path does not exist: {path}", path=path_str)

        # Filter validators by max_tier
        filtered_validators = [v for v in self.validators if v.tier.value <= max_tier.value]
        logger.debug(
            "Evaluating %s: %d/%d validators within max_tier=%s",
            path_str,
            len(filtered_validators),
            len(self.validators),
            max_tier.name,
        )

        # Order: first by tier, then by role value
        try:
            ordered_validators = sorted(
                filtered_validators, key=lambda v: (v.tier.value, v.role.value)
            )
        except Exception as e:
            raise ValidationError(f"Failed to order validators: {e}") from e

        results: list[ValidationResult] = []
        has_gate = False
        first_failed_gate: ValidationResult | None = None
        all_passed = True

        for validator in ordered_validators:
            try:
                result = validator.validate(path)
            except Exception as e:
                raise ValidationError(
                    f"Validator '{validator.id}' raised unexpected error: {e}",
                    validator_id=validator.id,
                    tier=validator.tier,
                ) from e

            results.append(result)
            if result.role == ValidatorRole.GATE:
                has_gate = True
                if not result.is_passed:
                    # Fail-fast: a failed gate settles the verdict, skip the rest.
                    first_failed_gate = result
                    all_passed = False
                    logger.debug("Gate '%s' failed — short-circuiting", validator.id)
                    break
            elif not result.is_passed:
                all_passed = False

        if not results:
            return EvaluationReport(
                confidence=MatchConfidence.CANT_EVALUATE,
                details="no validators executed",
            )

        result_tuple = tuple(results)

        # Compare against None explicitly so the verdict never depends on how
        # ValidationResult defines truthiness.
        if first_failed_gate is not None:
            return EvaluationReport(
                confidence=MatchConfidence.CERTAINLY_NOT,
                details=(
                    f"gate '{first_failed_gate.validator_id}' failed: {first_failed_gate.details}"
                ),
                validator_results=result_tuple,
            )

        if all_passed:
            confidence = MatchConfidence.DEFINITE if has_gate else MatchConfidence.LIKELY
            details = (
                "all validators passed (gate present)"
                if has_gate
                else "all validators passed (no gate)"
            )
            return EvaluationReport(
                confidence=confidence, details=details, validator_results=result_tuple
            )

        failed_non_gate_ids = [
            res.validator_id
            for res in results
            if not res.is_passed and res.role != ValidatorRole.GATE
        ]
        failure_summary = (
            f"non-gate validators failed: {', '.join(failed_non_gate_ids)}"
            if failed_non_gate_ids
            else "validation failed"
        )
        return EvaluationReport(
            confidence=MatchConfidence.UNLIKELY,
            details=failure_summary,
            validator_results=result_tuple,
        )

    def clear_cache(self) -> None:
        """Clear the evaluation cache (no-op when caching is disabled)."""
        if self._cache_size > 0 and hasattr(self._evaluate_cached, "cache_clear"):
            self._evaluate_cached.cache_clear()

    def cache_info(self) -> tuple[int, int, int, int] | None:
        """Get cache statistics (hits, misses, maxsize, currsize).

        Returns:
            Tuple of (hits, misses, maxsize, currsize), or None if disabled.
        """
        if self._cache_size > 0 and hasattr(self._evaluate_cached, "cache_info"):
            info = self._evaluate_cached.cache_info()
            return (info.hits, info.misses, info.maxsize, info.currsize)
        return None

    def __repr__(self) -> str:
        cache_status = (
            f"cache_size={self._cache_size}" if self._cache_size > 0 else "cache=disabled"
        )
        return (
            f"<HeuristicEvaluator id={self.id} validators={len(self.validators)} {cache_status}>"
        )
@@ -0,0 +1,23 @@
1
+ # Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
2
+ # See LICENSE for details.
3
+ """Validator package for file format validation.
4
+
5
+ Exposes base types and built-in validators for convenient import.
6
+ """
7
+
8
+ from .base import FeatureValidator, ValidationResult, ValidationTier, ValidatorRole
9
+ from .extension import EXTENSION_VALIDATOR_ID, ExtensionValidator
10
+ from .magic_bytes import MAGIC_BYTES_VALIDATOR_ID, MagicBytesValidator
11
+ from .registry import FeatureValidatorRegistry
12
+
13
# Public API of the feature_validator package: base types, the built-in
# validators with their identifiers, and the validator registry.
__all__ = [
    "EXTENSION_VALIDATOR_ID",
    "ExtensionValidator",
    "FeatureValidator",
    "FeatureValidatorRegistry",
    "MAGIC_BYTES_VALIDATOR_ID",
    "MagicBytesValidator",
    "ValidationResult",
    "ValidationTier",
    "ValidatorRole",
]