PyPI - vcti-path-format - Versions diffs - 1.2.0__py3-none-any.whl - Mend

vcti-path-format 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

vcti/pathformat/__init__.py +29 -0
vcti/pathformat/descriptor.py +94 -0
vcti/pathformat/evaluator/__init__.py +29 -0
vcti/pathformat/evaluator/base.py +87 -0
vcti/pathformat/evaluator/heuristic.py +367 -0
vcti/pathformat/feature_validator/__init__.py +23 -0
vcti/pathformat/feature_validator/base.py +88 -0
vcti/pathformat/feature_validator/extension.py +55 -0
vcti/pathformat/feature_validator/magic_bytes.py +62 -0
vcti/pathformat/feature_validator/registry.py +53 -0
vcti/pathformat/identifier.py +178 -0
vcti/pathformat/py.typed +0 -0
vcti/pathformat/registry.py +30 -0
vcti_path_format-1.2.0.dist-info/METADATA +274 -0
vcti_path_format-1.2.0.dist-info/RECORD +19 -0
vcti_path_format-1.2.0.dist-info/WHEEL +5 -0
vcti_path_format-1.2.0.dist-info/licenses/LICENSE +8 -0
vcti_path_format-1.2.0.dist-info/top_level.txt +1 -0
vcti_path_format-1.2.0.dist-info/zip-safe +1 -0

vcti/pathformat/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
+# See LICENSE for details.
+"""vcti.pathformat — File format identification framework with heuristic evaluators."""
+from importlib.metadata import version
+from .descriptor import FormatDescriptor
+from .evaluator.base import EvaluationReport, Evaluator, MatchConfidence
+from .evaluator.heuristic import PathAccessError
+from .feature_validator.base import ValidationResult, ValidationTier
+from .identifier import FormatIdentifier, IdentificationResult, identify_file_format
+from .registry import FormatRegistry
+# Version string looked up from the installed "vcti-path-format" distribution metadata.
+__version__ = version("vcti-path-format")
+# Public API of the package: every name listed here is imported above
+# (or defined in this module) and is what a star-import exposes.
+__all__ = [
+    "__version__",
+    "EvaluationReport",
+    "Evaluator",
+    "FormatDescriptor",
+    "FormatIdentifier",
+    "FormatRegistry",
+    "IdentificationResult",
+    "MatchConfidence",
+    "PathAccessError",
+    "ValidationResult",
+    "ValidationTier",
+    "identify_file_format",
+]

vcti/pathformat/descriptor.py ADDED Viewed

@@ -0,0 +1,94 @@
+# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
+# See LICENSE for details.
+"""Format descriptor for supported file or folder formats.
+Defines the FormatDescriptor class, which encapsulates metadata, validation logic,
+and attributes for a specific data format.
+"""
+from pathlib import Path
+from typing import Any
+from vcti.plugincatalog import Descriptor
+from .evaluator.base import EvaluationReport, Evaluator, MatchConfidence
+from .feature_validator.base import ValidationTier
+class FormatDescriptor(Descriptor[Evaluator]):
+    """Describes a supported data format (file or folder).
+    Each FormatDescriptor instance defines the metadata and validation logic
+    for a specific data format. Extends the generic Descriptor with an Evaluator instance.
+    Args:
+        id: Unique identifier for the format (e.g., 'csv', 'hdf5-file').
+        name: Human-readable name for the format.
+        evaluator: Evaluator instance for determining match confidence.
+        description: Optional description of the format.
+        attributes: Optional format-specific attributes as key-value pairs.
+    """
+    def __init__(
+        self,
+        id: str,
+        name: str,
+        evaluator: Evaluator,
+        description: str | None = None,
+        attributes: dict[str, Any] | None = None,
+    ):
+        if not isinstance(evaluator, Evaluator):
+            raise TypeError(
+                f"evaluator must be an Evaluator instance, got {type(evaluator).__name__}"
+            )
+        super().__init__(
+            id=id,
+            name=name,
+            instance=evaluator,
+            description=description,
+            attributes=attributes,
+        )
+    @property
+    def evaluator(self) -> Evaluator:
+        """Get the evaluator instance."""
+        return self.instance
+    def evaluate(
+        self,
+        path: Path,
+        max_tier: ValidationTier = ValidationTier.SEMANTIC,
+        use_cache: bool = True,
+    ) -> EvaluationReport:
+        """Validate file features and evaluate match confidence.
+        Args:
+            path: Path to the file or folder to evaluate.
+            max_tier: Maximum validation tier to execute (inclusive).
+            use_cache: If True and evaluator supports caching, use cached results.
+        Returns:
+            The evaluation report for format matching.
+        """
+        return self.evaluator.evaluate(path, max_tier, use_cache=use_cache)
+    def evaluate_confidence(
+        self,
+        path: Path,
+        max_tier: ValidationTier = ValidationTier.SEMANTIC,
+        use_cache: bool = True,
+    ) -> MatchConfidence:
+        """Convenience wrapper returning only the confidence value.
+        Args:
+            path: Path to the file or folder to evaluate.
+            max_tier: Maximum validation tier to execute (inclusive).
+            use_cache: If True and evaluator supports caching, use cached results.
+        Returns:
+            The confidence level of the match.
+        """
+        return self.evaluate(path, max_tier, use_cache).confidence
+    def __repr__(self) -> str:
+        return f"FormatDescriptor(id={self.id!r}, name={self.name!r})"

vcti/pathformat/evaluator/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
+# See LICENSE for details.
+"""Evaluator package for file format matching.
+Provides base evaluator interface and the heuristic evaluator implementation
+for determining format match confidence from validation evidence.
+"""
+from .base import EvaluationReport, Evaluator, MatchConfidence
+from .heuristic import (
+    HEURISTIC_EVALUATOR_ID,
+    EvaluatorError,
+    HeuristicEvaluator,
+    InvalidValidatorError,
+    PathAccessError,
+    ValidationError,
+)
+# Public names re-exported by the evaluator package; all are imported above
+# from the .base and .heuristic submodules.
+__all__ = [
+    "EvaluationReport",
+    "Evaluator",
+    "EvaluatorError",
+    "HEURISTIC_EVALUATOR_ID",
+    "HeuristicEvaluator",
+    "InvalidValidatorError",
+    "MatchConfidence",
+    "PathAccessError",
+    "ValidationError",
+]

vcti/pathformat/evaluator/base.py ADDED Viewed

@@ -0,0 +1,87 @@
+# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
+# See LICENSE for details.
+"""Base classes for file format evaluators.
+Defines the MatchConfidence enum, EvaluationReport dataclass, and Evaluator
+abstract base class for assessing format match confidence.
+"""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import ClassVar
+from ..feature_validator.base import ValidationResult, ValidationTier
+class MatchConfidence(Enum):
+    """Confidence levels in format matching.
+    Members use explicit integer values to guarantee ordering stability.
+    Comparison by ``.value`` is used for sorting and threshold checks.
+    This is a plain ``Enum`` (not ``IntEnum``), so the members themselves do
+    not support ``<`` / ``>``; compare ``member.value`` instead.
+    Attributes:
+        CERTAINLY_NOT: Format is definitely not a match.
+        UNLIKELY: Format is unlikely to be a match.
+        CANT_EVALUATE: Cannot evaluate with available evidence.
+        LIKELY: Format is likely a match.
+        DEFINITE: Format is definitely a match.
+    """
+    # Values ascend from "definitely not a match" to "definitely a match";
+    # CANT_EVALUATE sits in the middle, between UNLIKELY and LIKELY.
+    CERTAINLY_NOT = 1
+    UNLIKELY = 2
+    CANT_EVALUATE = 3
+    LIKELY = 4
+    DEFINITE = 5
+@dataclass(frozen=True, slots=True)
+class EvaluationReport:
+    """Result of an evaluation.
+    Attributes:
+        confidence: The confidence level of the format match.
+        details: Additional details about the evaluation.
+        validator_results: Individual results from each validator that executed.
+            Empty tuple when no validators ran or when produced by a non-heuristic
+            evaluator.
+    """
+    confidence: MatchConfidence
+    details: str
+    validator_results: tuple[ValidationResult, ...] = field(default_factory=tuple)
+class Evaluator(ABC):
+    """Abstract base class for evaluator implementations.
+    Evaluators assess validation evidence and determine overall match
+    confidence for a format. Subclasses must implement ``evaluate``; the
+    class cannot be instantiated until they do.
+    Attributes:
+        id: Unique identifier of the evaluator.
+        description: Description of the evaluator.
+    """
+    # Annotation-only class attributes: no value is assigned on this base class.
+    id: ClassVar[str]
+    description: ClassVar[str]
+    @abstractmethod
+    def evaluate(
+        self,
+        path: Path,
+        max_tier: ValidationTier = ValidationTier.SEMANTIC,
+        use_cache: bool = True,
+    ) -> EvaluationReport:
+        """Evaluate validation evidence and determine match confidence.
+        Args:
+            path: Path to the file or folder to evaluate.
+            max_tier: Maximum validation tier to execute (inclusive).
+                Defaults to ValidationTier.SEMANTIC.
+            use_cache: If True and caching is supported, use cached results.
+        Returns:
+            Evaluation report with confidence and details.
+        """
+        ...

vcti/pathformat/evaluator/heuristic.py ADDED Viewed

@@ -0,0 +1,367 @@
+# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
+# See LICENSE for details.
+"""Heuristic evidence-based evaluator implementation.
+Defines HeuristicEvaluator with rules for determining match confidence
+based on validation evidence. Provides a builder pattern for constructing
+evaluators with common validators.
+"""
+import logging
+import sys
+from collections.abc import Sequence
+from functools import lru_cache
+from pathlib import Path
+from typing import ClassVar, NamedTuple, Protocol
+from ..feature_validator.base import (
+    FeatureValidator,
+    ValidationResult,
+    ValidationTier,
+    ValidatorRole,
+)
+from ..feature_validator.extension import ExtensionValidator
+from ..feature_validator.magic_bytes import MagicBytesValidator
+from .base import EvaluationReport, Evaluator, MatchConfidence
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# True only on Windows ("win32"). When set, HeuristicEvaluator.evaluate()
+# lowercases the resolved path string before using it for cache lookup and validation.
+_NORMALIZE_CASE = sys.platform == "win32"
+# Identifier assigned to HeuristicEvaluator.id.
+HEURISTIC_EVALUATOR_ID = "heuristic"
+class CacheInfo(NamedTuple):
+    """Cache statistics from lru_cache.
+    Mirrors the four fields that HeuristicEvaluator.cache_info() reads from
+    the wrapped function's ``cache_info()`` result.
+    """
+    # Number of calls answered from the cache.
+    hits: int
+    # Number of calls that had to be computed.
+    misses: int
+    # Configured maximum number of cached entries.
+    maxsize: int
+    # Number of entries currently held in the cache.
+    currsize: int
+class CachedEvaluateFunction(Protocol):
+    """Protocol for the cached _evaluate_impl function with lru_cache methods.
+    Used purely as a static type for HeuristicEvaluator._evaluate_cached, so
+    that calls to ``cache_clear`` and ``cache_info`` type-check.
+    """
+    # Same call shape as HeuristicEvaluator._evaluate_impl (minus ``self``).
+    def __call__(self, path_str: str, max_tier: ValidationTier) -> EvaluationReport: ...
+    # Drop every cached entry.
+    def cache_clear(self) -> None: ...
+    # Report hit/miss/size statistics.
+    def cache_info(self) -> CacheInfo: ...
+class EvaluatorError(Exception):
+    """Base exception for evaluator errors.
+    ValidationError, InvalidValidatorError and PathAccessError in this module
+    all derive from it, so catching EvaluatorError handles every one of them.
+    """
+class ValidationError(EvaluatorError):
+    """Raised when validation fails unexpectedly.
+    Attributes:
+        validator_id: Identifier of the validator that failed, if available.
+        tier: Validation tier at which the failure occurred, if available.
+    """
+    def __init__(
+        self,
+        message: str,
+        *,
+        validator_id: str | None = None,
+        tier: ValidationTier | None = None,
+    ):
+        super().__init__(message)
+        self.validator_id = validator_id
+        self.tier = tier
+class InvalidValidatorError(EvaluatorError):
+    """Raised when an invalid validator is provided.
+    Raised by HeuristicEvaluator.check_magic_bytes, check_extension and
+    add_validator when their arguments are rejected or a validator cannot
+    be constructed.
+    """
+class PathAccessError(EvaluatorError):
+    """Raised when path cannot be accessed for validation.
+    Attributes:
+        path: The path that could not be accessed, if available.
+    """
+    def __init__(self, message: str, *, path: str | None = None):
+        super().__init__(message)
+        self.path = path
+class HeuristicEvaluator(Evaluator):
+    """Heuristic-based evaluator with builder pattern support.
+    Aggregates validation evidence and determines match confidence
+    using heuristic rules.
+    Flow Control:
+        - Automatically orders validators by tier (IDENTIFICATION -> STRUCTURE -> SEMANTIC)
+        - Within each tier, GATE validators run before EVIDENCE validators
+        - Stops immediately on first GATE validator failure (fail-fast)
+    Heuristic Rules:
+        - Any failed GATE validator -> CERTAINLY_NOT
+        - All validators passed and at least one GATE -> DEFINITE
+        - All validators passed but no GATE -> LIKELY
+        - Some EVIDENCE validators failed (no GATE failed) -> UNLIKELY
+        - No evidence -> CANT_EVALUATE
+    Example:
+        >>> evaluator = (HeuristicEvaluator()
+        ...     .check_magic_bytes(b"\\x50\\x4B\\x03\\x04")
+        ...     .check_extension([".zip", ".jar"])
+        ... )
+    """
+    id: ClassVar[str] = HEURISTIC_EVALUATOR_ID
+    description: ClassVar[str] = "Heuristic Evaluator"
+    def __init__(
+        self,
+        validators: Sequence[FeatureValidator] | None = None,
+        cache_size: int = 128,
+    ):
+        """Initialize the HeuristicEvaluator.
+        Args:
+            validators: Optional sequence of feature validators.
+            cache_size: Maximum cached results (default: 128, 0 to disable).
+        """
+        self.validators: list[FeatureValidator] = list(validators) if validators else []
+        self._cache_size = cache_size
+        self._evaluate_cached: CachedEvaluateFunction
+        if cache_size > 0:
+            self._evaluate_cached = lru_cache(maxsize=cache_size)(self._evaluate_impl)  # type: ignore[assignment]
+        else:
+            self._evaluate_cached = self._evaluate_impl  # type: ignore[assignment]
+    def check_magic_bytes(self, signature: bytes, position: int = 0) -> "HeuristicEvaluator":
+        """Add a magic bytes validator to check file signature.
+        Args:
+            signature: The expected magic byte signature.
+            position: Byte position where the signature should be found (default: 0).
+        Returns:
+            Self for method chaining.
+        Raises:
+            InvalidValidatorError: If signature is empty or position is negative.
+        """
+        if not signature:
+            raise InvalidValidatorError("Magic byte signature cannot be empty")
+        if position < 0:
+            raise InvalidValidatorError(f"Position must be non-negative, got {position}")
+        try:
+            self.validators.append(MagicBytesValidator(signature=signature, position=position))
+        except Exception as e:
+            raise InvalidValidatorError(f"Failed to create MagicBytesValidator: {e}") from e
+        return self
+    def check_extension(self, extensions: Sequence[str]) -> "HeuristicEvaluator":
+        """Add an extension validator to check file extension.
+        Args:
+            extensions: List of allowed file extensions (with dot, e.g., [".csv"]).
+        Returns:
+            Self for method chaining.
+        Raises:
+            InvalidValidatorError: If extensions list is empty.
+        """
+        if not extensions:
+            raise InvalidValidatorError("Extensions list cannot be empty")
+        try:
+            self.validators.append(ExtensionValidator(extensions=extensions))
+        except Exception as e:
+            raise InvalidValidatorError(f"Failed to create ExtensionValidator: {e}") from e
+        return self
+    def add_validator(self, validator: FeatureValidator) -> "HeuristicEvaluator":
+        """Add a custom validator to the evaluator.
+        Args:
+            validator: A custom feature validator instance.
+        Returns:
+            Self for method chaining.
+        Raises:
+            InvalidValidatorError: If validator is None or lacks required interface.
+        """
+        if validator is None:
+            raise InvalidValidatorError("Validator cannot be None")
+        if not isinstance(validator, FeatureValidator):
+            raise InvalidValidatorError(
+                f"Validator must satisfy the FeatureValidator protocol, "
+                f"got {type(validator).__name__}"
+            )
+        self.validators.append(validator)
+        return self
+    def evaluate(
+        self,
+        path: Path,
+        max_tier: ValidationTier = ValidationTier.SEMANTIC,
+        use_cache: bool = True,
+    ) -> EvaluationReport:
+        """Evaluate validation results using heuristic rules.
+        Args:
+            path: Path to validate and evaluate.
+            max_tier: Maximum validation tier to execute (inclusive).
+            use_cache: If True and caching is enabled, use cached results.
+        Returns:
+            Evaluation report with confidence and details.
+        """
+        resolved = str(path.resolve())
+        if _NORMALIZE_CASE:
+            resolved = resolved.lower()
+        if use_cache and self._cache_size > 0:
+            return self._evaluate_cached(resolved, max_tier)
+        else:
+            return self._evaluate_impl(resolved, max_tier)
+    def _evaluate_impl(
+        self,
+        path_str: str,
+        max_tier: ValidationTier,
+    ) -> EvaluationReport:
+        """Internal evaluation logic (wrapped by lru_cache if enabled).
+        Args:
+            path_str: String path for hashability in cache.
+            max_tier: Maximum validation tier to execute.
+        Raises:
+            PathAccessError: If path cannot be accessed.
+            ValidationError: If validation fails unexpectedly.
+        """
+        try:
+            path = Path(path_str)
+        except Exception as e:
+            raise PathAccessError(f"Invalid path string '{path_str}': {e}", path=path_str) from e
+        if not path.exists():
+            raise PathAccessError(f"Path does not exist: {path}", path=path_str)
+        # Filter validators by max_tier
+        filtered_validators = [v for v in self.validators if v.tier.value <= max_tier.value]
+        logger.debug(
+            "Evaluating %s: %d/%d validators within max_tier=%s",
+            path_str,
+            len(filtered_validators),
+            len(self.validators),
+            max_tier.name,
+        )
+        # Order: first by tier, then by role (GATE before EVIDENCE)
+        try:
+            ordered_validators = sorted(
+                filtered_validators, key=lambda v: (v.tier.value, v.role.value)
+            )
+        except Exception as e:
+            raise ValidationError(f"Failed to order validators: {e}") from e
+        results: list[ValidationResult] = []
+        has_gate = False
+        first_failed_gate: ValidationResult | None = None
+        all_passed = True
+        for validator in ordered_validators:
+            try:
+                result = validator.validate(path)
+            except Exception as e:
+                raise ValidationError(
+                    f"Validator '{validator.id}' raised unexpected error: {e}",
+                    validator_id=validator.id,
+                    tier=validator.tier,
+                ) from e
+            results.append(result)
+            if result.role == ValidatorRole.GATE:
+                has_gate = True
+                if not result.is_passed and first_failed_gate is None:
+                    first_failed_gate = result
+                    all_passed = False
+                    logger.debug("Gate '%s' failed — short-circuiting", validator.id)
+                    break
+            elif not result.is_passed:
+                all_passed = False
+        result_tuple = tuple(results)
+        if not results:
+            return EvaluationReport(
+                confidence=MatchConfidence.CANT_EVALUATE,
+                details="no validators executed",
+            )
+        if first_failed_gate:
+            return EvaluationReport(
+                confidence=MatchConfidence.CERTAINLY_NOT,
+                details=(
+                    f"gate '{first_failed_gate.validator_id}' failed: {first_failed_gate.details}"
+                ),
+                validator_results=result_tuple,
+            )
+        if all_passed:
+            confidence = MatchConfidence.DEFINITE if has_gate else MatchConfidence.LIKELY
+            details = (
+                "all validators passed (gate present)"
+                if has_gate
+                else "all validators passed (no gate)"
+            )
+            return EvaluationReport(
+                confidence=confidence, details=details, validator_results=result_tuple
+            )
+        failed_non_gate_ids = [
+            res.validator_id
+            for res in results
+            if not res.is_passed and res.role != ValidatorRole.GATE
+        ]
+        failure_summary = (
+            f"non-gate validators failed: {', '.join(failed_non_gate_ids)}"
+            if failed_non_gate_ids
+            else "validation failed"
+        )
+        return EvaluationReport(
+            confidence=MatchConfidence.UNLIKELY,
+            details=failure_summary,
+            validator_results=result_tuple,
+        )
+    def clear_cache(self) -> None:
+        """Clear the evaluation cache."""
+        if self._cache_size > 0 and hasattr(self._evaluate_cached, "cache_clear"):
+            self._evaluate_cached.cache_clear()
+    def cache_info(self) -> tuple[int, int, int, int] | None:
+        """Get cache statistics (hits, misses, maxsize, currsize).
+        Returns:
+            Tuple of (hits, misses, maxsize, currsize), or None if disabled.
+        """
+        if self._cache_size > 0 and hasattr(self._evaluate_cached, "cache_info"):
+            info = self._evaluate_cached.cache_info()
+            return (info.hits, info.misses, info.maxsize, info.currsize)
+        return None
+    def __repr__(self) -> str:
+        cache_status = (
+            f"cache_size={self._cache_size}" if self._cache_size > 0 else "cache=disabled"
+        )
+        return (
+            f"<HeuristicEvaluator id={self.id} validators={len(self.validators)} {cache_status}>"
+        )

vcti/pathformat/feature_validator/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+# Copyright Visual Collaboration Technologies Inc. All Rights Reserved.
+# See LICENSE for details.
+"""Validator package for file format validation.
+Exposes base types and built-in validators for convenient import.
+"""
+from .base import FeatureValidator, ValidationResult, ValidationTier, ValidatorRole
+from .extension import EXTENSION_VALIDATOR_ID, ExtensionValidator
+from .magic_bytes import MAGIC_BYTES_VALIDATOR_ID, MagicBytesValidator
+from .registry import FeatureValidatorRegistry
+# Public names re-exported by the feature_validator package; all are imported
+# above from the .base, .extension, .magic_bytes and .registry submodules.
+__all__ = [
+    "EXTENSION_VALIDATOR_ID",
+    "ExtensionValidator",
+    "FeatureValidator",
+    "FeatureValidatorRegistry",
+    "MAGIC_BYTES_VALIDATOR_ID",
+    "MagicBytesValidator",
+    "ValidationResult",
+    "ValidationTier",
+    "ValidatorRole",
+]