PyPI - classifyre-cli - Versions diffs - 0.4.12__tar.gz → 0.4.13__tar.gz - Mend

classifyre-cli 0.4.12__tar.gz → 0.4.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (191) hide show

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/.turbo/turbo-build.log RENAMED Viewed

@@ -1,3 +1,3 @@
 $ uv sync
-Resolved 267 packages in 181ms
+Resolved 268 packages in 156ms
 Checked 50 packages in 1ms

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: classifyre-cli
-Version: 0.4.12
+Version: 0.4.13
 Summary: Classifyre CLI — scan and classify unstructured data sources
 License: MIT
 Keywords: data,ingestion,metadata,pii,secrets,unstructured

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/package.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@classifyre/cli",
-  "version": "0.4.12",
+  "version": "0.4.13",
   "private": true,
   "scripts": {
     "build": "uv sync",

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "classifyre-cli"
-version = "0.4.12"
+version = "0.4.13"
 description = "Classifyre CLI — scan and classify unstructured data sources"
 readme = "README.md"
 requires-python = ">=3.12"

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/scripts/generate_models.py RENAMED Viewed

@@ -16,6 +16,14 @@ _PIPELINE_TYPE_DEFAULTS: dict[str, str] = {
     "LLMPipelineSchema": "LLM",
 }
+# Pipeline schema classes whose `severity` field has a string default from JSON schema
+# ('info') but must be an enum instance to avoid Pydantic serialization warnings
+# ("Expected `enum` - serialized value may not be as expected").
+_SEVERITY_ENUM_DEFAULT_CLASSES = {
+    "LLMPipelineSchema",
+    "TextClassificationPipelineSchema",
+}
 def _patch_pipeline_type_defaults(source: str) -> str:
     """Add `= 'X'` default to discriminator `type` fields on pipeline schemas."""
@@ -28,6 +36,22 @@ def _patch_pipeline_type_defaults(source: str) -> str:
     return source
+def _patch_severity_enum_defaults(source: str) -> str:
+    """Replace string 'info' severity Field defaults with Severity.info enum instances.
+    datamodel-codegen emits Field('info', ...) from the JSON schema default, but
+    Pydantic v2 warns at serialization time when the stored value is a plain string
+    rather than a Severity enum member.  This patch rewrites only the severity field
+    inside each affected class so the fix survives future codegen runs.
+    """
+    for cls_name in _SEVERITY_ENUM_DEFAULT_CLASSES:
+        # Match from the class definition up through the severity Field default string.
+        pattern = rf"(class {re.escape(cls_name)}\(.*?severity: Severity \| None = Field\(\n\s+)'info'(\s*,)"
+        replacement = rf"\1Severity.info\2"
+        source = re.sub(pattern, replacement, source, flags=re.DOTALL)
+    return source
 def run_codegen(input_file):
     """Generate Pydantic models from a single JSON schema file."""
     cmd = [
@@ -74,6 +98,7 @@ def main():
     detector_schema = SCHEMA_DIR / "all_detectors.json"
     content = run_codegen(detector_schema)
     content = _patch_pipeline_type_defaults(content)
+    content = _patch_severity_enum_defaults(content)
     (MODEL_DIR / "generated_detectors.py").write_text(content)
     print("Wrote src/models/generated_detectors.py")

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/broken_links/detector.py RENAMED Viewed

@@ -156,6 +156,10 @@ class BrokenLinksDetector(BaseDetector):
             if status_code in {405, 501}:
                 return self._scan_with_get(url, line, start, end, "head_not_supported")
+            # Some servers block HEAD (403) but serve content via GET.
+            if status_code == 403:
+                return self._scan_with_get(url, line, start, end, "head_forbidden")
             if status_code >= 400:
                 return LinkScanResult(
                     url=url,
@@ -169,17 +173,10 @@ class BrokenLinksDetector(BaseDetector):
             content_length = self._parse_content_length(head_response.headers)
             if content_length == 0:
-                return LinkScanResult(
-                    url=url,
-                    line=line,
-                    start=start,
-                    end=end,
-                    finding_type="empty_content",
-                    confidence=0.9,
-                    metadata={"status_code": status_code, "reason": "empty_head_content_length"},
-                )
+                # Content-Length: 0 on HEAD can be misleading (e.g., YouTube, TikTok).
+                # Fall back to GET to verify the page truly has no content.
+                return self._scan_with_get(url, line, start, end, "empty_head_content_length")
-            # Some servers omit Content-Length, so perform a lightweight GET check.
             if content_length is None:
                 return self._scan_with_get(url, line, start, end, "missing_content_length")

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_llm.py RENAMED Viewed

@@ -241,7 +241,7 @@ class LLMRunner(BaseRunner):
     ) -> list[DetectionResult]:
         schema = self._schema
         threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
-        default_severity = schema.severity or Severity.info
+        default_severity = schema.severity if schema.severity is not None else Severity.info
         extracted = self._coerce_fields(payload.get("fields"))
         raw_labels = payload.get("labels")

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_text_classification.py RENAMED Viewed

@@ -66,7 +66,7 @@ class TextClassificationRunner(BaseRunner):
         chunk_overlap: int = getattr(schema.chunk_overlap, "root", schema.chunk_overlap) or 0
         max_length: int | None = getattr(schema.max_length, "root", schema.max_length)
         threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.7
-        default_severity = schema.severity or Severity.info
+        default_severity = schema.severity if schema.severity is not None else Severity.info
         best_scores: dict[str, float] = {}
         try:

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_detectors.py RENAMED Viewed

@@ -1080,7 +1080,7 @@ class LLMPipelineSchema(BaseModel):
         False, description='Allow more than one label per asset.'
     )
     severity: Severity | None = Field(
-        'info',
+        Severity.info,
         description='Default severity when no severity_map rule matches a predicted label.',
     )
     severity_map: list[PipelineSeverityRule] | None = Field(
@@ -1177,7 +1177,7 @@ class TextClassificationPipelineSchema(BaseModel):
         le=1.0,
     )
     severity: Severity | None = Field(
-        'info', description='Default severity when no severity_map rule matches.'
+        Severity.info, description='Default severity when no severity_map rule matches.'
     )
     severity_map: list[PipelineSeverityRule] | None = Field(
         None,

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_input.py RENAMED Viewed

@@ -42,6 +42,7 @@ class AssetType(StrEnum):
     JIRA = 'JIRA'
     SERVICEDESK = 'SERVICEDESK'
     SQLITE = 'SQLITE'
+    NOTION = 'NOTION'
 class SourceCategory(StrEnum):
@@ -1870,6 +1871,7 @@ class Type(StrEnum):
     JIRA = 'JIRA'
     SERVICEDESK = 'SERVICEDESK'
     SQLITE = 'SQLITE'
+    NOTION = 'NOTION'
 class SlackInput(CoreInput):
@@ -2657,6 +2659,7 @@ class Type17(StrEnum):
     JIRA = 'JIRA'
     SERVICEDESK = 'SERVICEDESK'
     SQLITE = 'SQLITE'
+    NOTION = 'NOTION'
 class ConfluenceInput(CoreInput):
@@ -2765,6 +2768,135 @@ class SQLiteInput(CoreInput):
     resources: ResourceOverrides | None = None
+class NotionRequired(BaseModel):
+    """
+    Notion has no required connection fields; the integration token lives in the masked section.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+class NotionMasked(BaseModel):
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    notion_token: str = Field(
+        ...,
+        description='Notion API token used as a Bearer credential. Accepts an internal integration secret (ntn_...) or an OAuth public-integration access token.',
+    )
+class NotionOptionalConnection(BaseModel):
+    """
+    HTTP, version, and retry settings for Notion API calls.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    notion_version: str | None = Field(
+        '2025-09-03',
+        description='Notion-Version header sent with every request. Defaults to the data-sources API version.',
+    )
+    request_timeout_seconds: float | None = Field(
+        30, description='HTTP request timeout for Notion API calls', ge=1.0
+    )
+    rate_limit_delay_seconds: float | None = Field(
+        0,
+        description='Additional delay between API requests to reduce rate-limit pressure',
+        ge=0.0,
+    )
+    max_retries: int | None = Field(
+        3,
+        description='Maximum retry attempts for transient API failures and rate limits',
+        ge=0,
+        le=10,
+    )
+class NotionOptionalScope(BaseModel):
+    """
+    Optional Notion scope filters. When omitted, all content shared with the integration is eligible for sampling.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    page_ids: list[str] | None = Field(
+        None,
+        description='Restrict extraction to specific page IDs (up to 250)',
+        max_length=250,
+    )
+    data_source_ids: list[str] | None = Field(
+        None,
+        description='Restrict extraction to specific data source IDs (up to 250)',
+        max_length=250,
+    )
+    search_query: str | None = Field(
+        None,
+        description='Optional full-text query passed to the Notion search endpoint to narrow discovery',
+        min_length=1,
+    )
+class NotionOptionalContent(BaseModel):
+    """
+    Notion content extraction controls.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    include_comments: bool | None = Field(
+        True,
+        description='Include page and block comments and aggregate them into a per-page comments asset',
+    )
+    include_files: bool | None = Field(
+        True,
+        description='Materialize files from file/image/pdf/video blocks, file properties, and page icon/cover as related assets',
+    )
+    include_linked_pages: bool | None = Field(
+        True,
+        description='Wire parent, relation, and mention references between pages into the asset links graph',
+    )
+    include_data_sources: bool | None = Field(
+        True,
+        description='Emit Notion data sources (databases) as assets with their schema and link their row pages',
+    )
+    file_max_bytes: int | None = Field(
+        5242880,
+        description='Maximum bytes downloaded per file for MIME inference and text extraction',
+        ge=1024,
+    )
+class NotionOptional(BaseModel):
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    connection: NotionOptionalConnection | None = None
+    scope: NotionOptionalScope | None = None
+    content: NotionOptionalContent | None = None
+class NotionInput(CoreInput):
+    type: Literal['NOTION'] = Field('NOTION', description='Type of the asset or source')
+    required: NotionRequired
+    masked: NotionMasked
+    optional: NotionOptional | None = None
+    detectors: list[Detector] | None = Field(
+        None, description='Detectors to run on ingested content'
+    )
+    custom_detectors: list[CustomDetectorSelection] | None = Field(
+        None,
+        description='Reusable custom detector IDs selected from the custom detector catalog.',
+    )
+    sampling: SamplingConfig
+    resources: ResourceOverrides | None = None
 class SourceInput(
     RootModel[
         SlackInput
@@ -2787,6 +2919,7 @@ class SourceInput(
         | JiraInput
         | ServiceDeskInput
         | SQLiteInput
+        | NotionInput
     ]
 ):
     root: (
@@ -2810,6 +2943,7 @@ class SourceInput(
         | JiraInput
         | ServiceDeskInput
         | SQLiteInput
+        | NotionInput
     ) = Field(
         ...,
         description='Merged configuration schema with all source types and common definitions',

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_single_asset_scan_results.py RENAMED Viewed

@@ -210,7 +210,14 @@ class SingleAssetScanResults(BaseModel):
         ..., description='Linked asset hashes referenced by this asset', title='Links'
     )
     asset_type: AssetType = Field(
-        ..., description='Canonical asset content type', title='Asset Type'
+        ...,
+        description='Canonical asset content type (used for detector routing)',
+        title='Asset Type',
+    )
+    asset_kind: str | None = Field(
+        None,
+        description='Catalog asset kind discriminator (file, image, page, comment, table, ...). Persisted as the asset type for display/filtering.',
+        title='Asset Kind',
     )
     source_id: str | None = Field(
         None,
@@ -238,3 +245,8 @@ class SingleAssetScanResults(BaseModel):
         description='Statistics about the detector scan for this asset',
         title='Scan Stats',
     )
+    metadata: dict[str, Any] | None = Field(
+        None,
+        description='Source-specific asset metadata using normalized keys (size_bytes, row_count, etc.) where applicable',
+        title='Metadata',
+    )

classifyre_cli-0.4.13/src/sources/asset_metadata.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""Asset-metadata contract: the single source of truth for what each source
+extracts is the ``x-asset-metadata`` catalog embedded in
+``packages/schemas/src/schemas/all_input_sources.json``.
+This module loads/resolves that catalog and validates metadata dicts against it.
+Validation is strict (raises) under pytest or when ``CLASSIFYRE_STRICT_METADATA``
+is set, and otherwise logs a warning during real ingestion — so drift between a
+source's emitted keys and the declared catalog is caught either in CI or at runtime.
+"""
+from __future__ import annotations
+import logging
+import os
+from functools import cache
+from typing import Any
+from ..utils.validation import _load_schema
+logger = logging.getLogger(__name__)
+_CATALOG_KEY = "x-asset-metadata"
+ResolvedField = dict[str, Any]  # {name, type, description, required}
+class AssetMetadataContractError(AssertionError):
+    """Raised (in strict mode) when emitted metadata violates the catalog."""
+def _strict_mode() -> bool:
+    return bool(
+        os.environ.get("PYTEST_CURRENT_TEST") or os.environ.get("CLASSIFYRE_STRICT_METADATA")
+    )
+@cache
+def load_catalog() -> dict[str, Any]:
+    """Load and cache the ``x-asset-metadata`` catalog from the merged schema."""
+    schema = _load_schema("all_input_sources.json")
+    catalog = schema.get(_CATALOG_KEY)
+    if not isinstance(catalog, dict):
+        raise AssetMetadataContractError(
+            f"Missing '{_CATALOG_KEY}' catalog in all_input_sources.json"
+        )
+    return catalog
+def _source_key(source_type: str) -> str:
+    # Catalog keys mirror the AssetType enum (uppercased source_type).
+    return source_type.upper()
+def describe_type(prop_schema: dict[str, Any]) -> str:
+    """Render a JSON-Schema property type as a short display string."""
+    json_type = prop_schema.get("type")
+    if json_type == "array":
+        items = prop_schema.get("items", {})
+        item_type = items.get("type", "string") if isinstance(items, dict) else "string"
+        return f"{item_type}[]"
+    return str(json_type) if json_type else "string"
+def resolve_fields(source_type: str, asset_kind: str) -> list[ResolvedField]:
+    """Resolve the declared fields for a (source, asset kind).
+    The asset entry composes one or more reusable ``contentTypes`` via ``use``
+    plus its own ``properties``; ``required`` is the union of each used content
+    type's required list and the entry's own. Raises if the entry is absent.
+    """
+    catalog = load_catalog()
+    sources = catalog.get("sources", {})
+    source_entry = sources.get(_source_key(source_type))
+    if not isinstance(source_entry, dict) or asset_kind not in source_entry:
+        raise AssetMetadataContractError(
+            f"No catalog entry for source '{source_type}' asset kind '{asset_kind}'"
+        )
+    entry = source_entry[asset_kind]
+    content_types = catalog.get("contentTypes", {})
+    properties: dict[str, dict[str, Any]] = {}
+    required: set[str] = set()
+    for content_type_name in entry.get("use", []):
+        content_type = content_types.get(content_type_name, {})
+        properties.update(content_type.get("properties", {}))
+        required.update(content_type.get("required", []))
+    properties.update(entry.get("properties", {}))
+    required.update(entry.get("required", []))
+    return [
+        {
+            "name": name,
+            "type": describe_type(prop),
+            "description": prop.get("description", ""),
+            "required": name in required,
+        }
+        for name, prop in properties.items()
+    ]
+def validate_metadata(
+    source_type: str,
+    asset_kind: str,
+    data: dict[str, Any],
+) -> dict[str, Any]:
+    """Validate an emitted metadata dict against the catalog and return it.
+    Strict mode raises ``AssetMetadataContractError``; otherwise it logs a
+    warning. Checks: no undeclared keys, and every required field is present
+    with a non-null value.
+    """
+    try:
+        fields = resolve_fields(source_type, asset_kind)
+    except AssetMetadataContractError as exc:
+        if _strict_mode():
+            raise
+        logger.warning("Asset metadata contract: %s", exc)
+        return data
+    declared = {field["name"] for field in fields}
+    required = {field["name"] for field in fields if field["required"]}
+    present_non_null = {key for key, value in data.items() if value is not None}
+    undeclared = sorted(set(data) - declared)
+    missing_required = sorted(required - present_non_null)
+    if undeclared or missing_required:
+        message = (
+            f"[{source_type}/{asset_kind}] "
+            f"undeclared={undeclared} missing_required={missing_required}"
+        )
+        if _strict_mode():
+            raise AssetMetadataContractError(message)
+        logger.warning("Asset metadata contract drift: %s", message)
+    return data

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/base.py RENAMED Viewed

@@ -17,6 +17,11 @@ class BaseSource(ABC):
     Abstract base class for all metadata extraction sources.
     """
+    # Stable source identifier, overridden by each concrete source (e.g.
+    # "postgresql", "wordpress"). Uppercased it maps to the AssetType enum and
+    # the x-asset-metadata catalog key.
+    source_type: str = ""
     # Default batch size for streaming asset results
     BATCH_SIZE: int = 50
     HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
@@ -130,6 +135,22 @@ class BaseSource(ABC):
         """
         return calculate_checksum(data)
+    def metadata_fields(self, asset_kind: str, data: dict[str, Any]) -> dict[str, Any]:
+        """Build the ``asset_kind`` + ``metadata`` kwargs for SingleAssetScanResults.
+        Spread into the constructor: ``**self.metadata_fields("page", {...})``.
+        ``asset_kind`` is the catalog discriminator (persisted as the asset type
+        for display); ``metadata`` is validated against ``x-asset-metadata`` for
+        this source/kind — strict (raises) under pytest / ``CLASSIFYRE_STRICT_METADATA``,
+        otherwise a warning.
+        """
+        from .asset_metadata import validate_metadata
+        return {
+            "asset_kind": asset_kind,
+            "metadata": validate_metadata(self.source_type, asset_kind, data),
+        }
     @abstractmethod
     def abort(self) -> None:
         """

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/confluence/source.py RENAMED Viewed

@@ -313,6 +313,20 @@ class ConfluenceSource(BaseSource):
             "status": page.get("status"),
             "links_count": len(related_hashes),
         }
+        asset_metadata: dict[str, Any] = {
+            "page_id": page_id,
+            "title": title,
+            "links_count": len(related_hashes),
+        }
+        space_id = page.get("spaceId")
+        if space_id is not None:
+            asset_metadata["space_key"] = str(space_id)
+        status = page.get("status")
+        if isinstance(status, str) and status:
+            asset_metadata["status"] = status
+        author_id = page.get("authorId")
+        if isinstance(author_id, str) and author_id:
+            asset_metadata["author"] = author_id
         page_asset = SingleAssetScanResults(
             hash=page_hash,
             checksum=self.calculate_checksum(page_metadata),
@@ -330,6 +344,7 @@ class ConfluenceSource(BaseSource):
                 )
             ),
             runner_id=self.runner_id,
+            **self.metadata_fields("page", asset_metadata),
         )
         return [page_asset, *related_assets]
@@ -368,6 +383,15 @@ class ConfluenceSource(BaseSource):
             if download_url:
                 self._attachment_download_url_by_hash[attachment_hash] = download_url
+            attachment_metadata: dict[str, Any] = {
+                "title": attachment_name,
+                "page_hash": page_hash,
+            }
+            if mime:
+                attachment_metadata["mime_type"] = mime
+            file_size = attachment.get("fileSize")
+            if isinstance(file_size, int):
+                attachment_metadata["size_bytes"] = file_size
             assets.append(
                 SingleAssetScanResults(
                     hash=attachment_hash,
@@ -380,6 +404,7 @@ class ConfluenceSource(BaseSource):
                     created_at=now,
                     updated_at=now,
                     runner_id=self.runner_id,
+                    **self.metadata_fields("attachment", attachment_metadata),
                 )
             )
             hashes.append(attachment_hash)
@@ -453,6 +478,10 @@ class ConfluenceSource(BaseSource):
             created_at=now,
             updated_at=now,
             runner_id=self.runner_id,
+            **self.metadata_fields(
+                "comments",
+                {"page_id": page_id, "comments_count": len(comment_items)},
+            ),
         )
         return comments_asset, [comments_hash]
@@ -578,6 +607,7 @@ class ConfluenceSource(BaseSource):
             created_at=now,
             updated_at=now,
             runner_id=self.runner_id,
+            **self.metadata_fields("linked_file", {"referenced_by": page_hash}),
         )
     def _display_name_from_url(self, url: str) -> str:

{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/databricks/source.py RENAMED Viewed

@@ -719,6 +719,7 @@ class DatabricksSource(BaseTabularSource):
             created_at=now,
             updated_at=now,
             runner_id=self.runner_id,
+            **self.metadata_fields("notebook", metadata),
         )
     # ── Pipelines ────────────────────────────────────────────────────────
@@ -789,6 +790,7 @@ class DatabricksSource(BaseTabularSource):
             created_at=now,
             updated_at=now,
             runner_id=self.runner_id,
+            **self.metadata_fields("pipeline", metadata),
         )
     # ── Custom extract_raw (tables + notebooks + pipelines) ──────────────

classifyre-cli 0.4.12__tar.gz → 0.4.13__tar.gz

classifyre-cli 0.4.12__tar.gz → 0.4.13__tar.gz