PyPI - classifyre-cli - Versions diffs - 0.4.2__py3-none-any.whl - Mend

classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

classifyre_cli-0.4.2.dist-info/METADATA +167 -0
classifyre_cli-0.4.2.dist-info/RECORD +101 -0
classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
src/__init__.py +1 -0
src/detectors/__init__.py +105 -0
src/detectors/base.py +97 -0
src/detectors/broken_links/__init__.py +3 -0
src/detectors/broken_links/detector.py +280 -0
src/detectors/config.py +59 -0
src/detectors/content/__init__.py +0 -0
src/detectors/custom/__init__.py +13 -0
src/detectors/custom/detector.py +45 -0
src/detectors/custom/runners/__init__.py +56 -0
src/detectors/custom/runners/_base.py +177 -0
src/detectors/custom/runners/_factory.py +51 -0
src/detectors/custom/runners/_feature_extraction.py +138 -0
src/detectors/custom/runners/_gliner2.py +324 -0
src/detectors/custom/runners/_image_classification.py +98 -0
src/detectors/custom/runners/_llm.py +22 -0
src/detectors/custom/runners/_object_detection.py +107 -0
src/detectors/custom/runners/_regex.py +147 -0
src/detectors/custom/runners/_text_classification.py +109 -0
src/detectors/custom/trainer.py +293 -0
src/detectors/dependencies.py +109 -0
src/detectors/pii/__init__.py +0 -0
src/detectors/pii/detector.py +883 -0
src/detectors/secrets/__init__.py +0 -0
src/detectors/secrets/detector.py +399 -0
src/detectors/threat/__init__.py +0 -0
src/detectors/threat/code_security_detector.py +206 -0
src/detectors/threat/yara_detector.py +177 -0
src/main.py +608 -0
src/models/generated_detectors.py +1296 -0
src/models/generated_input.py +2732 -0
src/models/generated_single_asset_scan_results.py +240 -0
src/outputs/__init__.py +3 -0
src/outputs/base.py +69 -0
src/outputs/console.py +62 -0
src/outputs/factory.py +156 -0
src/outputs/file.py +83 -0
src/outputs/rest.py +258 -0
src/pipeline/__init__.py +7 -0
src/pipeline/content_provider.py +26 -0
src/pipeline/detector_pipeline.py +742 -0
src/pipeline/parsed_content_provider.py +59 -0
src/sandbox/__init__.py +5 -0
src/sandbox/runner.py +145 -0
src/sources/__init__.py +95 -0
src/sources/atlassian_common.py +389 -0
src/sources/azure_blob_storage/__init__.py +3 -0
src/sources/azure_blob_storage/source.py +130 -0
src/sources/base.py +296 -0
src/sources/confluence/__init__.py +3 -0
src/sources/confluence/source.py +733 -0
src/sources/databricks/__init__.py +3 -0
src/sources/databricks/source.py +1279 -0
src/sources/dependencies.py +81 -0
src/sources/google_cloud_storage/__init__.py +3 -0
src/sources/google_cloud_storage/source.py +114 -0
src/sources/hive/__init__.py +3 -0
src/sources/hive/source.py +709 -0
src/sources/jira/__init__.py +3 -0
src/sources/jira/source.py +605 -0
src/sources/mongodb/__init__.py +3 -0
src/sources/mongodb/source.py +550 -0
src/sources/mssql/__init__.py +3 -0
src/sources/mssql/source.py +1034 -0
src/sources/mysql/__init__.py +3 -0
src/sources/mysql/source.py +797 -0
src/sources/neo4j/__init__.py +0 -0
src/sources/neo4j/source.py +523 -0
src/sources/object_storage/base.py +679 -0
src/sources/oracle/__init__.py +3 -0
src/sources/oracle/source.py +982 -0
src/sources/postgresql/__init__.py +3 -0
src/sources/postgresql/source.py +774 -0
src/sources/powerbi/__init__.py +3 -0
src/sources/powerbi/source.py +774 -0
src/sources/recipe_normalizer.py +179 -0
src/sources/s3_compatible_storage/README.md +66 -0
src/sources/s3_compatible_storage/__init__.py +3 -0
src/sources/s3_compatible_storage/source.py +150 -0
src/sources/servicedesk/__init__.py +3 -0
src/sources/servicedesk/source.py +620 -0
src/sources/slack/__init__.py +3 -0
src/sources/slack/source.py +534 -0
src/sources/snowflake/__init__.py +3 -0
src/sources/snowflake/source.py +912 -0
src/sources/tableau/__init__.py +3 -0
src/sources/tableau/source.py +799 -0
src/sources/tabular_utils.py +165 -0
src/sources/wordpress/__init__.py +3 -0
src/sources/wordpress/source.py +590 -0
src/telemetry.py +96 -0
src/utils/__init__.py +1 -0
src/utils/content_extraction.py +108 -0
src/utils/file_parser.py +777 -0
src/utils/hashing.py +82 -0
src/utils/uv_sync.py +79 -0
src/utils/validation.py +56 -0

src/pipeline/parsed_content_provider.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""ContentProvider that wraps a BaseSource and applies file_parser for binary→text conversion."""
+from __future__ import annotations
+import asyncio
+from collections.abc import AsyncGenerator
+from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
+from ..sources.base import BaseSource
+class ParsedContentProvider:
+    """
+    Adapter exposing a BaseSource's content to the pipeline as text pages or raw bytes.
+    Text path: pages come from ``source.fetch_content_pages()``; entries whose text is
+    empty are skipped.  Only when that produced no text at all does it fall back to
+    ``source.fetch_content_bytes()`` and run the payload through
+    ``source.iter_asset_pages()``.
+    Binary path: ``fetch_bytes()`` forwards to ``source.fetch_content_bytes()``.
+    The remaining methods delegate unchanged to the wrapped source.
+    """
+    def __init__(self, source: BaseSource) -> None:
+        self._source = source
+    async def fetch_text_pages(self, asset_id: str) -> AsyncGenerator[str, None]:
+        """Yield the text pages of ``asset_id``, preferring the source's own text pages."""
+        emitted_any = False
+        async for _raw, page_text in self._source.fetch_content_pages(asset_id):
+            if not page_text:
+                continue
+            emitted_any = True
+            yield page_text
+        if emitted_any:
+            return
+        # Nothing textual came back: try the raw payload instead.
+        fetched = await self._source.fetch_content_bytes(asset_id)
+        if fetched is None:
+            return
+        payload, mime_type = fetched
+        # The page iterator is drained in a worker thread and fully collected
+        # before any fallback page is yielded.
+        page_iterator = self._source.iter_asset_pages(payload, mime_type)
+        parsed_pages: list[str] = await asyncio.to_thread(list, page_iterator)
+        for parsed_page in parsed_pages:
+            yield parsed_page
+    async def fetch_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
+        """Return what ``source.fetch_content_bytes()`` returns for ``asset_id``."""
+        fetched = await self._source.fetch_content_bytes(asset_id)
+        return fetched
+    def enrich_finding_location(
+        self,
+        finding: DetectionResult,
+        asset: SingleAssetScanResults,
+        text_content: str,
+    ) -> None:
+        """Forward location enrichment of ``finding`` to the wrapped source."""
+        self._source.enrich_finding_location(finding, asset, text_content)
+    def resolve_link_for_detection(self, link: str) -> str | None:
+        """Forward link resolution to the wrapped source and return its answer."""
+        resolved = self._source.resolve_link_for_detection(link)
+        return resolved

src/sandbox/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Sandbox module for running detectors on local files."""
+from .runner import SandboxRunner
+__all__ = ["SandboxRunner"]

src/sandbox/runner.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""SandboxRunner: run detectors on a local file."""
+from __future__ import annotations
+import asyncio
+import logging
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+from ..models.generated_single_asset_scan_results import DetectionResult
+from ..utils.file_parser import ParsedFile, parse_file
+logger = logging.getLogger(__name__)
+_CONTENT_SIZE_LIMIT = 1_048_576  # 1 MB
+class SandboxRunner:
+    """Run a set of detectors against a single local file.
+    ``detectors_config`` is a list of dicts; each entry may carry ``enabled``
+    (defaults to true), ``type`` and ``config`` keys.  Detectors that fail to
+    initialize are logged and skipped rather than aborting the run.
+    """
+    def __init__(self, detectors_config: list[dict[str, Any]]) -> None:
+        self._config = detectors_config
+    def _build_detectors(self) -> list[Any]:
+        """Instantiate every enabled detector from the config, skipping failures."""
+        from ..detectors import get_detector
+        from ..detectors.config import parse_detector_config
+        detectors = []
+        for item in self._config:
+            if not item.get("enabled", True):
+                continue
+            # A missing or null "type" is normalized to "" so the problem is
+            # reported by the handler below instead of raising AttributeError
+            # here and aborting construction of every other detector.
+            detector_type = str(item.get("type") or "").upper()
+            raw_config = item.get("config", {})
+            try:
+                detector_name, typed_config = parse_detector_config(
+                    detector_type=detector_type,
+                    raw_config=raw_config,
+                )
+                detector = get_detector(detector_name, typed_config)
+                detectors.append(detector)
+                logger.info(f"Initialized sandbox detector: {detector_name}")
+            except Exception as e:
+                logger.error(f"Failed to initialize detector {detector_type}: {e}")
+        return detectors
+    @staticmethod
+    def _is_binary_detector(detector: Any) -> bool:
+        """Return True when the detector declares any image/audio/video or octet-stream type."""
+        return any(
+            ct.startswith(("image/", "audio/", "video/")) or ct == "application/octet-stream"
+            for ct in detector.get_supported_content_types()
+        )
+    @staticmethod
+    def _supports_mime(supported: list[str], mime_type: str) -> bool:
+        """Return True when ``mime_type`` is listed exactly or matched by a ``type/*`` entry."""
+        if mime_type in supported:
+            return True
+        # "image/*"[:-1] == "image/", i.e. a prefix match on the major type.
+        return any(s.endswith("/*") and mime_type.startswith(s[:-1]) for s in supported)
+    async def run_async(self, file_path: Path) -> tuple[ParsedFile, list[DetectionResult]]:
+        """Parse the file and run all enabled detectors.
+        Binary files are handed as raw bytes to detectors that declare a binary
+        content type matching the file's MIME type; other files are handed as
+        extracted text to detectors that list ``text/plain``.  Content above
+        ``_CONTENT_SIZE_LIMIT`` is truncated.  Returns the parsed file together
+        with the findings, each stamped with ``runner_id="sandbox"`` and a
+        shared ``detected_at`` timestamp.
+        """
+        parsed = parse_file(file_path)
+        detectors = self._build_detectors()
+        if not detectors:
+            return parsed, []
+        tasks = []
+        active_detectors = []
+        if parsed.is_binary:
+            raw_bytes = file_path.read_bytes()
+            mime_type = parsed.mime_type
+            if len(raw_bytes) > _CONTENT_SIZE_LIMIT:
+                logger.warning(
+                    f"Binary content ({len(raw_bytes)} bytes) exceeds limit "
+                    f"({_CONTENT_SIZE_LIMIT} bytes); truncating."
+                )
+                raw_bytes = raw_bytes[:_CONTENT_SIZE_LIMIT]
+            for detector in detectors:
+                if self._is_binary_detector(detector) and self._supports_mime(
+                    detector.get_supported_content_types(), mime_type
+                ):
+                    tasks.append(detector.detect(raw_bytes, mime_type))
+                    active_detectors.append(detector)
+        else:
+            if parsed.parse_error:
+                logger.warning(
+                    "Text extraction failed (%s): %s", parsed.mime_type, parsed.parse_error
+                )
+            text = parsed.text_content
+            if not text.strip():
+                logger.warning(
+                    "No text content extracted from %s file; skipping text detectors.",
+                    parsed.mime_type,
+                )
+                return parsed, []
+            if len(text) > _CONTENT_SIZE_LIMIT:
+                # len() of a str counts characters, so the limit is applied
+                # (and reported) in characters on this path.
+                logger.warning(
+                    f"Content size ({len(text)} characters) exceeds limit "
+                    f"({_CONTENT_SIZE_LIMIT} characters); truncating."
+                )
+                text = text[:_CONTENT_SIZE_LIMIT]
+            for detector in detectors:
+                supported = detector.get_supported_content_types()
+                if "text/plain" in supported:
+                    tasks.append(detector.detect(text, "text/plain"))
+                    active_detectors.append(detector)
+        if not tasks:
+            return parsed, []
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        all_findings: list[DetectionResult] = []
+        stamp = {"runner_id": "sandbox", "detected_at": datetime.now(UTC)}
+        # tasks and active_detectors are appended in lockstep, so strict=True
+        # only documents (and enforces) that invariant.
+        for detector, result in zip(active_detectors, results, strict=True):
+            # gather(return_exceptions=True) can also hand back BaseException
+            # instances such as CancelledError; report those too instead of
+            # dropping them silently.
+            if isinstance(result, BaseException):
+                logger.error(f"Detector {detector.__class__.__name__} failed: {result}")
+                continue
+            if isinstance(result, list):
+                all_findings.extend(
+                    finding.model_copy(update=stamp)
+                    for finding in result
+                    if isinstance(finding, DetectionResult)
+                )
+        return parsed, all_findings
+    def run(self, file_path: Path) -> tuple[ParsedFile, list[DetectionResult]]:
+        """Synchronous wrapper around run_async."""
+        return asyncio.run(self.run_async(file_path))

src/sources/__init__.py ADDED Viewed

@@ -0,0 +1,95 @@
+import importlib
+import inspect
+import logging
+import pkgutil
+from typing import Any
+from .base import BaseSource
+logger = logging.getLogger(__name__)
+_registry: dict[str, type[BaseSource]] = {}
+def _discover_sources() -> None:
+    """
+    Automatically discover and register all BaseSource subclasses
+    in the subpackages of src.sources.
+    Runs only while the registry is empty.  Every non-package module below this
+    package is imported and each concrete (non-abstract) BaseSource subclass
+    found in it is registered under its ``source_type`` attribute, or under the
+    lower-cased class name with "Source" removed when that attribute is
+    missing or empty.  Failures while processing a module are logged and do not
+    stop discovery.
+    """
+    if _registry:
+        return
+    # Iterate over all modules below the current package
+    for _loader, module_name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
+        if is_pkg:
+            continue
+        try:
+            module = importlib.import_module(module_name)
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                # Keep only concrete classes deriving from BaseSource (not BaseSource itself)
+                if not (
+                    isinstance(attr, type)
+                    and issubclass(attr, BaseSource)
+                    and attr is not BaseSource
+                    and not inspect.isabstract(attr)
+                ):
+                    continue
+                source_type = getattr(attr, "source_type", None)
+                if not source_type:
+                    # Fallback: WordPressSource -> wordpress
+                    source_type = attr.__name__.replace("Source", "").lower()
+                registered = _registry.get(source_type)
+                if registered is attr:
+                    # The very same class is visible from another module (it was
+                    # imported there); that is not a conflicting registration.
+                    continue
+                if registered is not None:
+                    logger.warning(
+                        f"Duplicate source type '{source_type}' registered by {attr.__name__}"
+                    )
+                else:
+                    _registry[source_type] = attr
+                    logger.debug(f"Registered source type '{source_type}' from {module_name}")
+        except Exception as e:
+            logger.error(f"Failed to import module {module_name}: {e}")
+def get_source(
+    recipe: dict[str, Any],
+    source_id: str | None = None,
+    runner_id: str | None = None,
+) -> BaseSource:
+    """
+    Factory function to create a source instance from a recipe.
+    The source class is looked up by the lower-cased ``type`` of the recipe and
+    constructed exactly once, receiving only those attribution keyword
+    arguments (``source_id``, ``runner_id``) that its signature accepts.
+    Args:
+        recipe: Source configuration
+        source_id: Optional source ID for asset attribution
+        runner_id: Optional runner ID for tracking
+    Raises:
+        ValueError: If the recipe has no ``type`` or the type is not registered.
+    """
+    _discover_sources()
+    # ``or ""`` also covers an explicit null type, which would otherwise fail
+    # with AttributeError instead of the ValueError below.
+    source_type = str(recipe.get("type") or "").lower()
+    if not source_type:
+        raise ValueError("Recipe must have a 'type' field")
+    source_class = _registry.get(source_type)
+    if not source_class:
+        available = ", ".join(sorted(_registry.keys()))
+        raise ValueError(f"Source type '{source_type}' not found. Available sources: {available}")
+    attribution = {"source_id": source_id, "runner_id": runner_id}
+    # Inspect the constructor instead of retrying on TypeError: a TypeError
+    # raised *inside* a constructor would otherwise be mistaken for a signature
+    # mismatch, re-running the constructor and silently dropping attribution.
+    try:
+        parameters = inspect.signature(source_class).parameters
+    except (TypeError, ValueError):
+        # Signature cannot be introspected; prefer full attribution.
+        return source_class(recipe, **attribution)
+    takes_any_keyword = any(
+        parameter.kind is inspect.Parameter.VAR_KEYWORD for parameter in parameters.values()
+    )
+    if takes_any_keyword:
+        accepted = attribution
+    else:
+        accepted = {name: value for name, value in attribution.items() if name in parameters}
+    return source_class(recipe, **accepted)
+def list_available_sources() -> list[str]:
+    """Return the registered source type names in sorted order, discovering sources first."""
+    _discover_sources()
+    type_names = list(_registry)
+    type_names.sort()
+    return type_names

src/sources/atlassian_common.py ADDED Viewed

@@ -0,0 +1,389 @@
+from __future__ import annotations
+import json
+import logging
+import random
+import re
+import time
+from datetime import UTC, datetime
+from typing import Any
+from urllib.parse import urljoin, urlsplit, urlunsplit
+import requests
+from ..utils.hashing import normalize_http_url
+logger = logging.getLogger(__name__)
+# Matches http:// and https:// URLs (case-insensitive) made of word characters
+# and the punctuation listed in the character class.
+URL_RE = re.compile(r"https?://[\w\-._~:/?#\[\]@!$&'()*+,;=%]+", re.IGNORECASE)
+# MIME types that is_tabular_mime_type() reports as tabular.
+TABULAR_MIME_TYPES = {
+    "text/csv",
+    "text/tab-separated-values",
+    "application/vnd.ms-excel",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    "application/parquet",
+    "application/vnd.apache.parquet",
+}
+# File name suffixes that is_tabular_filename() reports as tabular.
+TABULAR_FILE_EXTENSIONS = {
+    ".csv",
+    ".tsv",
+    ".xls",
+    ".xlsx",
+    ".parquet",
+}
+def normalize_atlassian_base_url(url: str, *, strip_wiki: bool = False) -> str:
+    """Return ``url`` reduced to scheme, host and path without trailing slashes.
+    The URL first goes through ``normalize_http_url``; a falsy result raises
+    ``ValueError``.  Query and fragment are dropped.  With ``strip_wiki`` a
+    trailing ``/wiki`` path segment is removed as well.
+    """
+    cleaned_url = normalize_http_url(url)
+    if not cleaned_url:
+        raise ValueError(f"Invalid Atlassian base URL: {url}")
+    parts = urlsplit(cleaned_url)
+    trimmed_path = parts.path.rstrip("/")
+    if strip_wiki:
+        trimmed_path = trimmed_path.removesuffix("/wiki")
+    return urlunsplit((parts.scheme, parts.netloc, trimmed_path, "", ""))
+def parse_datetime(value: str | None) -> datetime:
+    """Parse an ISO-8601 timestamp into a timezone-aware datetime.
+    "Z" is rewritten to "+00:00" before parsing and naive results are tagged
+    as UTC.  A missing or unparseable value yields the current UTC time.
+    """
+    if not value:
+        return datetime.now(UTC)
+    iso_text = value.replace("Z", "+00:00")
+    try:
+        moment = datetime.fromisoformat(iso_text)
+    except ValueError:
+        return datetime.now(UTC)
+    return moment if moment.tzinfo is not None else moment.replace(tzinfo=UTC)
+def dedupe_preserve_order(values: list[str]) -> list[str]:
+    """Return a new list with duplicates removed, keeping each value's first position."""
+    # dict keys are unique and keep insertion order.
+    return list(dict.fromkeys(values))
+def deterministic_sample(items: list[Any], limit: int) -> list[Any]:
+    """Return at most ``limit`` items, chosen reproducibly and kept in original order.
+    The choice uses a generator seeded with 0, so the same input length and
+    limit always select the same positions.  A zero or negative ``limit``
+    yields an empty list.  The result is always a new list, never ``items``
+    itself, so callers may mutate it freely.
+    """
+    if limit <= 0:
+        # random.sample() rejects a negative sample size with ValueError.
+        return []
+    if limit >= len(items):
+        return list(items)
+    generator = random.Random(0)
+    indexes = sorted(generator.sample(range(len(items)), k=limit))
+    return [items[i] for i in indexes]
+def extract_urls_from_text(text: str) -> list[str]:
+    """Return the distinct URLs matched by ``URL_RE`` in ``text``, in order of first appearance."""
+    if not text:
+        return []
+    # URL_RE has no capturing groups, so findall() returns the full matches.
+    found = URL_RE.findall(text)
+    return dedupe_preserve_order(found)
+def is_tabular_mime_type(mime_type: str) -> bool:
+    """Return True when the MIME type, ignoring parameters, case and padding, is in ``TABULAR_MIME_TYPES``."""
+    essence, _separator, _parameters = mime_type.partition(";")
+    return essence.strip().lower() in TABULAR_MIME_TYPES
+def is_tabular_filename(file_name: str) -> bool:
+    """Return True when the path part of ``file_name`` ends with one of ``TABULAR_FILE_EXTENSIONS`` (case-insensitive)."""
+    lowered_path = urlsplit(file_name).path.lower()
+    # str.endswith() accepts a tuple of candidate suffixes.
+    return lowered_path.endswith(tuple(TABULAR_FILE_EXTENSIONS))
+class AtlassianCloudClient:
+    """HTTP client for Atlassian Cloud REST APIs built on a ``requests.Session``.
+    Requests use HTTP basic auth (account email + API token) and an
+    ``Accept: application/json`` header.  HTTP 429 and 5xx responses are
+    retried up to ``max_retries`` times.  Pagination helpers are provided for
+    the Confluence, Jira search and Service Desk response shapes.
+    The client may be used as a context manager; the session is closed on exit.
+    """
+    def __init__(
+        self,
+        *,
+        base_url: str,
+        account_email: str,
+        api_token: str,
+        request_timeout_seconds: float = 30,
+        max_retries: int = 3,
+        rate_limit_delay_seconds: float = 0,
+    ) -> None:
+        """Create the session; the timeout is clamped to >= 1s, retries and delay to >= 0."""
+        self.base_url = base_url.rstrip("/")
+        self.request_timeout_seconds = max(float(request_timeout_seconds), 1.0)
+        self.max_retries = max(int(max_retries), 0)
+        self.rate_limit_delay_seconds = max(float(rate_limit_delay_seconds), 0.0)
+        self.session = requests.Session()
+        self.session.auth = (account_email, api_token)
+        self.session.headers.update(
+            {
+                "Accept": "application/json",
+            }
+        )
+    def __enter__(self) -> AtlassianCloudClient:
+        return self
+    def __exit__(self, *exc_info: object) -> None:
+        self.close()
+    def close(self) -> None:
+        """Close the underlying session."""
+        self.session.close()
+    def build_url(self, path_or_url: str) -> str:
+        """Return absolute http(s) URLs unchanged; otherwise append the path to ``base_url``."""
+        if path_or_url.startswith(("http://", "https://")):
+            return path_or_url
+        if not path_or_url.startswith("/"):
+            path_or_url = f"/{path_or_url}"
+        return f"{self.base_url}{path_or_url}"
+    def _request(
+        self,
+        method: str,
+        path_or_url: str,
+        *,
+        params: dict[str, Any] | None = None,
+        headers: dict[str, str] | None = None,
+        stream: bool = False,
+    ) -> requests.Response:
+        """Send one request, retrying on HTTP 429 and 5xx while attempts remain.
+        A 429 waits for the ``Retry-After`` value (at least 1s, 1s when missing
+        or unparseable); a 5xx backs off 1, 2, 4, 8, 8... seconds.  The final
+        response is returned whatever its status; callers decide whether to
+        raise.  ``rate_limit_delay_seconds`` is slept before returning.
+        """
+        url = self.build_url(path_or_url)
+        # Always make at least one attempt.
+        max_attempts = max(self.max_retries + 1, 1)
+        attempts = 0
+        while True:
+            attempts += 1
+            response = self.session.request(
+                method,
+                url,
+                params=params,
+                headers=headers,
+                timeout=self.request_timeout_seconds,
+                stream=stream,
+            )
+            can_retry = attempts < max_attempts
+            if can_retry and response.status_code == 429:
+                retry_after_header = response.headers.get("Retry-After")
+                sleep_seconds = 1
+                if retry_after_header:
+                    try:
+                        sleep_seconds = max(int(float(retry_after_header)), 1)
+                    except (ValueError, OverflowError):
+                        # Non-numeric values and "inf" fall back to 1s.
+                        sleep_seconds = 1
+                logger.warning("Atlassian rate limit hit for %s. Retrying in %ss", url, sleep_seconds)
+            elif can_retry and response.status_code >= 500:
+                sleep_seconds = min(2 ** (attempts - 1), 8)
+                logger.warning(
+                    "Atlassian server error %s for %s. Retrying in %ss",
+                    response.status_code,
+                    url,
+                    sleep_seconds,
+                )
+            else:
+                if self.rate_limit_delay_seconds > 0:
+                    time.sleep(self.rate_limit_delay_seconds)
+                return response
+            # The response is being discarded: close it so a streamed, unread
+            # body does not keep its pooled connection checked out.
+            response.close()
+            time.sleep(sleep_seconds)
+    def get_json(
+        self,
+        path_or_url: str,
+        *,
+        params: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """GET a JSON object.
+        Raises:
+            requests.HTTPError: On an error status (via ``raise_for_status``).
+            RuntimeError: If the body is not JSON or not a JSON object.
+        """
+        response = self._request("GET", path_or_url, params=params)
+        response.raise_for_status()
+        try:
+            payload = response.json()
+        except ValueError as exc:
+            raise RuntimeError(
+                f"Atlassian API returned non-JSON response for {path_or_url}"
+            ) from exc
+        if not isinstance(payload, dict):
+            raise RuntimeError(f"Expected JSON object response for {path_or_url}")
+        return payload
+    def get_bytes(self, path_or_url: str) -> tuple[bytes, str]:
+        """GET a resource and return ``(body, mime)``.
+        ``mime`` is the lower-cased ``Content-Type`` without parameters, or ""
+        when the header is absent.  The streamed response is always closed,
+        including when the status check or the download fails.
+        """
+        response = self._request("GET", path_or_url, stream=True)
+        try:
+            response.raise_for_status()
+            body = b"".join(chunk for chunk in response.iter_content(chunk_size=8192) if chunk)
+            mime = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
+        finally:
+            response.close()
+        return body, mime
+    def iter_confluence_results(
+        self,
+        path: str,
+        *,
+        params: dict[str, Any] | None = None,
+    ) -> list[dict[str, Any]]:
+        """Collect the dict entries of ``results`` from every page, following ``_links.next``."""
+        url = path
+        next_params = dict(params or {})
+        results: list[dict[str, Any]] = []
+        while True:
+            payload = self.get_json(url, params=next_params)
+            page_items = payload.get("results", [])
+            if isinstance(page_items, list):
+                results.extend(item for item in page_items if isinstance(item, dict))
+            links = payload.get("_links", {})
+            next_link = links.get("next") if isinstance(links, dict) else None
+            if not isinstance(next_link, str) or not next_link:
+                break
+            url = urljoin(self.base_url + "/", next_link)
+            # Only the first request carries the caller's query parameters;
+            # follow-up requests use the next link as returned.
+            next_params = None
+        return results
+    def iter_jira_search_jql(
+        self,
+        *,
+        jql: str,
+        fields: list[str],
+        max_results: int = 100,
+    ) -> list[dict[str, Any]]:
+        """Collect the dict entries of ``issues`` from ``/rest/api/3/search/jql``, paging by ``nextPageToken``."""
+        results: list[dict[str, Any]] = []
+        next_page_token: str | None = None
+        while True:
+            params: dict[str, Any] = {
+                "jql": jql,
+                "maxResults": max_results,
+                "fields": ",".join(fields),
+            }
+            if next_page_token:
+                params["nextPageToken"] = next_page_token
+            payload = self.get_json("/rest/api/3/search/jql", params=params)
+            issues = payload.get("issues", [])
+            if isinstance(issues, list):
+                results.extend(issue for issue in issues if isinstance(issue, dict))
+            if payload.get("isLast") is True:
+                break
+            # A missing or empty token also ends pagination.
+            token = payload.get("nextPageToken")
+            if not token:
+                break
+            next_page_token = str(token)
+        return results
+    def iter_servicedesk_values(
+        self,
+        path: str,
+        *,
+        params: dict[str, Any] | None = None,
+        limit: int = 50,
+    ) -> list[dict[str, Any]]:
+        """Collect the dict entries of ``values`` from every page, paging by ``start``/``limit``."""
+        results: list[dict[str, Any]] = []
+        start = 0
+        base_params = dict(params or {})
+        page_limit = max(int(limit), 1)
+        while True:
+            request_params = dict(base_params)
+            request_params["start"] = start
+            request_params["limit"] = page_limit
+            payload = self.get_json(path, params=request_params)
+            values = payload.get("values", [])
+            if isinstance(values, list):
+                results.extend(item for item in values if isinstance(item, dict))
+            if payload.get("isLastPage") is True:
+                break
+            # Advance by the reported page size, falling back to the number of
+            # values received; an empty page ends the loop.
+            size = payload.get("size")
+            try:
+                size_int = int(size)
+            except (TypeError, ValueError):
+                size_int = len(values) if isinstance(values, list) else 0
+            if size_int <= 0:
+                break
+            start += size_int
+        return results
+def parse_atlassian_document(value: Any) -> tuple[str, list[str]]:
+    """Walk a nested document structure and return ``(text, urls)``.
+    ``text`` is every non-empty string found, joined with newlines; ``urls``
+    is the de-duplicated list of ``attrs.url`` / ``attrs.href`` values plus the
+    URLs found inside those strings.  For a dict, ``attrs`` is inspected first,
+    then ``text``, then ``content``, then every other key in order.
+    """
+    collected_text: list[str] = []
+    collected_urls: list[str] = []
+    priority_keys = ("text", "content")
+    def walk(node: Any) -> None:
+        if isinstance(node, str):
+            collected_text.append(node)
+            collected_urls.extend(extract_urls_from_text(node))
+        elif isinstance(node, list):
+            for child in node:
+                walk(child)
+        elif isinstance(node, dict):
+            attrs = node.get("attrs")
+            if isinstance(attrs, dict):
+                collected_urls.extend(
+                    attrs[name] for name in ("url", "href") if isinstance(attrs.get(name), str)
+                )
+            for name in priority_keys:
+                if name in node:
+                    walk(node[name])
+            # NOTE(review): string values under any other key (for example a
+            # node's "type") are collected as text too - confirm this is intended.
+            for name, child in node.items():
+                if name not in ("attrs", *priority_keys) and isinstance(child, (dict, list, str)):
+                    walk(child)
+    walk(value)
+    return "\n".join(filter(None, collected_text)), dedupe_preserve_order(collected_urls)
+def looks_like_file_asset(url: str) -> bool:
+    """Return True when the URL's path ends with a known file extension (case-insensitive)."""
+    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".ico")
+    video_suffixes = (".mp4", ".webm", ".mov", ".mkv", ".avi")
+    audio_suffixes = (".mp3", ".wav", ".aac", ".ogg")
+    document_suffixes = (".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx")
+    archive_suffixes = (".zip", ".rar", ".7z", ".tar", ".gz")
+    data_suffixes = (".json", ".xml", ".txt", ".csv", ".md")
+    known_suffixes = (
+        image_suffixes
+        + video_suffixes
+        + audio_suffixes
+        + document_suffixes
+        + archive_suffixes
+        + data_suffixes
+    )
+    # Only the path is examined, so query strings and fragments are ignored.
+    lowered_path = urlsplit(url).path.lower()
+    return lowered_path.endswith(known_suffixes)
+def json_dumps(data: dict[str, Any]) -> str:
+    """Serialize ``data`` to JSON, keeping non-ASCII characters and using ``str()`` for values JSON cannot encode."""
+    options: dict[str, Any] = {"ensure_ascii": False, "default": str}
+    return json.dumps(data, **options)

src/sources/azure_blob_storage/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .source import AzureBlobStorageSource
+__all__ = ["AzureBlobStorageSource"]