classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Helpers for optional source dependencies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
import logging
|
|
7
|
+
from types import ModuleType
|
|
8
|
+
|
|
9
|
+
from src.utils.uv_sync import auto_install_enabled, sync_group
|
|
10
|
+
|
|
11
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MissingSourceDependencyError(RuntimeError):
    """Raised when an optional source dependency is unavailable.

    The exception message names the source, lists the missing dependencies
    and, when at least one uv group is supplied, suggests the matching
    ``uv sync --group <name>`` command(s). Any extra ``detail`` text is
    appended at the end of the message.

    The constructor arguments are kept on the instance as ``source_name``,
    ``dependencies``, ``uv_groups`` and ``detail`` so callers can inspect
    them without parsing the message.
    """

    def __init__(
        self,
        source_name: str,
        dependencies: list[str],
        uv_groups: list[str],
        detail: str | None = None,
    ) -> None:
        """Store the context and build the human-readable message.

        Args:
            source_name: Display name of the source that needs the dependency.
            dependencies: Names of the missing packages/modules.
            uv_groups: uv dependency groups that provide them; may be empty.
            detail: Optional extra text appended to the message.
        """
        self.source_name = source_name
        self.dependencies = dependencies
        self.uv_groups = uv_groups
        self.detail = detail

        deps = ", ".join(dependencies)
        message = f"{source_name} source requires optional dependencies ({deps})."
        # Only emit an install hint when there is a group to suggest;
        # otherwise the message would end in a dangling "Install with .".
        if uv_groups:
            group_hint = " or ".join(f"`uv sync --group {group}`" for group in uv_groups)
            message = f"{message} Install with {group_hint}."
        if detail:
            message = f"{message} {detail}"
        super().__init__(message)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ordered_groups(groups: list[str]) -> list[str]:
|
|
41
|
+
return sorted(dict.fromkeys(groups))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def require_module(
    module_name: str,
    source_name: str,
    uv_groups: list[str],
    detail: str | None = None,
) -> ModuleType:
    """Import a module or raise MissingSourceDependencyError with uv guidance.

    The module is imported with ``importlib.import_module``. If that fails
    and ``auto_install_enabled()`` is true, each distinct group in
    ``uv_groups`` is passed to ``sync_group`` in sorted order; after every
    sync that reports success the import is retried and the module is
    returned as soon as one retry works.

    Args:
        module_name: Dotted name of the module to import.
        source_name: Display name of the source, used in the error message.
        uv_groups: uv dependency groups that may provide the module.
        detail: Optional lead-in sentence for the error detail.

    Returns:
        The imported module.

    Raises:
        MissingSourceDependencyError: If the module cannot be imported, even
            after any attempted installs. The original import error is
            chained as the cause.
    """
    try:
        return importlib.import_module(module_name)
    except Exception as exc:  # pragma: no cover - environment dependent
        # Every failure mode of the import (not only ImportError) is reported
        # as a missing dependency; the triggering error is kept in the detail.
        detail_messages: list[str] = [f"Original error: {exc}"]

        if auto_install_enabled() and uv_groups:
            for group in _ordered_groups(uv_groups):
                success, install_detail = sync_group(group)
                if install_detail:
                    detail_messages.append(install_detail)
                if not success:
                    continue

                try:
                    # Make packages installed by the sync visible to the
                    # import system before retrying.
                    importlib.invalidate_caches()
                    return importlib.import_module(module_name)
                except Exception as retry_exc:  # pragma: no cover
                    detail_messages.append(
                        f"Module '{module_name}' still unavailable "
                        f"after installing '{group}': {retry_exc}"
                    )

        base_detail = (detail or "").strip() or "Optional dependency import failed"
        # Callers often pass a complete sentence; do not add a second period
        # when the lead-in already ends with terminal punctuation.
        separator = " " if base_detail.endswith((".", "!", "?")) else ". "
        error_detail = f"{base_detail}{separator}{'; '.join(detail_messages)}"
        raise MissingSourceDependencyError(
            source_name=source_name,
            dependencies=[module_name.split(".", maxsplit=1)[0]],
            uv_groups=uv_groups,
            detail=error_detail,
        ) from exc
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ...models.generated_input import GoogleCloudStorageInput
|
|
8
|
+
from ..dependencies import require_module
|
|
9
|
+
from ..object_storage.base import ObjectRef, ObjectStorageSourceBase
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GoogleCloudStorageSource(ObjectStorageSourceBase):
    """Object-storage source that lists and downloads blobs from a GCS bucket."""

    source_type = "google_cloud_storage"
    provider_label = "GOOGLE_CLOUD_STORAGE"
    input_model = GoogleCloudStorageInput

    def _required_bucket(self) -> str:
        """Return the configured bucket name with surrounding whitespace removed.

        Raises:
            ValueError: If the stripped bucket name is empty.
        """
        name = str(self.config.required.bucket).strip()
        if name:
            return name
        raise ValueError("required.bucket must be set")

    def _build_client(self) -> Any:
        """Create a ``google.cloud.storage`` client from the connection options.

        Credentials are chosen in this order: inline service-account JSON
        (``gcp_credentials_json``), then a service-account file
        (``gcp_credentials_file``); with neither, the client is built without
        explicit credentials.
        """
        storage = require_module(
            module_name="google.cloud.storage",
            source_name="Google Cloud Storage",
            uv_groups=["google-cloud-storage"],
            detail="Google Cloud Storage requires google-cloud-storage.",
        )

        project_id = self._string_or_none(self._connection_option("project_id"))
        inline_json = self._masked_value("gcp_credentials_json")
        key_file = self._string_or_none(self._connection_option("gcp_credentials_file"))

        def credentials_class(detail: str) -> Any:
            # The service-account module is imported only when explicit
            # credentials are configured; `detail` customises the error text.
            module = require_module(
                module_name="google.oauth2.service_account",
                source_name="Google Cloud Storage",
                uv_groups=["google-cloud-storage"],
                detail=detail,
            )
            return module.Credentials

        if inline_json:
            factory = credentials_class(
                "Inline service account credentials require google-auth support."
            )
            creds = factory.from_service_account_info(json.loads(inline_json))
            return storage.Client(project=project_id, credentials=creds)

        if key_file:
            factory = credentials_class(
                "File-based service account credentials require google-auth support."
            )
            creds = factory.from_service_account_file(key_file)
            return storage.Client(project=project_id, credentials=creds)

        return storage.Client(project=project_id)

    def _client(self) -> Any:
        """Return the storage client, building and caching it on first use."""
        client = self._cached_client
        if client is None:
            client = self._build_client()
            self._cached_client = client
        return client

    def _list_objects(self) -> Iterator[ObjectRef]:
        """Yield an ``ObjectRef`` for every blob in the bucket that passes the filters.

        Skipped entries: blobs with no name or a name ending in ``/``,
        zero-size blobs unless ``_include_empty_objects()`` is true, and keys
        rejected by ``_object_matches_extension_filters``.
        """
        gcs = self._client()
        bucket_name = self._required_bucket()
        key_prefix = self._prefix()
        page_size = self._max_keys_per_page()
        request_timeout = self._request_timeout_seconds()

        listing = gcs.list_blobs(
            bucket_or_name=bucket_name,
            prefix=key_prefix or None,
            page_size=page_size,
            timeout=request_timeout,
        )
        for entry in listing:
            object_key = str(getattr(entry, "name", "") or "")
            if object_key == "" or object_key.endswith("/"):
                continue

            byte_count = int(getattr(entry, "size", 0) or 0)
            if byte_count == 0 and not self._include_empty_objects():
                continue
            if not self._object_matches_extension_filters(object_key):
                continue

            modified_at = self._parse_datetime(getattr(entry, "updated", None))
            # Missing or empty metadata is reported as None, not "".
            etag_text = str(getattr(entry, "etag", "") or "")
            mime_text = str(getattr(entry, "content_type", "") or "")
            yield ObjectRef(
                key=object_key,
                size=byte_count,
                last_modified=modified_at,
                etag=etag_text or None,
                content_type_hint=mime_text or None,
            )

    def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
        """Download the blob for *ref*, limited to ``_max_object_bytes()`` bytes.

        Returns:
            A ``(data, content_type_hint, truncated)`` tuple. ``truncated`` is
            true when ``ref.size`` exceeds the limit and only a byte range
            starting at offset 0 was requested.
        """
        blob = self._client().bucket(self._required_bucket()).blob(ref.key)

        limit = self._max_object_bytes()
        request_timeout = self._request_timeout_seconds()

        truncated = ref.size > limit
        if truncated:
            # `end` is the last byte offset to fetch (inclusive per the
            # google-cloud-storage docs), hence limit - 1.
            payload = blob.download_as_bytes(start=0, end=limit - 1, timeout=request_timeout)
        else:
            payload = blob.download_as_bytes(timeout=request_timeout)
        return payload, ref.content_type_hint, truncated

    def _external_url(self, key: str) -> str:
        """Return the ``gs://bucket/key`` URL for *key*."""
        bucket_name = self._required_bucket()
        return f"gs://{bucket_name}/{key}"
|