classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
src/utils/hashing.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
from typing import Any
|
|
5
|
+
from urllib.parse import urljoin, urlsplit, urlunsplit
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def hash_id(source_type: str, raw_id: str) -> str:
    """
    Build a reversible, URL-safe identifier from a source type and a raw ID.

    The two parts are joined as ``<source_type>_#_<raw_id>`` and encoded with
    URL-safe base64; trailing ``=`` padding is dropped. Despite the name this
    is an encoding (kept reversible for debugging), not a cryptographic hash.
    """
    payload = f"{source_type}_#_{raw_id}".encode()
    encoded = base64.urlsafe_b64encode(payload).decode()
    return encoded.rstrip("=")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def unhash_id(hashed_id: str) -> str:
    """
    Decode a URL-safe base64 ID back to its raw text.

    Trailing ``=`` padding may have been stripped from the input; whatever is
    needed to reach a multiple of four characters is restored before decoding.
    """
    missing = -len(hashed_id) % 4
    padded = hashed_id + "=" * missing
    return base64.urlsafe_b64decode(padded.encode()).decode()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def calculate_checksum(data: dict[str, Any]) -> str:
    """
    Return the hex SHA-256 digest of a dictionary's JSON serialization.

    Keys are sorted during serialization so that insertion order does not
    affect the result. Values that JSON cannot encode natively are converted
    with ``str``.
    """
    serialized = json.dumps(data, sort_keys=True, default=str)
    hasher = hashlib.sha256()
    hasher.update(serialized.encode("utf-8"))
    return hasher.hexdigest()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def normalize_http_url(url: str, *, base_url: str | None = None) -> str | None:
|
|
40
|
+
"""
|
|
41
|
+
Normalize an HTTP(S) URL for stable hashing and deduplication.
|
|
42
|
+
|
|
43
|
+
- Resolves relative URLs against `base_url` when provided
|
|
44
|
+
- Rejects non-HTTP(S) schemes (mailto:, tel:, javascript:, data:, etc.)
|
|
45
|
+
- Removes URL fragments
|
|
46
|
+
"""
|
|
47
|
+
candidate = (url or "").strip()
|
|
48
|
+
if not candidate:
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
lowered = candidate.lower()
|
|
52
|
+
if lowered.startswith(("#", "javascript:", "mailto:", "tel:", "data:")):
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
if base_url:
|
|
56
|
+
candidate = urljoin(f"{base_url.rstrip('/')}/", candidate)
|
|
57
|
+
|
|
58
|
+
parsed = urlsplit(candidate)
|
|
59
|
+
if parsed.scheme.lower() not in {"http", "https"} or not parsed.netloc:
|
|
60
|
+
return None
|
|
61
|
+
|
|
62
|
+
path = parsed.path or "/"
|
|
63
|
+
return urlunsplit(
|
|
64
|
+
(
|
|
65
|
+
parsed.scheme.lower(),
|
|
66
|
+
parsed.netloc.lower(),
|
|
67
|
+
path,
|
|
68
|
+
parsed.query,
|
|
69
|
+
"",
|
|
70
|
+
)
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def hash_url(url: str, *, base_url: str | None = None) -> str:
    """
    Return an opaque ID for a URL: ``url_sha256:`` followed by the hex SHA-256
    digest of the normalized absolute URL.

    Raises:
        ValueError: if `normalize_http_url` yields no usable URL for the input.
    """
    canonical = normalize_http_url(url, base_url=base_url)
    if canonical:
        hexdigest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
        return f"url_sha256:{hexdigest}"
    raise ValueError(f"Invalid URL for hash: {url}")
|
src/utils/uv_sync.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Shared uv-sync state so that every `uv sync` call includes ALL accumulated groups.
|
|
2
|
+
|
|
3
|
+
`uv sync --group X` removes packages that belong to other groups. When sources
|
|
4
|
+
and detectors each call `uv sync --group <their_group>` independently, the last
|
|
5
|
+
call uninstalls packages from earlier groups. This module keeps a global set of
|
|
6
|
+
requested groups and always passes them all to `uv sync`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import shutil
|
|
14
|
+
import subprocess
|
|
15
|
+
import sys
|
|
16
|
+
import threading
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
_lock = threading.Lock()
|
|
21
|
+
_synced_groups: set[str] = set()
|
|
22
|
+
_failed_groups: dict[str, str] = {}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _auto_install_enabled() -> bool:
    """Report whether automatic installation of optional dependencies is enabled.

    Reads ``CLASSIFYRE_CLI_AUTO_INSTALL_OPTIONAL_DEPS`` (default ``"1"``). Only
    the values ``0``, ``false`` and ``no`` disable it; the comparison ignores
    case and surrounding whitespace.
    """
    raw = os.environ.get("CLASSIFYRE_CLI_AUTO_INSTALL_OPTIONAL_DEPS", "1")
    normalized = raw.strip().lower()
    disabled = normalized in {"0", "false", "no"}
    return not disabled
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _uv_command() -> list[str]:
    """Return the argv prefix used to invoke uv.

    Uses the ``uv`` executable found on PATH when there is one; otherwise
    falls back to running uv as a module of the current interpreter.
    """
    located = shutil.which("uv")
    return [located] if located else [sys.executable, "-m", "uv"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def sync_group(group: str) -> tuple[bool, str | None]:
    """Ensure *group* is installed, re-syncing with ALL previously requested groups.

    Runs ``uv sync --frozen --no-dev`` with one ``--group`` flag for every
    group that has synced successfully so far plus *group*. Outcomes are
    remembered in module state: a group that already synced is not synced
    again, and a group that failed keeps returning its recorded error.

    The timeout in seconds is read from ``CLASSIFYRE_UV_SYNC_TIMEOUT_SECONDS``
    (default 900). A value that is not an integer is ignored with a warning
    instead of raising out of this function.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, message)`` where
        *message* describes the failure.
    """
    # The lock is held for the entire sync, so concurrent callers run one
    # `uv sync` at a time and always see a consistent set of groups.
    with _lock:
        if group in _synced_groups:
            return True, None
        if group in _failed_groups:
            return False, _failed_groups[group]

        all_groups = _synced_groups | {group}

        raw_timeout = os.environ.get("CLASSIFYRE_UV_SYNC_TIMEOUT_SECONDS", "900")
        try:
            timeout = int(raw_timeout)
        except ValueError:
            logger.warning(
                "Ignoring invalid CLASSIFYRE_UV_SYNC_TIMEOUT_SECONDS=%r; using 900 seconds",
                raw_timeout,
            )
            timeout = 900

        command = [*_uv_command(), "sync", "--frozen", "--no-dev"]
        # Sorted so the command line is deterministic for a given set of groups.
        for g in sorted(all_groups):
            command.extend(["--group", g])

        logger.info("Installing optional dependency group '%s'...", group)
        try:
            result = subprocess.run(
                command,
                check=False,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except Exception as exc:
            # Best-effort boundary: a missing binary, a timeout or any other
            # launch problem is reported to the caller rather than raised.
            detail = f"Failed to execute uv sync for group '{group}': {exc}"
            _failed_groups[group] = detail
            logger.error(detail)
            return False, detail

        if result.returncode == 0:
            _synced_groups.update(all_groups)
            logger.info("Installed dependency group '%s'", group)
            return True, None

        detail = result.stderr.strip() or result.stdout.strip() or "Unknown uv sync error"
        message = f"uv sync failed for group '{group}': {detail}"
        _failed_groups[group] = message
        logger.error(message)
        return False, message
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def auto_install_enabled() -> bool:
    """Public accessor for the auto-install setting; delegates to `_auto_install_enabled`."""
    enabled = _auto_install_enabled()
    return enabled
|
src/utils/validation.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from jsonschema import validators
|
|
6
|
+
|
|
7
|
+
import schemas
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _load_schema(schema_filename: str) -> dict[str, Any]:
    """Load and parse a JSON schema file stored next to the ``schemas`` module.

    Args:
        schema_filename: File name, resolved against the directory that
            contains ``schemas.__file__``.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    schema_dir = Path(schemas.__file__).parent
    schema_path = schema_dir / schema_filename

    # Pin UTF-8: without an explicit encoding, open() uses the platform's
    # locale encoding, which can mis-decode non-ASCII schema text.
    with open(schema_path, encoding="utf-8") as f:
        return json.load(f)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _validate_schema(instance: dict[str, Any], schema: dict[str, Any]) -> None:
    """Validate *instance* against *schema*.

    The validator class is the one jsonschema selects for the schema. The
    schema itself is checked before the instance is validated. Errors raised
    by jsonschema propagate to the caller.
    """
    cls = validators.validator_for(schema)
    cls.check_schema(schema)
    cls(schema).validate(instance)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def validate_input(data: dict[str, Any], schema_name: str = "") -> None:  # noqa: ARG001
    """
    Check input data against the unified ``all_input_sources.json`` schema.

    ``schema_name`` is accepted only for backward compatibility and has no
    effect on which schema is used.
    """
    _validate_schema(data, _load_schema("all_input_sources.json"))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def validate_output(data: dict[str, Any], schema_name: str = "") -> None:  # noqa: ARG001
    """
    Check output data against the unified ``single_asset_scan_results.json`` schema.

    ``schema_name`` is accepted only for backward compatibility and has no
    effect on which schema is used.
    """
    _validate_schema(data, _load_schema("single_asset_scan_results.json"))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def validate_test_connection(data: dict[str, Any]) -> None:
    """
    Check a test-connection result against an inline schema.

    The result must be an object with a required ``status`` of ``SUCCESS`` or
    ``FAILURE``; ``message``, when present, must be a string.
    """
    status_schema = {"type": "string", "enum": ["SUCCESS", "FAILURE"]}
    message_schema = {"type": "string"}
    schema: dict[str, Any] = {
        "type": "object",
        "required": ["status"],
        "properties": {"status": status_schema, "message": message_schema},
    }
    _validate_schema(data, schema)
|