classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
src/utils/hashing.py ADDED
@@ -0,0 +1,82 @@
1
+ import base64
2
+ import hashlib
3
+ import json
4
+ from typing import Any
5
+ from urllib.parse import urljoin, urlsplit, urlunsplit
6
+
7
+
8
def hash_id(source_type: str, raw_id: str) -> str:
    """
    Encode a source type and raw ID into one URL-safe, unpadded base64 token.

    The two parts are combined as `<source_type>_#_<raw_id>` before encoding.
    Despite the name, this is a reversible encoding (useful when debugging),
    not a cryptographic hash.
    """
    payload = f"{source_type}_#_{raw_id}".encode()
    token = base64.urlsafe_b64encode(payload)
    # Trailing "=" padding is dropped to keep the token compact.
    return token.decode().rstrip("=")
16
+
17
+
18
def unhash_id(hashed_id: str) -> str:
    """
    Decode an unpadded URL-safe base64 ID back into the text it was built from.

    For IDs produced by `hash_id` the result is the full
    `<source_type>_#_<raw_id>` string, prefix included.
    """
    # Base64 input must be a multiple of 4 characters long; restore the "="
    # padding that was stripped when the ID was created.
    missing = -len(hashed_id) % 4
    padded = hashed_id + "=" * missing
    return base64.urlsafe_b64decode(padded.encode()).decode()
27
+
28
+
29
def calculate_checksum(data: dict[str, Any]) -> str:
    """
    Return the SHA-256 hex digest of a dictionary's JSON serialization.

    Keys are sorted during serialization so that insertion order does not
    affect the result. Values that JSON cannot represent natively are
    converted with `str`.
    """
    canonical = json.dumps(data, sort_keys=True, default=str)
    hasher = hashlib.sha256()
    hasher.update(canonical.encode("utf-8"))
    return hasher.hexdigest()
37
+
38
+
39
+ def normalize_http_url(url: str, *, base_url: str | None = None) -> str | None:
40
+ """
41
+ Normalize an HTTP(S) URL for stable hashing and deduplication.
42
+
43
+ - Resolves relative URLs against `base_url` when provided
44
+ - Rejects non-HTTP(S) schemes (mailto:, tel:, javascript:, data:, etc.)
45
+ - Removes URL fragments
46
+ """
47
+ candidate = (url or "").strip()
48
+ if not candidate:
49
+ return None
50
+
51
+ lowered = candidate.lower()
52
+ if lowered.startswith(("#", "javascript:", "mailto:", "tel:", "data:")):
53
+ return None
54
+
55
+ if base_url:
56
+ candidate = urljoin(f"{base_url.rstrip('/')}/", candidate)
57
+
58
+ parsed = urlsplit(candidate)
59
+ if parsed.scheme.lower() not in {"http", "https"} or not parsed.netloc:
60
+ return None
61
+
62
+ path = parsed.path or "/"
63
+ return urlunsplit(
64
+ (
65
+ parsed.scheme.lower(),
66
+ parsed.netloc.lower(),
67
+ path,
68
+ parsed.query,
69
+ "",
70
+ )
71
+ )
72
+
73
+
74
def hash_url(url: str, *, base_url: str | None = None) -> str:
    """
    Produce an opaque identifier for a URL.

    The URL is first normalized with `normalize_http_url` (optionally
    resolved against `base_url`); the result is the SHA-256 hex digest of
    that normalized form, prefixed with `url_sha256:`.

    Raises ValueError when normalization rejects the URL.
    """
    normalized = normalize_http_url(url, base_url=base_url)
    if normalized:
        hex_digest = hashlib.sha256(normalized.encode("utf-8")).hexdigest()
        return f"url_sha256:{hex_digest}"
    raise ValueError(f"Invalid URL for hash: {url}")
src/utils/uv_sync.py ADDED
@@ -0,0 +1,79 @@
1
+ """Shared uv-sync state so that every `uv sync` call includes ALL accumulated groups.
2
+
3
+ `uv sync --group X` removes packages that belong to other groups. When sources
4
+ and detectors each call `uv sync --group <their_group>` independently, the last
5
+ call uninstalls packages from earlier groups. This module keeps a global set of
6
+ requested groups and always passes them all to `uv sync`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import shutil
14
+ import subprocess
15
+ import sys
16
+ import threading
17
+
18
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Acquired by sync_group() before it consults or updates the caches below.
_lock = threading.Lock()
# Dependency groups already covered by a successful `uv sync`.
_synced_groups: set[str] = set()
# Group name -> error message recorded when syncing that group failed.
_failed_groups: dict[str, str] = {}
23
+
24
+
25
+ def _auto_install_enabled() -> bool:
26
+ value = os.environ.get("CLASSIFYRE_CLI_AUTO_INSTALL_OPTIONAL_DEPS", "1").strip().lower()
27
+ return value not in {"0", "false", "no"}
28
+
29
+
30
def _uv_command() -> list[str]:
    """Return the argv prefix used to invoke uv.

    Uses the `uv` executable found on PATH when there is one; otherwise runs
    uv as a module with the current Python interpreter.
    """
    located = shutil.which("uv")
    return [located] if located else [sys.executable, "-m", "uv"]
35
+
36
+
37
def sync_group(group: str) -> tuple[bool, str | None]:
    """Ensure *group* is installed, re-syncing with ALL previously requested groups.

    Runs `uv sync --frozen --no-dev` with one `--group` flag for every group
    that has already synced successfully plus *group*. Outcomes are cached in
    module state: a group that synced is not synced again, and a group that
    failed keeps returning the error recorded for it.

    The subprocess timeout is read from `CLASSIFYRE_UV_SYNC_TIMEOUT_SECONDS`
    (default 900). A value that is not an integer is ignored with a warning
    rather than raising, so the function always returns its result tuple.

    Returns:
        `(True, None)` on success, otherwise `(False, message)` where
        *message* describes the failure.
    """
    default_timeout = 900

    # The lock is held for the whole operation so concurrent callers cannot
    # start overlapping `uv sync` runs with different group sets.
    with _lock:
        if group in _synced_groups:
            return True, None
        if group in _failed_groups:
            return False, _failed_groups[group]

        all_groups = _synced_groups | {group}

        raw_timeout = os.environ.get("CLASSIFYRE_UV_SYNC_TIMEOUT_SECONDS", "900")
        try:
            timeout = int(raw_timeout)
        except ValueError:
            logger.warning(
                "Ignoring invalid CLASSIFYRE_UV_SYNC_TIMEOUT_SECONDS=%r; using %d seconds",
                raw_timeout,
                default_timeout,
            )
            timeout = default_timeout

        command = [*_uv_command(), "sync", "--frozen", "--no-dev"]
        # Sorted so the command line is deterministic for a given set of groups.
        for g in sorted(all_groups):
            command.extend(["--group", g])

        logger.info("Installing optional dependency group '%s'...", group)
        try:
            result = subprocess.run(
                command,
                check=False,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except Exception as exc:
            # Covers a missing executable and subprocess timeouts alike; the
            # failure is cached so later calls for this group return quickly.
            detail = f"Failed to execute uv sync for group '{group}': {exc}"
            _failed_groups[group] = detail
            logger.error(detail)
            return False, detail

        if result.returncode == 0:
            _synced_groups.update(all_groups)
            logger.info("Installed dependency group '%s'", group)
            return True, None

        # Prefer stderr, then stdout, for the failure explanation.
        detail = result.stderr.strip() or result.stdout.strip() or "Unknown uv sync error"
        message = f"uv sync failed for group '{group}': {detail}"
        _failed_groups[group] = message
        logger.error(message)
        return False, message
76
+
77
+
78
def auto_install_enabled() -> bool:
    """Public accessor that returns whatever `_auto_install_enabled()` reports."""
    enabled = _auto_install_enabled()
    return enabled
src/utils/validation.py ADDED
@@ -0,0 +1,56 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from jsonschema import validators
6
+
7
+ import schemas
8
+
9
+
10
def _load_schema(schema_filename: str) -> dict[str, Any]:
    """Load and parse a JSON schema file stored alongside the `schemas` module.

    Args:
        schema_filename: File name, resolved relative to the directory that
            contains `schemas.__file__`.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    schema_path = Path(schemas.__file__).parent / schema_filename

    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale default encoding.
    with open(schema_path, encoding="utf-8") as f:
        return json.load(f)
16
+
17
+
18
def _validate_schema(instance: dict[str, Any], schema: dict[str, Any]) -> None:
    """Validate *instance* against *schema* with jsonschema.

    The validator class is chosen by `validators.validator_for(schema)`. The
    schema itself is checked first, then the instance is validated; any
    exception raised by either step propagates to the caller.
    """
    cls = validators.validator_for(schema)
    cls.check_schema(schema)
    cls(schema).validate(instance)
23
+
24
+
25
def validate_input(data: dict[str, Any], schema_name: str = "") -> None:  # noqa: ARG001
    """
    Check input data against the unified `all_input_sources.json` schema.

    `schema_name` is accepted only for backward compatibility and has no
    effect. Validation errors propagate from `_validate_schema`.
    """
    _validate_schema(data, _load_schema("all_input_sources.json"))
32
+
33
+
34
def validate_output(data: dict[str, Any], schema_name: str = "") -> None:  # noqa: ARG001
    """
    Check output data against the unified `single_asset_scan_results.json` schema.

    `schema_name` is accepted only for backward compatibility and has no
    effect. Validation errors propagate from `_validate_schema`.
    """
    _validate_schema(data, _load_schema("single_asset_scan_results.json"))
41
+
42
+
43
def validate_test_connection(data: dict[str, Any]) -> None:
    """
    Check a test-connection result against an inline schema.

    The payload must be an object with a required string `status` that is
    either "SUCCESS" or "FAILURE"; `message`, when present, must be a string.
    Validation errors propagate from `_validate_schema`.
    """
    status_rule = {"type": "string", "enum": ["SUCCESS", "FAILURE"]}
    message_rule = {"type": "string"}
    schema: dict[str, Any] = {
        "type": "object",
        "required": ["status"],
        "properties": {"status": status_rule, "message": message_rule},
    }
    _validate_schema(data, schema)