classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,81 @@
1
+ """Helpers for optional source dependencies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib
6
+ import logging
7
+ from types import ModuleType
8
+
9
+ from src.utils.uv_sync import auto_install_enabled, sync_group
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class MissingSourceDependencyError(RuntimeError):
    """Raised when an optional source dependency is unavailable.

    The exception message names the source, lists the missing packages and,
    when at least one uv group is given, the ``uv sync --group`` command(s)
    that install them. The constructor arguments are also stored unchanged as
    attributes so callers can inspect them programmatically.
    """

    def __init__(
        self,
        source_name: str,
        dependencies: list[str],
        uv_groups: list[str],
        detail: str | None = None,
    ) -> None:
        """Build the error message and keep the arguments as attributes.

        Args:
            source_name: Name of the source, used as the message prefix.
            dependencies: Package names shown in the message.
            uv_groups: uv dependency groups offered as install commands.
            detail: Optional extra text appended to the end of the message.
        """
        self.source_name = source_name
        self.dependencies = dependencies
        self.uv_groups = uv_groups
        self.detail = detail

        deps = ", ".join(dependencies)
        message = f"{source_name} source requires optional dependencies ({deps})."
        # Only give install guidance when there is a group to point at; an
        # empty list would otherwise render as the dangling "Install with .".
        if uv_groups:
            group_hint = " or ".join(f"`uv sync --group {group}`" for group in uv_groups)
            message = f"{message} Install with {group_hint}."
        if detail:
            message = f"{message} {detail}"
        super().__init__(message)
38
+
39
+
40
+ def _ordered_groups(groups: list[str]) -> list[str]:
41
+ return sorted(dict.fromkeys(groups))
42
+
43
+
44
def require_module(
    module_name: str,
    source_name: str,
    uv_groups: list[str],
    detail: str | None = None,
) -> ModuleType:
    """Import a module or raise MissingSourceDependencyError with uv guidance.

    If the first import fails, ``auto_install_enabled()`` is true and
    ``uv_groups`` is non-empty, each distinct group is passed to
    ``sync_group`` in turn and the import is retried after every sync that
    reports success. The first successful retry returns the module.

    Args:
        module_name: Dotted name of the module to import.
        source_name: Source name forwarded to the raised error.
        uv_groups: uv dependency groups to try installing and to suggest.
        detail: Optional lead-in sentence for the error detail; a generic
            sentence is used when omitted.

    Returns:
        The imported module.

    Raises:
        MissingSourceDependencyError: If the module cannot be imported. The
            error is chained to the original import failure and its detail
            lists that failure plus any install/retry messages collected.
    """
    try:
        return importlib.import_module(module_name)
    # NOTE(review): the broad catch looks deliberate (imports of optional
    # packages can fail with more than ImportError) - confirm before narrowing.
    except Exception as exc:  # pragma: no cover - environment dependent
        # Always holds at least the original failure, so it is never empty.
        detail_messages: list[str] = [f"Original error: {exc}"]

        if auto_install_enabled() and uv_groups:
            for group in _ordered_groups(uv_groups):
                success, install_detail = sync_group(group)
                if install_detail:
                    detail_messages.append(install_detail)
                if not success:
                    continue

                try:
                    # Drop stale finder caches so packages installed by the
                    # sync above become importable in this process.
                    importlib.invalidate_caches()
                    return importlib.import_module(module_name)
                except Exception as retry_exc:  # pragma: no cover
                    detail_messages.append(
                        f"Module '{module_name}' still unavailable after installing '{group}': {retry_exc}"
                    )

        # Callers commonly pass a full sentence ending in "."; strip it so the
        # separator added below does not produce a doubled "..".
        base_detail = (detail or "").rstrip().rstrip(".") or "Optional dependency import failed"
        error_detail = f"{base_detail}. {'; '.join(detail_messages)}"
        raise MissingSourceDependencyError(
            source_name=source_name,
            dependencies=[module_name.split(".", maxsplit=1)[0]],
            uv_groups=uv_groups,
            detail=error_detail,
        ) from exc
@@ -0,0 +1,3 @@
1
+ from .source import GoogleCloudStorageSource
2
+
3
+ __all__ = ["GoogleCloudStorageSource"]
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections.abc import Iterator
5
+ from typing import Any
6
+
7
+ from ...models.generated_input import GoogleCloudStorageInput
8
+ from ..dependencies import require_module
9
+ from ..object_storage.base import ObjectRef, ObjectStorageSourceBase
10
+
11
+
12
class GoogleCloudStorageSource(ObjectStorageSourceBase):
    """Object-storage source that reads from a Google Cloud Storage bucket.

    Provides the client construction, listing, download and URL methods on
    top of ``google.cloud.storage``, which is imported lazily through
    ``require_module`` so the dependency stays optional.
    """

    # NOTE(review): presumably read by ObjectStorageSourceBase / the source
    # registry to identify this source and validate its input - confirm.
    source_type = "google_cloud_storage"
    provider_label = "GOOGLE_CLOUD_STORAGE"
    input_model = GoogleCloudStorageInput

    def _required_bucket(self) -> str:
        """Return the configured bucket name with surrounding whitespace removed.

        Raises:
            ValueError: If ``required.bucket`` is missing or blank.
        """
        raw_bucket = self.config.required.bucket
        # str(None) is the truthy string "None"; treat a missing value as
        # blank so it is rejected instead of being used as a bucket name.
        bucket = "" if raw_bucket is None else str(raw_bucket).strip()
        if not bucket:
            raise ValueError("required.bucket must be set")
        return bucket

    def _require_service_account_module(self, detail: str) -> Any:
        """Import ``google.oauth2.service_account`` with the given error detail."""
        return require_module(
            module_name="google.oauth2.service_account",
            source_name="Google Cloud Storage",
            uv_groups=["google-cloud-storage"],
            detail=detail,
        )

    def _build_client(self) -> Any:
        """Create a ``google.cloud.storage`` client from the connection options.

        Inline JSON credentials take precedence over a credentials file; when
        neither is configured the client is created without explicit
        credentials.
        """
        storage_module = require_module(
            module_name="google.cloud.storage",
            source_name="Google Cloud Storage",
            uv_groups=["google-cloud-storage"],
            detail="Google Cloud Storage requires google-cloud-storage.",
        )

        project = self._string_or_none(self._connection_option("project_id"))
        credentials_json = self._masked_value("gcp_credentials_json")
        credentials_file = self._string_or_none(self._connection_option("gcp_credentials_file"))

        if credentials_json:
            service_account_module = self._require_service_account_module(
                "Inline service account credentials require google-auth support."
            )
            credentials = service_account_module.Credentials.from_service_account_info(
                json.loads(credentials_json)
            )
            return storage_module.Client(project=project, credentials=credentials)

        if credentials_file:
            service_account_module = self._require_service_account_module(
                "File-based service account credentials require google-auth support."
            )
            credentials = service_account_module.Credentials.from_service_account_file(
                credentials_file
            )
            return storage_module.Client(project=project, credentials=credentials)

        return storage_module.Client(project=project)

    def _client(self) -> Any:
        """Return the storage client, building it on first use.

        NOTE(review): assumes ``_cached_client`` is initialised to ``None``
        outside this class (it is not set here) - confirm in the base class.
        """
        if self._cached_client is None:
            self._cached_client = self._build_client()
        return self._cached_client

    def _list_objects(self) -> Iterator[ObjectRef]:
        """Yield an ``ObjectRef`` for each blob in the bucket that passes the filters.

        Skipped entries: blobs without a name, names ending in "/", zero-size
        blobs unless ``_include_empty_objects()`` is true, and keys rejected
        by ``_object_matches_extension_filters``.
        """
        client = self._client()
        bucket = self._required_bucket()
        prefix = self._prefix()
        max_keys = self._max_keys_per_page()
        timeout = self._request_timeout_seconds()

        blobs = client.list_blobs(
            bucket_or_name=bucket,
            prefix=prefix or None,  # empty prefix is sent as "no prefix"
            page_size=max_keys,
            timeout=timeout,
        )
        for blob in blobs:
            key = str(getattr(blob, "name", "") or "")
            if not key or key.endswith("/"):
                continue

            # A missing or None size is treated as 0.
            size = int(getattr(blob, "size", 0) or 0)
            if size == 0 and not self._include_empty_objects():
                continue
            if not self._object_matches_extension_filters(key):
                continue

            yield ObjectRef(
                key=key,
                size=size,
                last_modified=self._parse_datetime(getattr(blob, "updated", None)),
                # Empty/missing metadata is normalised to None.
                etag=str(getattr(blob, "etag", "") or "") or None,
                content_type_hint=str(getattr(blob, "content_type", "") or "") or None,
            )

    def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
        """Download the object's bytes, capped at ``_max_object_bytes()``.

        Returns:
            A tuple of the downloaded bytes, the ref's content type hint, and
            a flag that is True when ``ref.size`` exceeded the limit and only
            the byte range ``0 .. max_bytes - 1`` was requested.
        """
        client = self._client()
        bucket = client.bucket(self._required_bucket())
        blob = bucket.blob(ref.key)

        max_bytes = self._max_object_bytes()
        timeout = self._request_timeout_seconds()

        if ref.size > max_bytes:
            file_bytes = blob.download_as_bytes(start=0, end=max_bytes - 1, timeout=timeout)
            return file_bytes, ref.content_type_hint, True

        file_bytes = blob.download_as_bytes(timeout=timeout)
        return file_bytes, ref.content_type_hint, False

    def _external_url(self, key: str) -> str:
        """Return the ``gs://bucket/key`` URL for the given object key."""
        return f"gs://{self._required_bucket()}/{key}"
@@ -0,0 +1,3 @@
1
+ from .source import HiveSource
2
+
3
+ __all__ = ["HiveSource"]