classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,179 @@
1
+ from __future__ import annotations
2
+
3
+ from copy import deepcopy
4
+ from typing import Any
5
+
6
+ _VALID_SAMPLING_STRATEGIES = {"RANDOM", "LATEST", "ALL"}
7
+
8
+
9
+ def _as_dict(value: Any) -> dict[str, Any]:
10
+ if isinstance(value, dict):
11
+ return value
12
+ return {}
13
+
14
+
15
+ def _as_positive_int(value: Any) -> int | None:
16
+ try:
17
+ if value is None:
18
+ return None
19
+ parsed = int(value)
20
+ except (TypeError, ValueError):
21
+ return None
22
+ return parsed if parsed > 0 else None
23
+
24
+
25
def _normalize_sampling_strategy(value: Any) -> str | None:
    """Return the trimmed, upper-cased strategy name if it is a known one.

    Non-string input and names that are not in ``_VALID_SAMPLING_STRATEGIES``
    both yield ``None``.
    """
    if isinstance(value, str):
        candidate = value.strip().upper()
        if candidate in _VALID_SAMPLING_STRATEGIES:
            return candidate
    return None
30
+
31
+
32
+ def _as_bool(value: Any) -> bool | None:
33
+ if isinstance(value, bool):
34
+ return value
35
+ return None
36
+
37
+
38
+ def _pick(*values: Any) -> Any:
39
+ for value in values:
40
+ if value is not None:
41
+ return value
42
+ return None
43
+
44
+
45
def _normalize_object_storage_shape(
    normalized: dict[str, Any],
    source_type_value: str,
) -> None:
    """Move flat object-storage settings into the nested recipe layout, in place.

    Top-level keys are relocated into ``required``, ``optional.connection`` or
    ``optional.scope``. A value already present at the nested location wins
    over the flat one (``setdefault``), and the flat key is removed either way.

    Args:
        normalized: Recipe dict to rewrite; it is mutated directly.
        source_type_value: Upper-cased source type, used to select the
            provider-specific keys to relocate.
    """
    required = _as_dict(normalized.get("required"))
    optional = _as_dict(normalized.get("optional"))
    connection = _as_dict(optional.get("connection"))
    scope = _as_dict(optional.get("scope"))

    def relocate(names: tuple[str, ...], target: dict[str, Any]) -> None:
        # A flat key whose value is None is left untouched at the top level.
        for name in names:
            if normalized.get(name) is not None:
                target.setdefault(name, normalized.pop(name))

    def claim_required(name: str) -> None:
        # The flat key is always removed; only a non-None value is kept.
        found = normalized.pop(name, None)
        if found is not None:
            required.setdefault(name, found)

    relocate(
        (
            "request_timeout_seconds",
            "max_keys_per_page",
            "max_object_bytes",
        ),
        connection,
    )
    relocate(
        (
            "prefix",
            "include_extensions",
            "exclude_extensions",
            "include_empty_objects",
            "include_object_metadata",
            "include_content_preview",
        ),
        scope,
    )

    if source_type_value == "S3_COMPATIBLE_STORAGE":
        claim_required("bucket")
        relocate(("endpoint_url", "region_name", "verify_ssl"), connection)
    elif source_type_value == "AZURE_BLOB_STORAGE":
        claim_required("account_url")
        claim_required("container")
    elif source_type_value == "GOOGLE_CLOUD_STORAGE":
        claim_required("bucket")
        relocate(("project_id", "gcp_credentials_file"), connection)

    # "provider" is dropped from the required section unconditionally.
    required.pop("provider", None)
    normalized["required"] = required

    # Nested sections are written back only when they ended up non-empty.
    if connection:
        optional["connection"] = connection
    if scope:
        optional["scope"] = scope
    if optional:
        normalized["optional"] = optional
113
+
114
+
115
def normalize_source_recipe(
    recipe: dict[str, Any],
    source_type: str | None = None,
) -> dict[str, Any]:
    """Return a normalized deep copy of ``recipe``; the input is not mutated.

    Normalization performed:
      * ``type`` is upper-cased, with ``source_type`` taking precedence over
        the recipe's own ``type`` when given.
      * Sampling settings are consolidated under the top-level ``sampling``
        key, falling back to ``optional.sampling`` values; the strategy
        defaults to ``"RANDOM"`` when no valid one is found.
      * Removed sampling fields are stripped.
      * Source-specific flat keys are moved into their nested sections.

    Args:
        recipe: Source recipe as a plain dict.
        source_type: Optional explicit source type overriding ``recipe["type"]``.

    Returns:
        The normalized recipe dict.
    """
    normalized = deepcopy(recipe)

    source_type_value = str(source_type or normalized.get("type") or "").upper()
    if source_type_value:
        normalized["type"] = source_type_value

    optional = _as_dict(normalized.get("optional"))
    legacy_sampling = _as_dict(optional.get("sampling"))
    sampling = _as_dict(normalized.get("sampling"))

    # Precedence: top-level strategy, then optional.sampling strategy/mode,
    # then the RANDOM default.
    sampling["strategy"] = _pick(
        _normalize_sampling_strategy(sampling.get("strategy")),
        _normalize_sampling_strategy(legacy_sampling.get("strategy")),
        _normalize_sampling_strategy(legacy_sampling.get("mode")),
        "RANDOM",
    )

    # Strip removed fields so legacy recipes with limit/max_columns don't fail validation
    for removed in ("limit", "max_columns"):
        sampling.pop(removed, None)

    # Values already set at the top level win over optional.sampling.
    for key in (
        "order_by_column",
        "fallback_to_random",
        "rows_per_page",
        "include_column_names",
    ):
        if key in legacy_sampling:
            sampling.setdefault(key, legacy_sampling[key])

    sampling.pop("fetch_all_until_first_success", None)
    normalized["sampling"] = sampling

    optional.pop("sampling", None)
    if optional:
        normalized["optional"] = optional

    # Sources that accept one flat string key which belongs under "required".
    flat_required_key = {"WORDPRESS": "url", "SLACK": "workspace"}.get(
        source_type_value
    )
    if flat_required_key is not None:
        required = _as_dict(normalized.get("required"))
        if isinstance(normalized.get(flat_required_key), str):
            required.setdefault(
                flat_required_key, normalized.pop(flat_required_key)
            )
        normalized["required"] = required

    if source_type_value == "WORDPRESS":
        # setdefault only fills in a missing "masked" key; an existing value
        # (of any type) is left as it is.
        normalized.setdefault("masked", _as_dict(normalized.get("masked")))

    if source_type_value in {
        "S3_COMPATIBLE_STORAGE",
        "AZURE_BLOB_STORAGE",
        "GOOGLE_CLOUD_STORAGE",
    }:
        _normalize_object_storage_shape(normalized, source_type_value)

    return normalized
@@ -0,0 +1,66 @@
1
+ # S3-Compatible Storage Source
2
+
3
+ This source uses the `S3_COMPATIBLE_STORAGE` schema and reads credentials from `config.masked`.
4
+
5
+ ## Schema Field Mapping
6
+
7
+ Use these schema paths when configuring any S3-compatible provider:
8
+
9
+ - `config.required.bucket`: Bucket name (for example `testclassifyrebucket`)
10
+ - `config.masked.aws_access_key_id`: Access key ID (not key name)
11
+ - `config.masked.aws_secret_access_key`: Secret access key
12
+ - `config.masked.aws_session_token`: Optional temporary session token
13
+ - `config.optional.connection.endpoint_url`: Provider S3 endpoint URL
14
+ - `config.optional.connection.region_name`: Region used for SigV4 signing
15
+ - `config.optional.connection.verify_ssl`: TLS verification toggle (default `true`)
16
+ - `config.optional.connection.request_timeout_seconds`: Network timeout for list/download calls
17
+ - `config.optional.connection.max_keys_per_page`: Max objects per list page
18
+ - `config.optional.connection.max_object_bytes`: Max bytes downloaded per object for preview/extraction
19
+ - `config.optional.scope.prefix`: Optional prefix filter
20
+ - `config.optional.scope.include_extensions`: Optional include extension filter
21
+ - `config.optional.scope.exclude_extensions`: Optional exclude extension filter
22
+ - `config.optional.scope.include_empty_objects`: Include zero-byte objects
23
+ - `config.optional.scope.include_object_metadata`: Include object metadata
24
+ - `config.optional.scope.include_content_preview`: Download content for MIME/text preview
25
+ - `config.sampling.strategy`: `RANDOM`, `LATEST`, or `ALL`
26
+ - `config.sampling.rows_per_page`: Item limit per sample run (default 100, ignored when strategy is `ALL`)
27
+
28
+ ## Backblaze B2 Example
29
+
30
+ ```json
31
+ {
32
+ "type": "S3_COMPATIBLE_STORAGE",
33
+ "required": {
34
+ "bucket": "testclassifyrebucket"
35
+ },
36
+ "masked": {
37
+ "aws_access_key_id": "<your-access-key-id>",
38
+ "aws_secret_access_key": "<your-secret-access-key>"
39
+ },
40
+ "optional": {
41
+ "connection": {
42
+ "endpoint_url": "https://s3.us-west-002.backblazeb2.com",
43
+ "region_name": "us-west-002",
44
+ "verify_ssl": true,
45
+ "request_timeout_seconds": 30
46
+ },
47
+ "scope": {
48
+ "include_empty_objects": false,
49
+ "include_object_metadata": true,
50
+ "include_content_preview": true
51
+ }
52
+ },
53
+ "sampling": {
54
+ "strategy": "LATEST"
55
+ }
56
+ }
57
+ ```
58
+
59
+ ## Common Misconfigurations
60
+
61
+ - `InvalidRequest: The Credential is malformed`:
62
+ - `aws_access_key_id` is wrong (often the secret key was pasted into this field by mistake).
63
+ - `InvalidAccessKeyId`:
64
+ - Access key ID does not exist for this B2 application key.
65
+ - `SUCCESS` but `Listed 0 object(s)`:
66
+ - `optional.scope.prefix` does not match actual key paths, or extension filters exclude files.
@@ -0,0 +1,3 @@
1
+ from .source import S3CompatibleStorageSource
2
+
3
+ __all__ = ["S3CompatibleStorageSource"]
@@ -0,0 +1,150 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from collections.abc import Iterator
5
+ from typing import Any
6
+ from urllib.parse import quote
7
+
8
+ from ...models.generated_input import S3CompatibleStorageInput
9
+ from ..dependencies import require_module
10
+ from ..object_storage.base import ObjectRef, ObjectStorageSourceBase
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class S3CompatibleStorageSource(ObjectStorageSourceBase):
    """Object-storage source that talks to an S3-compatible endpoint via boto3.

    Helpers used below but not defined in this class (``_connection_option``,
    ``_masked_value``, ``_string_or_none``, ``_verify_ssl``, ``_prefix``,
    ``_max_keys_per_page``, ``_max_object_bytes``, ``_parse_datetime``, ...)
    and the ``_cached_client`` attribute are presumably provided by
    ``ObjectStorageSourceBase`` -- verify against the base class.
    """

    source_type = "s3_compatible_storage"
    provider_label = "S3_COMPATIBLE_STORAGE"
    input_model = S3CompatibleStorageInput

    def _required_bucket(self) -> str:
        """Return the configured bucket name with surrounding whitespace removed.

        Raises:
            ValueError: If ``required.bucket`` is ``None``, empty or blank.
        """
        raw_bucket = self.config.required.bucket
        # str(None) would give the truthy string "None" and slip past the
        # emptiness check, so a missing bucket is handled explicitly.
        bucket = "" if raw_bucket is None else str(raw_bucket).strip()
        if not bucket:
            raise ValueError("required.bucket must be set")
        return bucket

    def _build_client(self) -> Any:
        """Create a boto3 ``s3`` client from connection options and credentials.

        Region, endpoint and credentials are passed only when configured.
        Explicit credentials require both the access key ID and the secret
        key; the session token is forwarded only alongside them. Timeout
        configuration is best-effort: if it cannot be built, the client is
        created without it.
        """
        boto3 = require_module(
            module_name="boto3",
            source_name="S3 Compatible Storage",
            uv_groups=["s3-compatible-storage"],
            detail="S3-compatible storage requires boto3.",
        )

        kwargs: dict[str, Any] = {}
        region_name = self._string_or_none(self._connection_option("region_name"))
        endpoint_url = self._string_or_none(self._connection_option("endpoint_url"))
        aws_access_key_id = self._masked_value("aws_access_key_id")
        aws_secret_access_key = self._masked_value("aws_secret_access_key")
        aws_session_token = self._masked_value("aws_session_token")

        if region_name:
            kwargs["region_name"] = region_name
        if endpoint_url:
            kwargs["endpoint_url"] = endpoint_url
        if aws_access_key_id and aws_secret_access_key:
            kwargs["aws_access_key_id"] = aws_access_key_id
            kwargs["aws_secret_access_key"] = aws_secret_access_key
            if aws_session_token:
                kwargs["aws_session_token"] = aws_session_token

        kwargs["verify"] = self._verify_ssl()

        try:
            botocore_config = require_module(
                module_name="botocore.config",
                source_name="S3 Compatible Storage",
                uv_groups=["s3-compatible-storage"],
                detail="S3-compatible storage uses botocore timeout configuration.",
            )
            timeout = int(self._request_timeout_seconds())
            kwargs["config"] = botocore_config.Config(
                connect_timeout=timeout,
                read_timeout=timeout,
            )
        except Exception:
            # Deliberately best-effort: fall back to the client's default
            # timeouts, but keep the cause visible at debug level.
            logger.debug(
                "Could not initialize botocore timeout configuration; using defaults",
                exc_info=True,
            )

        return boto3.client("s3", **kwargs)

    def _client(self) -> Any:
        """Return the cached client, building it on first use."""
        if self._cached_client is None:
            self._cached_client = self._build_client()
        return self._cached_client

    def _list_objects(self) -> Iterator[ObjectRef]:
        """Yield an ``ObjectRef`` for each listed object that passes the filters.

        Pages through ``list_objects_v2`` using continuation tokens. Skipped
        entries: empty keys, keys ending in ``/``, zero-byte objects (unless
        empty objects are included) and keys rejected by the extension filters.
        """
        client = self._client()
        bucket = self._required_bucket()
        prefix = self._prefix()
        max_keys = self._max_keys_per_page()

        continuation_token: str | None = None

        while True:
            params: dict[str, Any] = {
                "Bucket": bucket,
                "MaxKeys": max_keys,
            }
            if prefix:
                params["Prefix"] = prefix
            if continuation_token:
                params["ContinuationToken"] = continuation_token

            response = client.list_objects_v2(**params)
            for item in response.get("Contents", []) or []:
                key = str(item.get("Key") or "")
                if not key or key.endswith("/"):
                    continue

                size = int(item.get("Size") or 0)
                if size == 0 and not self._include_empty_objects():
                    continue
                if not self._object_matches_extension_filters(key):
                    continue

                yield ObjectRef(
                    key=key,
                    size=size,
                    last_modified=self._parse_datetime(item.get("LastModified")),
                    # ETag values arrive wrapped in double quotes; strip them.
                    etag=str(item.get("ETag")).strip('"') if item.get("ETag") else None,
                )

            if not response.get("IsTruncated"):
                break
            continuation_token = response.get("NextContinuationToken")
            if not continuation_token:
                # Truncated response without a token: stop instead of
                # requesting the same page forever.
                break

    def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
        """Download an object's content, capped at the configured byte limit.

        Args:
            ref: Reference to the object to fetch; ``key`` and ``size`` are used.

        Returns:
            ``(content, content_type, truncated)`` where ``content_type`` is
            ``None`` if the response carries none and ``truncated`` is ``True``
            when only the first ``max_bytes`` bytes were requested.
        """
        client = self._client()
        bucket = self._required_bucket()
        max_bytes = self._max_object_bytes()

        params: dict[str, Any] = {"Bucket": bucket, "Key": ref.key}
        truncated = False
        if ref.size > max_bytes:
            # Range end is inclusive, hence max_bytes - 1.
            params["Range"] = f"bytes=0-{max_bytes - 1}"
            truncated = True

        response = client.get_object(**params)
        body = response["Body"]
        try:
            file_bytes = body.read()
        finally:
            try:
                body.close()
            except Exception:
                # A failed close must not mask the read result or its error.
                logger.debug("Failed to close S3 response body")

        content_type = response.get("ContentType")
        return file_bytes, str(content_type) if content_type else None, truncated

    def _external_url(self, key: str) -> str:
        """Return a URL-like locator for ``key``.

        With a configured endpoint this is a path-style, percent-encoded
        ``{endpoint}/{bucket}/{key}`` URL (slashes in the key are preserved);
        otherwise an unencoded ``s3://{bucket}/{key}`` URI.
        """
        bucket = self._required_bucket()
        endpoint_url = self._string_or_none(self._connection_option("endpoint_url"))
        if endpoint_url:
            endpoint = endpoint_url.rstrip("/")
            encoded_bucket = quote(bucket, safe="")
            encoded_key = quote(key, safe="/")
            return f"{endpoint}/{encoded_bucket}/{encoded_key}"
        return f"s3://{bucket}/{key}"
@@ -0,0 +1,3 @@
1
+ from .source import ServiceDeskSource
2
+
3
+ __all__ = ["ServiceDeskSource"]