classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,130 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from collections.abc import Iterator
5
+ from typing import Any
6
+ from urllib.parse import quote
7
+
8
+ from ...models.generated_input import AzureBlobStorageInput
9
+ from ..dependencies import require_module
10
+ from ..object_storage.base import ObjectRef, ObjectStorageSourceBase
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class AzureBlobStorageSource(ObjectStorageSourceBase):
    """Object-storage source backed by a single Azure Blob Storage container.

    Listing, filtering limits, masked secrets and client caching rely on
    helpers inherited from ``ObjectStorageSourceBase`` (``_masked_value``,
    ``_prefix``, ``_max_keys_per_page``, ...), which are defined outside this
    module.
    """

    source_type = "azure_blob_storage"
    provider_label = "AZURE_BLOB_STORAGE"
    input_model = AzureBlobStorageInput

    def _required_container(self) -> str:
        """Return the configured container name with surrounding whitespace removed.

        Raises:
            ValueError: If ``required.container`` is unset or blank.
        """
        raw = self.config.required.container
        # An unset value must not be stringified to the truthy literal "None",
        # otherwise it would slip past the emptiness check below.
        container = "" if raw is None else str(raw).strip()
        if not container:
            raise ValueError("required.container must be set")
        return container

    def _required_account_url(self) -> str:
        """Return the configured account URL without trailing slashes.

        Raises:
            ValueError: If ``required.account_url`` is unset or blank.
        """
        raw = self.config.required.account_url
        # Same guard as for the container: None means "not configured".
        account_url = "" if raw is None else str(raw).strip()
        if not account_url:
            raise ValueError("required.account_url must be set")
        return account_url.rstrip("/")

    def _build_client(self) -> Any:
        """Create a ``BlobServiceClient`` from the first credential that is configured.

        Precedence: connection string, account key, SAS token, service
        principal (client id + secret + tenant id), and finally
        ``DefaultAzureCredential``.

        Returns:
            A ``BlobServiceClient`` instance from ``azure.storage.blob``.

        Raises:
            ValueError: If no connection string is configured and
                ``required.account_url`` is missing.
        """
        blob_module = require_module(
            module_name="azure.storage.blob",
            source_name="Azure Blob Storage",
            uv_groups=["azure-blob-storage"],
            detail="Azure Blob storage requires azure-storage-blob.",
        )
        blob_service_client_cls = blob_module.BlobServiceClient

        # A connection string carries its own endpoint, so no account URL is needed.
        connection_string = self._masked_value("azure_connection_string")
        if connection_string:
            return blob_service_client_cls.from_connection_string(connection_string)

        account_url = self._required_account_url()
        account_key = self._masked_value("azure_account_key")
        sas_token = self._masked_value("azure_sas_token")

        if account_key:
            return blob_service_client_cls(account_url=account_url, credential=account_key)
        if sas_token:
            return blob_service_client_cls(account_url=account_url, credential=sas_token)

        client_id = self._masked_value("azure_client_id")
        client_secret = self._masked_value("azure_client_secret")
        tenant_id = self._masked_value("azure_tenant_id")

        # azure-identity is only needed once key/SAS/connection-string auth is ruled out.
        identity_module = require_module(
            module_name="azure.identity",
            source_name="Azure Blob Storage",
            uv_groups=["azure-blob-storage"],
            detail="Managed identity and service principal auth require azure-identity.",
        )
        if client_id and client_secret and tenant_id:
            credential = identity_module.ClientSecretCredential(
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
        else:
            credential = identity_module.DefaultAzureCredential()

        return blob_service_client_cls(account_url=account_url, credential=credential)

    def _client(self) -> Any:
        """Return the service client, building it on first use and caching it afterwards."""
        # NOTE(review): ``_cached_client`` is presumably initialised to None by the
        # base class; it is not assigned anywhere in this module.
        if self._cached_client is None:
            self._cached_client = self._build_client()
        return self._cached_client

    def _list_objects(self) -> Iterator[ObjectRef]:
        """Yield an ``ObjectRef`` for every blob that passes the configured filters.

        Skipped entries: blobs without a name, names ending in ``/``,
        zero-byte blobs (unless empty objects are included) and names rejected
        by the extension filters.
        """
        blob_service_client = self._client()
        container_client = blob_service_client.get_container_client(self._required_container())

        # The page size is a keyword of ``list_blobs``; ``ItemPaged.by_page()``
        # only accepts a continuation token, so passing ``results_per_page`` to
        # ``by_page`` raises ``TypeError``.
        blob_pages = container_client.list_blobs(
            name_starts_with=self._prefix(),
            results_per_page=self._max_keys_per_page(),
            timeout=self._request_timeout_seconds(),
        ).by_page()

        for page in blob_pages:
            for item in page:
                key = str(getattr(item, "name", "") or "")
                if not key or key.endswith("/"):
                    continue

                size = int(getattr(item, "size", 0) or 0)
                if size == 0 and not self._include_empty_objects():
                    continue
                if not self._object_matches_extension_filters(key):
                    continue

                content_settings = getattr(item, "content_settings", None)
                content_type_hint = getattr(content_settings, "content_type", None)
                yield ObjectRef(
                    key=key,
                    size=size,
                    last_modified=self._parse_datetime(getattr(item, "last_modified", None)),
                    etag=str(getattr(item, "etag", "") or "") or None,
                    content_type_hint=str(content_type_hint) if content_type_hint else None,
                )

    def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
        """Download a blob, requesting at most the configured maximum number of bytes.

        Args:
            ref: Reference produced by ``_list_objects``.

        Returns:
            ``(file_bytes, content_type_hint, truncated)`` where ``truncated``
            is True when the listed size exceeds the byte limit.
        """
        blob_service_client = self._client()
        container_client = blob_service_client.get_container_client(self._required_container())
        blob_client = container_client.get_blob_client(ref.key)

        max_bytes = self._max_object_bytes()
        timeout = self._request_timeout_seconds()
        truncated = ref.size > max_bytes
        # Request a ranged read only when the blob is larger than the limit.
        length = max_bytes if truncated else None

        downloader = blob_client.download_blob(offset=0, length=length, timeout=timeout)
        file_bytes = downloader.readall()
        return file_bytes, ref.content_type_hint, truncated

    def _external_url(self, key: str) -> str:
        """Build the ``<account_url>/<container>/<key>`` URL for a blob.

        The container is fully percent-encoded; ``/`` inside the key is kept
        so virtual directory separators remain readable.
        """
        # ``_required_account_url`` already strips trailing slashes.
        account_url = self._required_account_url()
        encoded_container = quote(self._required_container(), safe="")
        encoded_key = quote(key, safe="/")
        return f"{account_url}/{encoded_container}/{encoded_key}"
src/sources/base.py ADDED
@@ -0,0 +1,296 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from collections.abc import AsyncGenerator, Generator
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
7
+
8
+ if TYPE_CHECKING:
9
+ from ..utils.file_parser import ParsedBytes
10
+ from ..utils.hashing import calculate_checksum, normalize_http_url
11
+ from ..utils.validation import validate_output
12
+ from .recipe_normalizer import normalize_source_recipe
13
+
14
+
15
+ class BaseSource(ABC):
16
+ """
17
+ Abstract base class for all metadata extraction sources.
18
+ """
19
+
20
+ # Default batch size for streaming asset results
21
+ BATCH_SIZE: int = 50
22
+ HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
23
+
24
+ def __init__(
25
+ self,
26
+ recipe: dict[str, Any],
27
+ source_id: str | None = None,
28
+ runner_id: str | None = None,
29
+ ):
30
+ """
31
+ Initialize the source with a validated recipe.
32
+
33
+ Args:
34
+ recipe: The source configuration recipe
35
+ source_id: Optional source ID (for API runs)
36
+ runner_id: Optional runner ID (for API runs)
37
+ """
38
+ normalized_recipe = normalize_source_recipe(recipe, recipe.get("type"))
39
+ self._apply_initial_sampling_override(normalized_recipe)
40
+ recipe.clear()
41
+ recipe.update(normalized_recipe)
42
+ self.recipe = normalized_recipe
43
+ self.source_id = source_id
44
+ self.runner_id = runner_id
45
+ self._aborted = False
46
+ self._discovery_only = False
47
+ self._attachment_name_by_hash: dict[str, str] = {}
48
+
49
+ def _apply_initial_sampling_override(self, recipe: dict[str, Any]) -> None:
50
+ pass
51
+
52
+ @staticmethod
53
+ def _read_bool_env(name: str) -> bool | None:
54
+ raw = os.environ.get(name)
55
+ if raw is None:
56
+ return None
57
+ normalized = raw.strip().lower()
58
+ if normalized in {"1", "true", "yes", "y", "on"}:
59
+ return True
60
+ if normalized in {"0", "false", "no", "n", "off"}:
61
+ return False
62
+ return None
63
+
64
+ def set_discovery_only(self, value: bool) -> None:
65
+ self._discovery_only = value
66
+
67
+ def evict_asset_cache(self, asset_hash: str) -> None:
68
+ """Free cached content for a processed asset. Override in subclasses."""
69
+ pass
70
+
71
+ @abstractmethod
72
+ def test_connection(self) -> dict[str, Any]:
73
+ """
74
+ Verify that the connection to the source is working.
75
+ Should return a dictionary conforming to the test-connection schema.
76
+ """
77
+ pass
78
+
79
+ STREAM_DETECTIONS: bool = False
80
+
81
+ async def extract(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
82
+ """
83
+ Orchestrates extraction + detection. Calls ``extract_raw()`` for batches,
84
+ then runs the detector pipeline (if configured) before yielding results.
85
+
86
+ Sources should override ``extract_raw()`` instead of this method.
87
+ """
88
+ pipeline = self._build_pipeline()
89
+ async for batch in self.extract_raw():
90
+ if pipeline:
91
+ if self.STREAM_DETECTIONS:
92
+ async for processed in pipeline.process_stream(batch):
93
+ yield [processed]
94
+ continue
95
+ batch = await pipeline.process(batch) # noqa: PLW2901
96
+ if batch:
97
+ yield batch
98
+
99
+ @abstractmethod
100
+ async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
101
+ """
102
+ The main extraction logic. Yields batches of raw assets **without**
103
+ running detectors. The base ``extract()`` wraps this with pipeline
104
+ processing automatically.
105
+
106
+ Yields:
107
+ Batches of SingleAssetScanResults objects
108
+ """
109
+ yield []
110
+
111
+ def _build_pipeline(self) -> Any:
112
+ config = getattr(self, "config", None)
113
+ detectors = getattr(config, "detectors", None) if config else None
114
+ if not detectors or not any(getattr(d, "enabled", False) for d in detectors):
115
+ return None
116
+ from ..pipeline.detector_pipeline import DetectorPipeline
117
+
118
+ return DetectorPipeline.from_recipe(self.recipe, self, self.runner_id)
119
+
120
+ @abstractmethod
121
+ def generate_hash_id(self, asset_id: str) -> str:
122
+ """
123
+ Generate a unique stable ID for an asset.
124
+ """
125
+ pass
126
+
127
+ def calculate_checksum(self, data: dict[str, Any]) -> str:
128
+ """
129
+ Calculate a stable SHA-256 checksum for a dictionary.
130
+ """
131
+ return calculate_checksum(data)
132
+
133
+ @abstractmethod
134
+ def abort(self) -> None:
135
+ """
136
+ Signal the source to stop extraction as soon as possible.
137
+ """
138
+ self._aborted = True
139
+
140
+ def cleanup(self) -> None:
141
+ """
142
+ Optional: Clean up resources (close sessions, delete temp files).
143
+ """
144
+ # Default implementation does nothing.
145
+
146
+ def get_stats(self) -> dict[str, Any]:
147
+ """
148
+ Optional: Return statistics about the current extraction (total items, success/fail counts).
149
+ """
150
+ return {}
151
+
152
+ def discover(self) -> dict[str, Any]:
153
+ """
154
+ Optional: Discover available resources (e.g., list all spaces/projects)
155
+ without performing a full extraction.
156
+ """
157
+ return {}
158
+
159
+ def validate_output(self, data: dict[str, Any]) -> None:
160
+ """
161
+ Optional: Use the validation utility to ensure output conforms to schema.
162
+ Can be called during extraction to fail early on bad data.
163
+ """
164
+ source_type = self.recipe.get("type", "").lower()
165
+ validate_output(data, source_type)
166
+
167
+ def ensure_location(self, external_url: str, *, fallback: str | None = None) -> str:
168
+ """
169
+ Ensure the asset has a non-empty external URL.
170
+ """
171
+ location = (external_url or "").strip()
172
+ if location:
173
+ return location
174
+
175
+ if fallback:
176
+ fallback_value = fallback.strip()
177
+ if fallback_value:
178
+ return fallback_value
179
+
180
+ raise ValueError("Asset external_url is required")
181
+
182
+ def _attachment_file_name(self, asset_id: str, fallback_url: str) -> str:
183
+ """Return the stored file name for an attachment, or fallback_url if not recorded."""
184
+ stored = self._attachment_name_by_hash.get(asset_id)
185
+ if isinstance(stored, str) and stored.strip():
186
+ return stored.strip()
187
+ return fallback_url
188
+
189
+ def ocr_enabled(self) -> bool:
190
+ """Return whether sampling-level OCR is enabled for this source."""
191
+ config = getattr(self, "config", None)
192
+ sampling = getattr(config, "sampling", None) if config is not None else None
193
+ return bool(getattr(sampling, "enable_ocr", False))
194
+
195
+ def parse_asset_bytes(
196
+ self,
197
+ file_bytes: bytes,
198
+ *,
199
+ declared_mime_type: str | None = None,
200
+ file_name: str = "",
201
+ ) -> "ParsedBytes":
202
+ from ..utils.file_parser import parse_bytes
203
+
204
+ return parse_bytes(
205
+ file_bytes,
206
+ declared_mime_type=declared_mime_type,
207
+ file_name=file_name,
208
+ enable_ocr=self.ocr_enabled(),
209
+ )
210
+
211
+ def iter_asset_pages(
212
+ self,
213
+ file_bytes: bytes,
214
+ mime_type: str,
215
+ batch_size: int = 100,
216
+ include_column_names: bool = True,
217
+ *,
218
+ file_name: str = "",
219
+ ) -> Generator[str, None, None]:
220
+ from ..utils.file_parser import iter_file_pages
221
+
222
+ return iter_file_pages(
223
+ file_bytes,
224
+ mime_type,
225
+ batch_size,
226
+ include_column_names,
227
+ file_name=file_name,
228
+ enable_ocr=self.ocr_enabled(),
229
+ )
230
+
231
+ async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
232
+ """
233
+ Fetch raw bytes and MIME type for an asset (for binary/image detectors).
234
+
235
+ Returns (raw_bytes, mime_type) or None if binary content is not available.
236
+ Sources that store raw file bytes should override this method.
237
+ """
238
+ return None
239
+
240
+ async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
241
+ """
242
+ Async generator yielding (raw_content, text_content) pages for an asset.
243
+
244
+ Default: yields a single result from fetch_content.
245
+ Tabular sources override this to stream pages for ALL strategy.
246
+ """
247
+ result = await self.fetch_content(asset_id)
248
+ if result:
249
+ yield result
250
+
251
+ async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
252
+ """
253
+ Fetch full content for an asset (for detector scanning).
254
+
255
+ This method should be implemented by sources that support content fetching.
256
+ It retrieves the full content of an asset given its identifier.
257
+
258
+ Args:
259
+ asset_id: Asset identifier (page_id, post_id, document_id, etc.)
260
+
261
+ Returns:
262
+ Tuple of (raw_content, text_content) where:
263
+ - raw_content: Original HTML/markup content
264
+ - text_content: Plain text extracted from content
265
+ Returns None if content fetching is not supported or fails.
266
+
267
+ Note:
268
+ Default implementation returns None. Sources that support detector
269
+ integration should override this method.
270
+ """
271
+ return None
272
+
273
+ def enrich_finding_location(
274
+ self,
275
+ finding: DetectionResult,
276
+ asset: SingleAssetScanResults,
277
+ text_content: str,
278
+ ) -> None:
279
+ """
280
+ Set a human-readable path on finding.location so users can find the source.
281
+
282
+ Override per source type:
283
+ - Tabular (PostgreSQL, MySQL): "schema.table, row N"
284
+ - Web (WordPress): the page URL
285
+ - Slack: permalink or "channel / message_ts"
286
+ """
287
+ pass
288
+
289
+ def resolve_link_for_detection(self, link: str) -> str | None:
290
+ """
291
+ Resolve a stored asset link into a concrete HTTP(S) URL for link-based detectors.
292
+
293
+ Sources that store non-URL link identifiers (for example, hashed IDs) can override
294
+ this and map those identifiers back to their original URLs.
295
+ """
296
+ return normalize_http_url(link)
@@ -0,0 +1,3 @@
1
+ from .source import ConfluenceSource
2
+
3
+ __all__ = ["ConfluenceSource"]