classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
src/outputs/rest.py ADDED
@@ -0,0 +1,258 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Any, Literal, cast
5
+ from urllib.parse import urljoin
6
+
7
+ import requests # type: ignore[import-untyped]
8
+ from pydantic import BaseModel, ConfigDict, Field
9
+
10
+ from .base import OutputRuntimeContext, OutputType
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ def _drop_none_recursive(value: Any) -> Any:
16
+ if isinstance(value, dict):
17
+ return {key: _drop_none_recursive(item) for key, item in value.items() if item is not None}
18
+ if isinstance(value, list):
19
+ return [_drop_none_recursive(item) for item in value if item is not None]
20
+ return value
21
+
22
+
23
class BulkIngestAssetsRequest(BaseModel):
    """Request body for the bulk asset ingest call.

    Fields are set by their Python names; dumping with ``by_alias=True``
    emits the camelCase serialization aliases declared below.
    """

    model_config = ConfigDict(populate_by_name=True)

    # Runner the uploaded assets belong to.
    runner_id: str = Field(serialization_alias="runnerId")
    # Asset payloads, passed through as plain dicts.
    assets: list[dict[str, Any]]
    finalize_run: bool = Field(default=False, serialization_alias="finalizeRun")
    skip_findings: bool = Field(default=False, serialization_alias="skipFindings")
30
+
31
+
32
class FinalizeIngestRunRequest(BaseModel):
    """Request body for finalizing an ingest run.

    Dumping with ``by_alias=True`` emits ``runnerId`` and ``seenHashes``.
    """

    model_config = ConfigDict(populate_by_name=True)

    # Runner whose ingest run is being finalized.
    runner_id: str = Field(serialization_alias="runnerId")
    # Asset hashes reported during the run.
    seen_hashes: list[str] = Field(serialization_alias="seenHashes")
37
+
38
+
39
class UpdateRunnerStatusRequest(BaseModel):
    """Request body for setting a runner's terminal status.

    ``error_message`` is optional; dumping with ``by_alias=True`` emits it as
    ``errorMessage``.
    """

    model_config = ConfigDict(populate_by_name=True)

    # Only the two terminal states are accepted.
    status: Literal["COMPLETED", "ERROR"]
    error_message: str | None = Field(default=None, serialization_alias="errorMessage")
44
+
45
+
46
class ExternalRunnerResponse(BaseModel):
    """Subset of the external-runner creation response that this module reads."""

    # Identifier of the runner.
    id: str
    # Validated from the camelCase ``sourceId`` key of the response.
    source_id: str = Field(validation_alias="sourceId")
49
+
50
+
51
class RestOutputSink:
    """Output sink that pushes scan results to the REST API under ``base_url``.

    Lifecycle: ``start()`` makes sure a runner id exists, ``emit_batch()``
    uploads assets in size-bounded chunks, ``finish()`` finalizes the ingest
    run and marks the runner ``COMPLETED``, and ``fail()`` marks it ``ERROR``.

    NOTE(review): the public methods are ``async`` but every HTTP call goes
    through the synchronous ``requests`` session, so each call blocks until
    the response arrives or the timeout expires.
    """

    output_type: OutputType = "rest"

    # Keep each bulk request well under Fastify's 50 MB bodyLimit
    _MAX_BATCH_BYTES = 20 * 1024 * 1024  # 20 MB

    # Upper bound on asset hashes sent in one discovery request.
    _DISCOVER_CHUNK_SIZE = 500

    def __init__(
        self,
        context: OutputRuntimeContext,
        *,
        base_url: str,
        timeout_sec: int,
    ):
        """Store the runtime context and open an HTTP session.

        Args:
            context: runtime context providing ``batch_size``, ``runner_id``,
                ``source_id`` and ``managed_runner``.
            base_url: API root; trailing slashes are stripped.
            timeout_sec: timeout passed to every HTTP request.
        """
        self.context = context
        self.batch_size = context.batch_size
        self.base_url = base_url.rstrip("/")
        self.timeout_sec = timeout_sec
        self.session = requests.Session()
        self._runner_id = context.runner_id
        # Hashes of every asset emitted so far; sent when the run is finalized.
        self._seen_hashes: set[str] = set()

    async def start(self) -> None:
        """Ensure a runner id is available before any asset is emitted.

        A runner id already supplied through the context is used as-is.
        Otherwise an external runner is created for the source through the
        API, which is refused in managed-runner mode.

        Raises:
            ValueError: if the context has no ``source_id``, or
                ``managed_runner`` is set without a ``runner_id``.
            RuntimeError: propagated from ``_request_json`` on an HTTP error.
        """
        if not self.context.source_id:
            raise ValueError("REST output requires source_id")

        if self._runner_id:
            return

        if self.context.managed_runner:
            raise ValueError("managed_runner mode requires runner_id")

        payload = self._request_json(
            "POST",
            f"/sources/{self.context.source_id}/runners/external",
        )
        response = ExternalRunnerResponse.model_validate(payload)
        self._runner_id = response.id
        logger.info("Created external runner %s for source %s", response.id, response.source_id)

    async def emit_batch(
        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
    ) -> None:
        """Upload ``assets`` to the bulk ingest endpoint, split by payload size.

        Every asset's ``hash`` (when present) is remembered for ``finish()``.
        ``None`` values are stripped from each chunk before it is sent.

        Args:
            assets: asset payloads; an empty list is a no-op.
            skip_findings: forwarded as ``skipFindings`` on every request.

        Raises:
            ValueError: if ``source_id`` or the runner id is missing.
            RuntimeError: propagated from ``_request_json`` on an HTTP error.
        """
        if not assets:
            return

        source_id = self._require_source_id()
        runner_id = self._require_runner_id()

        for asset in assets:
            hash_value = asset.get("hash")
            if hash_value is not None:
                self._seen_hashes.add(str(hash_value))

        for chunk in self._split_by_size(assets):
            cleaned_chunk = cast(list[dict[str, Any]], _drop_none_recursive(chunk))
            payload = BulkIngestAssetsRequest(
                runner_id=runner_id,
                assets=cleaned_chunk,
                finalize_run=False,
                skip_findings=skip_findings,
            )
            self._request_json(
                "POST",
                f"/sources/{source_id}/assets/bulk",
                payload.model_dump(mode="json", by_alias=True),
            )

    def _split_by_size(self, assets: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
        """Split assets into chunks that each stay under _MAX_BATCH_BYTES.

        Sizes are estimated from each asset's UTF-8 encoded JSON form. A single
        asset that alone exceeds the limit still gets a chunk of its own.
        """
        import json

        chunks: list[list[dict[str, Any]]] = []
        current: list[dict[str, Any]] = []
        current_bytes = 0

        for asset in assets:
            # default=str keeps the estimate from raising on values that are
            # not natively JSON-serializable (e.g. datetimes); the real
            # serialization happens later, when the request model is dumped.
            encoded = json.dumps(asset, ensure_ascii=False, default=str).encode()
            asset_bytes = len(encoded)
            if current and current_bytes + asset_bytes > self._MAX_BATCH_BYTES:
                chunks.append(current)
                current = []
                current_bytes = 0
            current.append(asset)
            current_bytes += asset_bytes

        if current:
            chunks.append(current)

        return chunks

    async def finish(self) -> None:
        """Finalize the ingest run, then mark the runner ``COMPLETED``.

        Raises:
            ValueError: if ``source_id`` or the runner id is missing.
            RuntimeError: propagated from ``_request_json`` on an HTTP error.
        """
        source_id = self._require_source_id()
        runner_id = self._require_runner_id()

        payload = FinalizeIngestRunRequest(
            runner_id=runner_id,
            seen_hashes=sorted(self._seen_hashes),
        )
        self._request_json(
            "POST",
            f"/sources/{source_id}/assets/finalize",
            payload.model_dump(mode="json", by_alias=True),
        )

        status_payload = UpdateRunnerStatusRequest(status="COMPLETED")
        self._request_json(
            "PATCH",
            f"/runners/{runner_id}/status",
            # Same dump options as fail(): camelCase keys and no null
            # errorMessage, instead of a stray snake_case "error_message": null.
            status_payload.model_dump(mode="json", by_alias=True, exclude_none=True),
        )

    async def fail(self, error: Exception) -> None:
        """Best-effort: report ``error`` by setting the runner status to ``ERROR``.

        Does nothing when no runner id is known. A failure of the status
        update itself is logged and swallowed so it cannot mask ``error``.
        """
        if not self._runner_id:
            return

        error_message = f"{type(error).__name__}: {error}"
        try:
            payload = UpdateRunnerStatusRequest(status="ERROR", error_message=error_message)
            self._request_json(
                "PATCH",
                f"/runners/{self._runner_id}/status",
                payload.model_dump(mode="json", by_alias=True, exclude_none=True),
            )
        except Exception as update_error:
            logger.warning(
                "Failed to update runner status to ERROR after failure %s: %s",
                error,
                update_error,
            )

    async def register_discovered_assets(self, hashes: list[str]) -> None:
        """Send ``hashes`` to the runner's discover endpoint in fixed-size chunks.

        Raises:
            ValueError: if the runner id is missing.
            RuntimeError: propagated from ``_request_json`` on an HTTP error.
        """
        runner_id = self._require_runner_id()
        step = self._DISCOVER_CHUNK_SIZE
        for offset in range(0, len(hashes), step):
            chunk = hashes[offset : offset + step]
            self._request_json(
                "POST",
                f"/runners/{runner_id}/assets/discover",
                {"assetHashes": chunk},
            )

    async def update_asset_status(
        self,
        asset_hash: str,
        status: str,
        error_message: str | None = None,
        findings_total: int | None = None,
        findings_by_severity: dict[str, int] | None = None,
        findings_by_detector: dict[str, dict[str, int]] | None = None,
    ) -> None:
        """Send the status of one asset to the runner's asset-status endpoint.

        Optional arguments left as ``None`` are omitted from the request.
        ``error_message`` is cut to its first 2000 characters.

        Raises:
            ValueError: if the runner id is missing.
            RuntimeError: propagated from ``_request_json`` on an HTTP error.
        """
        runner_id = self._require_runner_id()
        item: dict[str, Any] = {"assetHash": asset_hash, "status": status}
        if error_message is not None:
            item["errorMessage"] = error_message[:2000]
        if findings_total is not None:
            item["findingsTotal"] = findings_total
        if findings_by_severity is not None:
            item["findingsBySeverity"] = findings_by_severity
        if findings_by_detector is not None:
            item["findingsByDetector"] = findings_by_detector
        self._request_json(
            "PATCH",
            f"/runners/{runner_id}/assets/status",
            {"assets": [item]},
        )

    def _require_source_id(self) -> str:
        """Return the context's ``source_id`` or raise ``ValueError`` if unset."""
        source_id = self.context.source_id
        if not source_id:
            raise ValueError("source_id is required for REST output")
        return source_id

    def _require_runner_id(self) -> str:
        """Return the current runner id or raise ``ValueError`` if unset."""
        if not self._runner_id:
            raise ValueError("runner_id is required for REST output")
        return self._runner_id

    def _request_json(
        self,
        method: str,
        path: str,
        payload: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Send one request and return the decoded JSON object.

        Args:
            method: HTTP method name.
            path: path relative to ``base_url``; a leading slash is optional.
            payload: JSON body, or ``None`` for no body.

        Returns:
            The response object, or ``{}`` when the body is empty or is not
            valid JSON.

        Raises:
            RuntimeError: on a status code >= 400, or when the body is valid
                JSON but not an object.
        """
        # Strip the leading slash so urljoin appends to base_url instead of
        # replacing whatever path component base_url already carries.
        url = urljoin(f"{self.base_url}/", path.lstrip("/"))
        response = self.session.request(
            method=method,
            url=url,
            json=payload,
            timeout=self.timeout_sec,
        )

        if response.status_code >= 400:
            body_preview = response.text.strip()[:400]
            raise RuntimeError(
                f"REST output request failed ({method} {url}): "
                f"{response.status_code} {response.reason} {body_preview}"
            )

        if not response.text.strip():
            return {}

        try:
            parsed = response.json()
        except ValueError:
            # Deliberately tolerated: a successful response with a non-JSON
            # body is treated as empty, but leave a trace for debugging.
            logger.debug("Ignoring non-JSON response body from %s %s", method, url)
            return {}

        if not isinstance(parsed, dict):
            raise RuntimeError(f"Expected JSON object response from {method} {url}")

        return cast(dict[str, Any], parsed)
@@ -0,0 +1,7 @@
1
+ """Pipeline for processing assets through detectors."""
2
+
3
+ from .content_provider import ContentProvider
4
+ from .detector_pipeline import DetectorPipeline
5
+ from .parsed_content_provider import ParsedContentProvider
6
+
7
+ __all__ = ["ContentProvider", "DetectorPipeline", "ParsedContentProvider"]
@@ -0,0 +1,26 @@
1
+ """Protocol for content access — decouples the pipeline from source internals."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncGenerator
6
+ from typing import Protocol, runtime_checkable
7
+
8
+ from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
9
+
10
+
11
@runtime_checkable
class ContentProvider(Protocol):
    """Structural interface the pipeline uses to read content and enrich findings."""

    # NOTE(review): this is an `async def` annotated as returning an
    # AsyncGenerator and has no `yield`, so type checkers read it as a
    # coroutine that resolves to a generator - confirm that implementers and
    # callers agree on whether the call must be awaited first.
    async def fetch_text_pages(self, asset_id: str) -> AsyncGenerator[str, None]:
        """Provide the asset's text as an async stream of strings."""

    async def fetch_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
        """Provide a ``(bytes, str)`` pair for the asset, or ``None``.

        The meaning of the string element is not visible in this module.
        """

    def enrich_finding_location(
        self,
        finding: DetectionResult,
        asset: SingleAssetScanResults,
        text_content: str,
    ) -> None:
        """Enrich ``finding`` with location details; nothing is returned."""

    def resolve_link_for_detection(self, link: str) -> str | None:
        """Map ``link`` to the string detectors should use, or ``None``."""