classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,774 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import random
|
|
6
|
+
from collections.abc import AsyncGenerator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import UTC, datetime, timedelta
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import requests
|
|
12
|
+
|
|
13
|
+
from ...models.generated_input import (
|
|
14
|
+
PowerBIInput,
|
|
15
|
+
PowerBIMaskedAccessToken,
|
|
16
|
+
PowerBIMaskedClientSecret,
|
|
17
|
+
PowerBIOptionalConnection,
|
|
18
|
+
PowerBIOptionalExtraction,
|
|
19
|
+
PowerBIOptionalScope,
|
|
20
|
+
PowerBIRequiredAccessToken,
|
|
21
|
+
PowerBIRequiredServicePrincipal,
|
|
22
|
+
SamplingConfig,
|
|
23
|
+
SamplingStrategy,
|
|
24
|
+
)
|
|
25
|
+
from ...models.generated_single_asset_scan_results import (
|
|
26
|
+
AssetType as OutputAssetType,
|
|
27
|
+
)
|
|
28
|
+
from ...models.generated_single_asset_scan_results import (
|
|
29
|
+
DetectionResult,
|
|
30
|
+
Location,
|
|
31
|
+
SingleAssetScanResults,
|
|
32
|
+
)
|
|
33
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
34
|
+
from ..base import BaseSource
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class PowerBIAssetRef:
    """Immutable reference to one Power BI asset found during discovery."""

    # Source-local identifier; discovery builds it from the workspace id,
    # the asset kind and the asset id joined with "_#_".
    raw_id: str
    # Asset kind; discovery in this module passes "workspace", "dataset",
    # "report" or "dashboard".
    kind: str
    # Identifier and display name of the workspace the asset belongs to.
    workspace_id: str
    workspace_name: str
    # Identifier of the asset itself (equal to workspace_id for workspaces).
    asset_id: str
    # Display name of the asset.
    name: str
    # Link to the asset in the Power BI web application.
    external_url: str
    # Payload collected for the asset during discovery.
    metadata: dict[str, Any]
    # raw_id values of related assets (for example the owning workspace).
    linked_raw_ids: list[str]
    # Timestamps resolved when the reference is created.
    created_at: datetime
    updated_at: datetime
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PowerBISource(BaseSource):
|
|
55
|
+
source_type = "powerbi"
|
|
56
|
+
|
|
57
|
+
API_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
|
|
58
|
+
DEFAULT_AUTHORITY_URL = "https://login.microsoftonline.com"
|
|
59
|
+
DEFAULT_API_BASE_URL = "https://api.powerbi.com/v1.0/myorg"
|
|
60
|
+
|
|
61
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and set up the HTTP session and caches.

    Args:
        recipe: Raw source recipe; validated into ``PowerBIInput``.
        source_id: Optional source identifier forwarded to ``BaseSource``.
        runner_id: Optional runner identifier; ``"local-run"`` is stored
            on this instance when the value is falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = PowerBIInput.model_validate(recipe)
    # Assigned after the base initialiser so the fallback is what sticks.
    self.runner_id = runner_id or "local-run"

    self.session = requests.Session()
    # Cached service-principal bearer token and the time it stops being reused.
    self._access_token: str | None = None
    self._access_token_expiry: datetime | None = None

    # Asset hash -> discovered reference (filled by extract_raw).
    self._asset_lookup: dict[str, PowerBIAssetRef] = {}
    # Asset id -> (raw_content, text_content) pairs (filled by fetch_content).
    self._content_cache: dict[str, tuple[str, str]] = {}
|
|
77
|
+
|
|
78
|
+
def _asset_type_value(self) -> str:
|
|
79
|
+
type_value = self.config.type
|
|
80
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
81
|
+
|
|
82
|
+
def _sampling(self) -> SamplingConfig:
|
|
83
|
+
return self.config.sampling
|
|
84
|
+
|
|
85
|
+
def _connection_options(self) -> PowerBIOptionalConnection:
|
|
86
|
+
if self.config.optional and self.config.optional.connection:
|
|
87
|
+
return self.config.optional.connection
|
|
88
|
+
return PowerBIOptionalConnection()
|
|
89
|
+
|
|
90
|
+
def _scope_options(self) -> PowerBIOptionalScope:
|
|
91
|
+
if self.config.optional and self.config.optional.scope:
|
|
92
|
+
return self.config.optional.scope
|
|
93
|
+
return PowerBIOptionalScope()
|
|
94
|
+
|
|
95
|
+
def _extraction_options(self) -> PowerBIOptionalExtraction:
|
|
96
|
+
if self.config.optional and self.config.optional.extraction:
|
|
97
|
+
return self.config.optional.extraction
|
|
98
|
+
return PowerBIOptionalExtraction()
|
|
99
|
+
|
|
100
|
+
def _is_service_principal_mode(self) -> bool:
    """Tell whether the required section describes service-principal auth."""
    required_section = self.config.required
    return isinstance(required_section, PowerBIRequiredServicePrincipal)
|
|
102
|
+
|
|
103
|
+
def _is_access_token_mode(self) -> bool:
    """Tell whether the required section describes static access-token auth."""
    required_section = self.config.required
    return isinstance(required_section, PowerBIRequiredAccessToken)
|
|
105
|
+
|
|
106
|
+
def _masked_client_secret(self) -> str:
    """Return the client secret from the masked configuration section.

    Raises:
        ValueError: If the masked section is not the client-secret variant.
    """
    secrets_section = self.config.masked
    if isinstance(secrets_section, PowerBIMaskedClientSecret):
        return secrets_section.client_secret
    raise ValueError("POWERBI SERVICE_PRINCIPAL auth requires masked.client_secret")
|
|
111
|
+
|
|
112
|
+
def _masked_access_token(self) -> str:
    """Return the access token from the masked configuration section.

    Raises:
        ValueError: If the masked section is not the access-token variant.
    """
    secrets_section = self.config.masked
    if isinstance(secrets_section, PowerBIMaskedAccessToken):
        return secrets_section.access_token
    raise ValueError("POWERBI ACCESS_TOKEN auth requires masked.access_token")
|
|
117
|
+
|
|
118
|
+
def _authority_url(self) -> str:
|
|
119
|
+
configured = self._connection_options().authority_url
|
|
120
|
+
base = str(configured) if configured is not None else self.DEFAULT_AUTHORITY_URL
|
|
121
|
+
return base.rstrip("/")
|
|
122
|
+
|
|
123
|
+
def _api_base_url(self) -> str:
|
|
124
|
+
configured = self._connection_options().api_base_url
|
|
125
|
+
base = str(configured) if configured is not None else self.DEFAULT_API_BASE_URL
|
|
126
|
+
return base.rstrip("/")
|
|
127
|
+
|
|
128
|
+
def _timeout_seconds(self) -> int:
|
|
129
|
+
timeout = self._connection_options().timeout_seconds
|
|
130
|
+
return int(timeout or 30)
|
|
131
|
+
|
|
132
|
+
def _token_endpoint(self) -> str:
    """Build the OAuth2 v2.0 token URL for the configured tenant.

    Raises:
        ValueError: If the source is not configured for service-principal auth.
    """
    principal = self.config.required
    if isinstance(principal, PowerBIRequiredServicePrincipal):
        authority = self._authority_url()
        return f"{authority}/{principal.tenant_id}/oauth2/v2.0/token"
    raise ValueError("Token endpoint is available only for SERVICE_PRINCIPAL mode")
|
|
137
|
+
|
|
138
|
+
def _normalize_bearer_token(self, token: str) -> str:
|
|
139
|
+
cleaned = token.strip()
|
|
140
|
+
if cleaned.lower().startswith("bearer "):
|
|
141
|
+
return cleaned
|
|
142
|
+
return f"Bearer {cleaned}"
|
|
143
|
+
|
|
144
|
+
def _is_access_token_expired(self) -> bool:
|
|
145
|
+
if self._access_token_expiry is None:
|
|
146
|
+
return True
|
|
147
|
+
return self._access_token_expiry <= datetime.now(UTC)
|
|
148
|
+
|
|
149
|
+
def _acquire_service_principal_token(self) -> str:
    """Request a client-credentials token and record when to stop reusing it.

    Returns:
        The access token normalised to a ``Bearer ...`` header value.

    Raises:
        ValueError: If the source is not in service-principal mode, or the
            token response carries no non-empty ``access_token`` string.
    """
    principal = self.config.required
    if not isinstance(principal, PowerBIRequiredServicePrincipal):
        raise ValueError("SERVICE_PRINCIPAL auth mode is required")

    form = dict(
        grant_type="client_credentials",
        client_id=principal.client_id,
        client_secret=self._masked_client_secret(),
        scope=self.API_SCOPE,
    )
    token_response = self.session.post(
        self._token_endpoint(),
        data=form,
        timeout=self._timeout_seconds(),
    )
    token_response.raise_for_status()

    token_body = token_response.json()
    raw_token = token_body.get("access_token")
    if not (isinstance(raw_token, str) and raw_token.strip()):
        raise ValueError("PowerBI token response did not include access_token")

    # Treat the token as expired 300 seconds early, never going below zero.
    lifetime = int(token_body.get("expires_in", 3600))
    reusable_for = max(lifetime - 300, 0)
    self._access_token_expiry = datetime.now(UTC) + timedelta(seconds=reusable_for)

    return self._normalize_bearer_token(raw_token)
|
|
179
|
+
|
|
180
|
+
def _access_token_value(self) -> str:
|
|
181
|
+
if self._is_access_token_mode():
|
|
182
|
+
return self._normalize_bearer_token(self._masked_access_token())
|
|
183
|
+
|
|
184
|
+
if self._access_token and not self._is_access_token_expired():
|
|
185
|
+
return self._access_token
|
|
186
|
+
|
|
187
|
+
self._access_token = self._acquire_service_principal_token()
|
|
188
|
+
return self._access_token
|
|
189
|
+
|
|
190
|
+
def _request_json(
|
|
191
|
+
self,
|
|
192
|
+
method: str,
|
|
193
|
+
path_or_url: str,
|
|
194
|
+
*,
|
|
195
|
+
params: dict[str, Any] | None = None,
|
|
196
|
+
json_payload: dict[str, Any] | None = None,
|
|
197
|
+
) -> dict[str, Any]:
|
|
198
|
+
url = (
|
|
199
|
+
path_or_url
|
|
200
|
+
if path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
|
201
|
+
else f"{self._api_base_url()}/{path_or_url.lstrip('/')}"
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
headers = {
|
|
205
|
+
"Authorization": self._access_token_value(),
|
|
206
|
+
"Accept": "application/json",
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
try:
|
|
210
|
+
response = self.session.request(
|
|
211
|
+
method,
|
|
212
|
+
url,
|
|
213
|
+
headers=headers,
|
|
214
|
+
params=params,
|
|
215
|
+
json=json_payload,
|
|
216
|
+
timeout=self._timeout_seconds(),
|
|
217
|
+
)
|
|
218
|
+
response.raise_for_status()
|
|
219
|
+
except requests.RequestException as exc:
|
|
220
|
+
raise RuntimeError(f"PowerBI request failed for {url}: {exc}") from exc
|
|
221
|
+
|
|
222
|
+
if response.status_code == 204 or not response.text.strip():
|
|
223
|
+
return {}
|
|
224
|
+
|
|
225
|
+
try:
|
|
226
|
+
return response.json()
|
|
227
|
+
except ValueError as exc:
|
|
228
|
+
raise RuntimeError(f"PowerBI returned invalid JSON for {url}") from exc
|
|
229
|
+
|
|
230
|
+
def _paged_values(
|
|
231
|
+
self,
|
|
232
|
+
path: str,
|
|
233
|
+
*,
|
|
234
|
+
params: dict[str, Any] | None = None,
|
|
235
|
+
) -> list[dict[str, Any]]:
|
|
236
|
+
collected: list[dict[str, Any]] = []
|
|
237
|
+
|
|
238
|
+
next_url: str | None = path
|
|
239
|
+
next_params = params
|
|
240
|
+
while next_url:
|
|
241
|
+
payload = self._request_json("get", next_url, params=next_params)
|
|
242
|
+
values = payload.get("value", [])
|
|
243
|
+
if isinstance(values, list):
|
|
244
|
+
for item in values:
|
|
245
|
+
if isinstance(item, dict):
|
|
246
|
+
collected.append(item)
|
|
247
|
+
|
|
248
|
+
potential_next = payload.get("@odata.nextLink")
|
|
249
|
+
next_url = potential_next if isinstance(potential_next, str) else None
|
|
250
|
+
next_params = None
|
|
251
|
+
|
|
252
|
+
return collected
|
|
253
|
+
|
|
254
|
+
def _parse_datetime(self, value: Any) -> datetime | None:
|
|
255
|
+
if isinstance(value, datetime):
|
|
256
|
+
return value if value.tzinfo else value.replace(tzinfo=UTC)
|
|
257
|
+
|
|
258
|
+
if isinstance(value, str):
|
|
259
|
+
cleaned = value.strip()
|
|
260
|
+
if not cleaned:
|
|
261
|
+
return None
|
|
262
|
+
|
|
263
|
+
normalized = cleaned.replace("Z", "+00:00")
|
|
264
|
+
try:
|
|
265
|
+
parsed = datetime.fromisoformat(normalized)
|
|
266
|
+
except ValueError:
|
|
267
|
+
return None
|
|
268
|
+
return parsed if parsed.tzinfo else parsed.replace(tzinfo=UTC)
|
|
269
|
+
|
|
270
|
+
return None
|
|
271
|
+
|
|
272
|
+
def _workspace_allowlist_ids(self) -> set[str]:
|
|
273
|
+
configured = self._scope_options().workspace_ids or []
|
|
274
|
+
return {item.strip() for item in configured if item and item.strip()}
|
|
275
|
+
|
|
276
|
+
def _workspace_allowlist_names(self) -> set[str]:
|
|
277
|
+
configured = self._scope_options().workspace_names or []
|
|
278
|
+
return {item.strip().lower() for item in configured if item and item.strip()}
|
|
279
|
+
|
|
280
|
+
def _workspace_allowed(self, workspace: dict[str, Any]) -> bool:
|
|
281
|
+
workspace_id = str(workspace.get("id") or "").strip()
|
|
282
|
+
workspace_name = str(workspace.get("name") or "").strip()
|
|
283
|
+
workspace_type = str(workspace.get("type") or "").strip().lower()
|
|
284
|
+
|
|
285
|
+
allow_ids = self._workspace_allowlist_ids()
|
|
286
|
+
allow_names = self._workspace_allowlist_names()
|
|
287
|
+
|
|
288
|
+
if allow_ids and workspace_id not in allow_ids:
|
|
289
|
+
return False
|
|
290
|
+
if allow_names and workspace_name.lower() not in allow_names:
|
|
291
|
+
return False
|
|
292
|
+
|
|
293
|
+
include_personal = bool(self._scope_options().include_personal_workspaces)
|
|
294
|
+
if not include_personal:
|
|
295
|
+
if workspace_type in {"personal", "personalgroup"}:
|
|
296
|
+
return False
|
|
297
|
+
if workspace_name.lower() in {"my workspace"}:
|
|
298
|
+
return False
|
|
299
|
+
|
|
300
|
+
return bool(workspace_id)
|
|
301
|
+
|
|
302
|
+
def _coerce_url(self, value: Any, fallback: str) -> str:
|
|
303
|
+
candidate = str(value or "").strip()
|
|
304
|
+
return candidate if candidate else fallback
|
|
305
|
+
|
|
306
|
+
def _workspace_raw_id(self, workspace_id: str) -> str:
|
|
307
|
+
return f"{workspace_id}_#_workspace"
|
|
308
|
+
|
|
309
|
+
def _dataset_raw_id(self, workspace_id: str, dataset_id: str) -> str:
|
|
310
|
+
return f"{workspace_id}_#_dataset_#_{dataset_id}"
|
|
311
|
+
|
|
312
|
+
def _report_raw_id(self, workspace_id: str, report_id: str) -> str:
|
|
313
|
+
return f"{workspace_id}_#_report_#_{report_id}"
|
|
314
|
+
|
|
315
|
+
def _dashboard_raw_id(self, workspace_id: str, dashboard_id: str) -> str:
|
|
316
|
+
return f"{workspace_id}_#_dashboard_#_{dashboard_id}"
|
|
317
|
+
|
|
318
|
+
def _workspace_external_url(self, workspace_id: str) -> str:
|
|
319
|
+
return f"https://app.powerbi.com/groups/{workspace_id}/list"
|
|
320
|
+
|
|
321
|
+
def _dataset_external_url(self, workspace_id: str, dataset_id: str) -> str:
|
|
322
|
+
return f"https://app.powerbi.com/groups/{workspace_id}/datasets/{dataset_id}/details"
|
|
323
|
+
|
|
324
|
+
def _report_external_url(self, workspace_id: str, report_id: str) -> str:
|
|
325
|
+
return f"https://app.powerbi.com/groups/{workspace_id}/reports/{report_id}"
|
|
326
|
+
|
|
327
|
+
def _dashboard_external_url(self, workspace_id: str, dashboard_id: str) -> str:
|
|
328
|
+
return f"https://app.powerbi.com/groups/{workspace_id}/dashboards/{dashboard_id}"
|
|
329
|
+
|
|
330
|
+
def _list_workspaces(self) -> list[dict[str, Any]]:
|
|
331
|
+
workspaces = self._paged_values("groups", params={"$top": 5000})
|
|
332
|
+
return [workspace for workspace in workspaces if self._workspace_allowed(workspace)]
|
|
333
|
+
|
|
334
|
+
def _list_datasets(self, workspace_id: str) -> list[dict[str, Any]]:
|
|
335
|
+
return self._paged_values(f"groups/{workspace_id}/datasets", params={"$top": 5000})
|
|
336
|
+
|
|
337
|
+
def _list_reports(self, workspace_id: str) -> list[dict[str, Any]]:
|
|
338
|
+
return self._paged_values(f"groups/{workspace_id}/reports", params={"$top": 5000})
|
|
339
|
+
|
|
340
|
+
def _list_dashboards(self, workspace_id: str) -> list[dict[str, Any]]:
|
|
341
|
+
return self._paged_values(f"groups/{workspace_id}/dashboards", params={"$top": 5000})
|
|
342
|
+
|
|
343
|
+
def _list_dataset_tables(
|
|
344
|
+
self,
|
|
345
|
+
workspace_id: str,
|
|
346
|
+
dataset_id: str,
|
|
347
|
+
) -> list[dict[str, Any]]:
|
|
348
|
+
try:
|
|
349
|
+
return self._paged_values(
|
|
350
|
+
f"groups/{workspace_id}/datasets/{dataset_id}/tables",
|
|
351
|
+
params={"$top": 5000},
|
|
352
|
+
)
|
|
353
|
+
except Exception as exc:
|
|
354
|
+
logger.debug(
|
|
355
|
+
"Failed to list tables for dataset %s in workspace %s: %s",
|
|
356
|
+
dataset_id,
|
|
357
|
+
workspace_id,
|
|
358
|
+
exc,
|
|
359
|
+
)
|
|
360
|
+
return []
|
|
361
|
+
|
|
362
|
+
def _to_asset_ref(
    self,
    *,
    raw_id: str,
    kind: str,
    workspace_id: str,
    workspace_name: str,
    asset_id: str,
    name: str,
    external_url: str,
    metadata: dict[str, Any],
    linked_raw_ids: list[str] | None = None,
) -> PowerBIAssetRef:
    """Build a ``PowerBIAssetRef`` and resolve its timestamps from metadata.

    Timestamps are looked up at the top level of ``metadata`` first and
    then inside ``metadata[kind]``. Discovery in this module stores the
    API payload under the kind key, so a top-level-only lookup never found
    a value and every asset was stamped with the current time.

    Args:
        raw_id: Source-local identifier of the asset.
        kind: Asset kind; also the metadata key holding the asset payload.
        workspace_id: Identifier of the owning workspace.
        workspace_name: Display name of the owning workspace.
        asset_id: Identifier of the asset.
        name: Display name of the asset.
        external_url: Link to the asset.
        metadata: Payload collected for the asset.
        linked_raw_ids: Raw ids of related assets; copied into a new list.

    Returns:
        The populated reference. ``created_at`` falls back to the current
        UTC time and ``updated_at`` falls back to ``created_at``.
    """
    sources: list[dict[str, Any]] = [metadata]
    nested_payload = metadata.get(kind)
    if isinstance(nested_payload, dict):
        sources.append(nested_payload)

    def first_timestamp(*keys: str) -> datetime | None:
        # Top-level metadata wins over the nested payload; within one
        # mapping the keys are tried in the order given.
        for source in sources:
            for key in keys:
                parsed = self._parse_datetime(source.get(key))
                if parsed is not None:
                    return parsed
        return None

    # NOTE(review): "createdDate" is assumed to be the creation field on
    # dataset payloads - confirm against the REST API reference.
    created_at = first_timestamp("createdDateTime", "createdDate") or datetime.now(UTC)
    updated_at = first_timestamp("modifiedDateTime", "lastUpdate") or created_at

    return PowerBIAssetRef(
        raw_id=raw_id,
        kind=kind,
        workspace_id=workspace_id,
        workspace_name=workspace_name,
        asset_id=asset_id,
        name=name,
        external_url=external_url,
        metadata=metadata,
        linked_raw_ids=list(linked_raw_ids or []),
        created_at=created_at,
        updated_at=updated_at,
    )
|
|
395
|
+
|
|
396
|
+
def _discover_assets(self) -> list[PowerBIAssetRef]:
|
|
397
|
+
extraction = self._extraction_options()
|
|
398
|
+
refs: list[PowerBIAssetRef] = []
|
|
399
|
+
|
|
400
|
+
for workspace in self._list_workspaces():
|
|
401
|
+
if self._aborted:
|
|
402
|
+
break
|
|
403
|
+
|
|
404
|
+
workspace_id = str(workspace.get("id") or "").strip()
|
|
405
|
+
workspace_name = str(workspace.get("name") or workspace_id)
|
|
406
|
+
if not workspace_id:
|
|
407
|
+
continue
|
|
408
|
+
|
|
409
|
+
workspace_raw_id = self._workspace_raw_id(workspace_id)
|
|
410
|
+
if extraction.extract_workspaces_to_containers is not False:
|
|
411
|
+
refs.append(
|
|
412
|
+
self._to_asset_ref(
|
|
413
|
+
raw_id=workspace_raw_id,
|
|
414
|
+
kind="workspace",
|
|
415
|
+
workspace_id=workspace_id,
|
|
416
|
+
workspace_name=workspace_name,
|
|
417
|
+
asset_id=workspace_id,
|
|
418
|
+
name=workspace_name,
|
|
419
|
+
external_url=self._workspace_external_url(workspace_id),
|
|
420
|
+
metadata={
|
|
421
|
+
"workspace": workspace,
|
|
422
|
+
"extract_workspaces_to_containers": bool(
|
|
423
|
+
extraction.extract_workspaces_to_containers
|
|
424
|
+
),
|
|
425
|
+
},
|
|
426
|
+
)
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
datasets = self._list_datasets(workspace_id)
|
|
430
|
+
dataset_raw_by_id: dict[str, str] = {}
|
|
431
|
+
extract_schema = extraction.extract_dataset_schema is not False
|
|
432
|
+
|
|
433
|
+
for dataset in datasets:
|
|
434
|
+
dataset_id = str(dataset.get("id") or "").strip()
|
|
435
|
+
if not dataset_id:
|
|
436
|
+
continue
|
|
437
|
+
|
|
438
|
+
dataset_raw_id = self._dataset_raw_id(workspace_id, dataset_id)
|
|
439
|
+
dataset_raw_by_id[dataset_id] = dataset_raw_id
|
|
440
|
+
|
|
441
|
+
metadata: dict[str, Any] = {
|
|
442
|
+
"workspace": {
|
|
443
|
+
"id": workspace_id,
|
|
444
|
+
"name": workspace_name,
|
|
445
|
+
},
|
|
446
|
+
"dataset": dataset,
|
|
447
|
+
"extract_datasets_to_containers": bool(
|
|
448
|
+
extraction.extract_datasets_to_containers
|
|
449
|
+
),
|
|
450
|
+
}
|
|
451
|
+
if extract_schema:
|
|
452
|
+
tables = self._list_dataset_tables(workspace_id, dataset_id)
|
|
453
|
+
if tables:
|
|
454
|
+
metadata["tables"] = tables
|
|
455
|
+
|
|
456
|
+
refs.append(
|
|
457
|
+
self._to_asset_ref(
|
|
458
|
+
raw_id=dataset_raw_id,
|
|
459
|
+
kind="dataset",
|
|
460
|
+
workspace_id=workspace_id,
|
|
461
|
+
workspace_name=workspace_name,
|
|
462
|
+
asset_id=dataset_id,
|
|
463
|
+
name=str(dataset.get("name") or dataset_id),
|
|
464
|
+
external_url=self._coerce_url(
|
|
465
|
+
dataset.get("webUrl"),
|
|
466
|
+
self._dataset_external_url(workspace_id, dataset_id),
|
|
467
|
+
),
|
|
468
|
+
metadata=metadata,
|
|
469
|
+
linked_raw_ids=[workspace_raw_id],
|
|
470
|
+
)
|
|
471
|
+
)
|
|
472
|
+
|
|
473
|
+
if extraction.extract_reports is not False:
|
|
474
|
+
for report in self._list_reports(workspace_id):
|
|
475
|
+
report_id = str(report.get("id") or "").strip()
|
|
476
|
+
if not report_id:
|
|
477
|
+
continue
|
|
478
|
+
|
|
479
|
+
raw_id = self._report_raw_id(workspace_id, report_id)
|
|
480
|
+
linked_raw_ids = [workspace_raw_id]
|
|
481
|
+
dataset_id = str(report.get("datasetId") or "").strip()
|
|
482
|
+
if dataset_id and dataset_id in dataset_raw_by_id:
|
|
483
|
+
linked_raw_ids.append(dataset_raw_by_id[dataset_id])
|
|
484
|
+
|
|
485
|
+
refs.append(
|
|
486
|
+
self._to_asset_ref(
|
|
487
|
+
raw_id=raw_id,
|
|
488
|
+
kind="report",
|
|
489
|
+
workspace_id=workspace_id,
|
|
490
|
+
workspace_name=workspace_name,
|
|
491
|
+
asset_id=report_id,
|
|
492
|
+
name=str(report.get("name") or report_id),
|
|
493
|
+
external_url=self._coerce_url(
|
|
494
|
+
report.get("webUrl"),
|
|
495
|
+
self._report_external_url(workspace_id, report_id),
|
|
496
|
+
),
|
|
497
|
+
metadata={
|
|
498
|
+
"workspace": {
|
|
499
|
+
"id": workspace_id,
|
|
500
|
+
"name": workspace_name,
|
|
501
|
+
},
|
|
502
|
+
"report": report,
|
|
503
|
+
},
|
|
504
|
+
linked_raw_ids=linked_raw_ids,
|
|
505
|
+
)
|
|
506
|
+
)
|
|
507
|
+
|
|
508
|
+
if extraction.extract_dashboards is not False:
|
|
509
|
+
for dashboard in self._list_dashboards(workspace_id):
|
|
510
|
+
dashboard_id = str(dashboard.get("id") or "").strip()
|
|
511
|
+
if not dashboard_id:
|
|
512
|
+
continue
|
|
513
|
+
|
|
514
|
+
raw_id = self._dashboard_raw_id(workspace_id, dashboard_id)
|
|
515
|
+
refs.append(
|
|
516
|
+
self._to_asset_ref(
|
|
517
|
+
raw_id=raw_id,
|
|
518
|
+
kind="dashboard",
|
|
519
|
+
workspace_id=workspace_id,
|
|
520
|
+
workspace_name=workspace_name,
|
|
521
|
+
asset_id=dashboard_id,
|
|
522
|
+
name=str(
|
|
523
|
+
dashboard.get("displayName")
|
|
524
|
+
or dashboard.get("name")
|
|
525
|
+
or dashboard_id
|
|
526
|
+
),
|
|
527
|
+
external_url=self._coerce_url(
|
|
528
|
+
dashboard.get("webUrl"),
|
|
529
|
+
self._dashboard_external_url(workspace_id, dashboard_id),
|
|
530
|
+
),
|
|
531
|
+
metadata={
|
|
532
|
+
"workspace": {
|
|
533
|
+
"id": workspace_id,
|
|
534
|
+
"name": workspace_name,
|
|
535
|
+
},
|
|
536
|
+
"dashboard": dashboard,
|
|
537
|
+
},
|
|
538
|
+
linked_raw_ids=[workspace_raw_id],
|
|
539
|
+
)
|
|
540
|
+
)
|
|
541
|
+
|
|
542
|
+
return refs
|
|
543
|
+
|
|
544
|
+
def _sampling_sort_datetime(self, ref: PowerBIAssetRef, field_name: str) -> datetime | None:
|
|
545
|
+
candidates = [
|
|
546
|
+
ref.metadata.get(field_name),
|
|
547
|
+
ref.metadata.get("report", {}).get(field_name),
|
|
548
|
+
ref.metadata.get("dataset", {}).get(field_name),
|
|
549
|
+
ref.metadata.get("dashboard", {}).get(field_name),
|
|
550
|
+
ref.metadata.get("workspace", {}).get(field_name),
|
|
551
|
+
]
|
|
552
|
+
for value in candidates:
|
|
553
|
+
parsed = self._parse_datetime(value)
|
|
554
|
+
if parsed is not None:
|
|
555
|
+
return parsed
|
|
556
|
+
return None
|
|
557
|
+
|
|
558
|
+
def _sample_refs(self, refs: list[PowerBIAssetRef]) -> list[PowerBIAssetRef]:
    """Reduce the discovered refs according to the sampling configuration.

    * ``ALL`` returns ``refs`` unchanged.
    * ``RANDOM`` returns a deterministic (seed 0) sample of at most
      ``rows_per_page`` refs (100 when unset), kept in discovery order.
    * Any other strategy orders refs by the ``order_by_column`` timestamp
      (default ``modifiedDateTime``), newest first, and keeps the first
      ``rows_per_page``. Refs with the ordering field sort ahead of refs
      that only have ``updated_at``. When no ref carries the field and
      ``fallback_to_random`` is not ``False``, the random sample is used.

    The random fallback used to call ``random.sample`` without checking
    the limit against the population size, which raises ``ValueError``
    whenever fewer refs than ``rows_per_page`` were discovered. Both
    random paths now share one guarded helper.
    """
    sampling = self._sampling()
    if sampling.strategy == SamplingStrategy.ALL:
        return refs

    limit = int(sampling.rows_per_page or 100)

    def pick_random() -> list[PowerBIAssetRef]:
        # random.sample raises ValueError when k exceeds the population.
        if limit >= len(refs):
            return refs
        generator = random.Random(0)
        sampled_indexes = sorted(generator.sample(range(len(refs)), k=limit))
        return [refs[index] for index in sampled_indexes]

    if sampling.strategy == SamplingStrategy.RANDOM:
        return pick_random()

    order_field = sampling.order_by_column or "modifiedDateTime"
    values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
    has_order_values = any(value is not None for value in values)

    if not has_order_values and sampling.fallback_to_random is not False:
        return pick_random()

    scored: list[tuple[bool, datetime, PowerBIAssetRef]] = [
        (parsed is not None, parsed or ref.updated_at, ref)
        for ref, parsed in zip(refs, values, strict=False)
    ]
    # The sort key leaves the ref itself out so refs are never compared.
    scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return [item[2] for item in scored[:limit]]
|
|
589
|
+
|
|
590
|
+
def _asset_from_ref(
    self,
    ref: PowerBIAssetRef,
    *,
    links: list[str],
) -> SingleAssetScanResults:
    """Convert a discovered reference into a ``SingleAssetScanResults`` record.

    Args:
        ref: The discovered asset.
        links: Hashes of related assets to attach to the record.

    Returns:
        A record whose checksum covers the kind, workspace, asset id, name
        and metadata of the reference.
    """
    record_hash = self.generate_hash_id(ref.raw_id)
    record_checksum = self.calculate_checksum(
        {
            "kind": ref.kind,
            "workspace_id": ref.workspace_id,
            "workspace_name": ref.workspace_name,
            "asset_id": ref.asset_id,
            "name": ref.name,
            "metadata": ref.metadata,
        }
    )
    # The workspace listing URL is the fallback location.
    location = self.ensure_location(
        ref.external_url,
        fallback=self._workspace_external_url(ref.workspace_id),
    )
    display_name = f"{ref.workspace_name} / {ref.kind} / {ref.name}"

    return SingleAssetScanResults(
        hash=record_hash,
        checksum=record_checksum,
        name=display_name,
        external_url=location,
        links=links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=ref.created_at,
        updated_at=ref.updated_at,
        runner_id=self.runner_id,
    )
|
|
621
|
+
|
|
622
|
+
def test_connection(self) -> dict[str, Any]:
    """Probe the API by listing workspaces and report the outcome.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and a human-readable ``message``.
        Exceptions are reported in the message instead of being raised.
    """
    logger.info("Testing connection to PowerBI...")
    outcome = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        reachable = self._list_workspaces()
        if self._is_service_principal_mode():
            auth_mode = "SERVICE_PRINCIPAL"
        else:
            auth_mode = "ACCESS_TOKEN"
        status = "SUCCESS"
        message = (
            f"Successfully connected to PowerBI using {auth_mode}. "
            f"Reachable workspaces: {len(reachable)}."
        )
    except Exception as exc:
        status = "FAILURE"
        message = f"Failed to connect to PowerBI: {exc}"

    outcome["status"] = status
    outcome["message"] = message
    return outcome
|
|
642
|
+
|
|
643
|
+
STREAM_DETECTIONS = True
|
|
644
|
+
|
|
645
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Discover, sample and yield assets in batches of ``BATCH_SIZE``.

    Every yielded asset is also registered in ``_asset_lookup`` under its
    hash. Links are limited to assets that survived sampling. If the
    source is aborted the generator ends immediately and a partially
    filled batch is not yielded.
    """
    if self._aborted:
        return

    selected = self._sample_refs(self._discover_assets())
    hash_by_raw: dict[str, str] = {}
    for candidate in selected:
        hash_by_raw[candidate.raw_id] = self.generate_hash_id(candidate.raw_id)

    pending: list[SingleAssetScanResults] = []
    for candidate in selected:
        if self._aborted:
            return

        self._asset_lookup[hash_by_raw[candidate.raw_id]] = candidate

        # Links to assets outside the sample have no hash and are dropped.
        link_hashes = []
        for linked in candidate.linked_raw_ids:
            if linked in hash_by_raw:
                link_hashes.append(hash_by_raw[linked])

        pending.append(self._asset_from_ref(candidate, links=link_hashes))

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            # Start a fresh list so the consumer keeps the yielded batch.
            pending = []

    if pending:
        yield pending
|
|
674
|
+
|
|
675
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Hash ``asset_id`` together with the configured source type value."""
    type_prefix = self._asset_type_value()
    return hash_id(type_prefix, asset_id)
|
|
677
|
+
|
|
678
|
+
def _format_asset_content(self, ref: PowerBIAssetRef) -> tuple[str, str]:
|
|
679
|
+
sampling = self._sampling()
|
|
680
|
+
lines: list[str] = [
|
|
681
|
+
f"workspace={ref.workspace_name}",
|
|
682
|
+
f"workspace_id={ref.workspace_id}",
|
|
683
|
+
f"kind={ref.kind}",
|
|
684
|
+
f"name={ref.name}",
|
|
685
|
+
f"sampling_strategy={sampling.strategy}",
|
|
686
|
+
"",
|
|
687
|
+
]
|
|
688
|
+
|
|
689
|
+
if ref.kind == "dataset":
|
|
690
|
+
tables = ref.metadata.get("tables")
|
|
691
|
+
if isinstance(tables, list) and tables:
|
|
692
|
+
lines.append(f"dataset_tables={len(tables)}")
|
|
693
|
+
for table in tables[:20]:
|
|
694
|
+
if not isinstance(table, dict):
|
|
695
|
+
continue
|
|
696
|
+
table_name = str(table.get("name") or "")
|
|
697
|
+
columns = table.get("columns", [])
|
|
698
|
+
column_names = [
|
|
699
|
+
str(column.get("name"))
|
|
700
|
+
for column in columns
|
|
701
|
+
if isinstance(column, dict) and column.get("name")
|
|
702
|
+
]
|
|
703
|
+
rendered_columns = ", ".join(column_names[:20])
|
|
704
|
+
lines.append(f"table={table_name}; columns={rendered_columns}")
|
|
705
|
+
|
|
706
|
+
if ref.kind == "report":
|
|
707
|
+
dataset_id = ref.metadata.get("report", {}).get("datasetId")
|
|
708
|
+
if dataset_id:
|
|
709
|
+
lines.append(f"dataset_id={dataset_id}")
|
|
710
|
+
|
|
711
|
+
text_content = "\n".join(lines)
|
|
712
|
+
raw_content = json.dumps(
|
|
713
|
+
{
|
|
714
|
+
"kind": ref.kind,
|
|
715
|
+
"workspace_id": ref.workspace_id,
|
|
716
|
+
"workspace_name": ref.workspace_name,
|
|
717
|
+
"asset_id": ref.asset_id,
|
|
718
|
+
"name": ref.name,
|
|
719
|
+
"metadata": ref.metadata,
|
|
720
|
+
},
|
|
721
|
+
ensure_ascii=False,
|
|
722
|
+
default=str,
|
|
723
|
+
)
|
|
724
|
+
return raw_content, text_content
|
|
725
|
+
|
|
726
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, caching the result.

    The asset is looked up by hash first. If that misses, ``asset_id`` is
    un-hashed (falling back to the value itself when un-hashing fails),
    a leading ``POWERBI_#_`` is removed, and the result is matched against
    the raw ids of known assets. Returns ``None`` for an unknown asset.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    ref = self._asset_lookup.get(asset_id)
    if ref is None:
        try:
            candidate = unhash_id(asset_id)
        except Exception:
            candidate = asset_id

        candidate = candidate.removeprefix("POWERBI_#_")
        ref = next(
            (known for known in self._asset_lookup.values() if known.raw_id == candidate),
            None,
        )

    if ref is None:
        return None

    rendered = self._format_asset_content(ref)
    self._content_cache[asset_id] = rendered
    return rendered
|
|
752
|
+
|
|
753
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Attach a ``workspace/kind/name`` location path to a finding.

    The finding is left untouched when the asset hash is not known to
    this source. ``text_content`` is accepted but not used.
    """
    _ = text_content
    ref = self._asset_lookup.get(asset.hash)
    if ref:
        location_path = f"{ref.workspace_name}/{ref.kind}/{ref.name}"
        finding.location = Location(path=location_path)
|
|
765
|
+
|
|
766
|
+
def abort(self) -> None:
    """Log the abort request and delegate to the base class ``abort``."""
    logger.info("Aborting PowerBI extraction...")
    super().abort()
|
|
769
|
+
|
|
770
|
+
def cleanup(self) -> None:
    """Close the HTTP session; failures are logged at debug level and ignored."""
    try:
        self.session.close()
    except Exception:
        # Best-effort teardown: a failing close must not propagate.
        logger.debug("Failed to close PowerBI session cleanly", exc_info=True)
|