classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,774 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import random
6
+ from collections.abc import AsyncGenerator
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, datetime, timedelta
9
+ from typing import Any
10
+
11
+ import requests
12
+
13
+ from ...models.generated_input import (
14
+ PowerBIInput,
15
+ PowerBIMaskedAccessToken,
16
+ PowerBIMaskedClientSecret,
17
+ PowerBIOptionalConnection,
18
+ PowerBIOptionalExtraction,
19
+ PowerBIOptionalScope,
20
+ PowerBIRequiredAccessToken,
21
+ PowerBIRequiredServicePrincipal,
22
+ SamplingConfig,
23
+ SamplingStrategy,
24
+ )
25
+ from ...models.generated_single_asset_scan_results import (
26
+ AssetType as OutputAssetType,
27
+ )
28
+ from ...models.generated_single_asset_scan_results import (
29
+ DetectionResult,
30
+ Location,
31
+ SingleAssetScanResults,
32
+ )
33
+ from ...utils.hashing import hash_id, unhash_id
34
+ from ..base import BaseSource
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ @dataclass(frozen=True)
40
+ class PowerBIAssetRef:
41
+ raw_id: str
42
+ kind: str
43
+ workspace_id: str
44
+ workspace_name: str
45
+ asset_id: str
46
+ name: str
47
+ external_url: str
48
+ metadata: dict[str, Any]
49
+ linked_raw_ids: list[str]
50
+ created_at: datetime
51
+ updated_at: datetime
52
+
53
+
54
+ class PowerBISource(BaseSource):
55
+ source_type = "powerbi"
56
+
57
+ API_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
58
+ DEFAULT_AUTHORITY_URL = "https://login.microsoftonline.com"
59
+ DEFAULT_API_BASE_URL = "https://api.powerbi.com/v1.0/myorg"
60
+
61
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and prepare the HTTP session and caches."""
    super().__init__(recipe, source_id, runner_id)
    self.config = PowerBIInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # One HTTP session is shared by the token request and every API call.
    self.session = requests.Session()

    # Cached service-principal token and the moment it stops being usable.
    self._access_token: str | None = None
    self._access_token_expiry: datetime | None = None

    # Asset hash -> discovered reference, and asset id -> rendered content.
    self._asset_lookup: dict[str, PowerBIAssetRef] = {}
    self._content_cache: dict[str, tuple[str, str]] = {}
77
+
78
+ def _asset_type_value(self) -> str:
79
+ type_value = self.config.type
80
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
81
+
82
+ def _sampling(self) -> SamplingConfig:
83
+ return self.config.sampling
84
+
85
+ def _connection_options(self) -> PowerBIOptionalConnection:
86
+ if self.config.optional and self.config.optional.connection:
87
+ return self.config.optional.connection
88
+ return PowerBIOptionalConnection()
89
+
90
+ def _scope_options(self) -> PowerBIOptionalScope:
91
+ if self.config.optional and self.config.optional.scope:
92
+ return self.config.optional.scope
93
+ return PowerBIOptionalScope()
94
+
95
+ def _extraction_options(self) -> PowerBIOptionalExtraction:
96
+ if self.config.optional and self.config.optional.extraction:
97
+ return self.config.optional.extraction
98
+ return PowerBIOptionalExtraction()
99
+
100
def _is_service_principal_mode(self) -> bool:
    """Tell whether the recipe's required section is the service-principal variant."""
    required = self.config.required
    return isinstance(required, PowerBIRequiredServicePrincipal)
102
+
103
def _is_access_token_mode(self) -> bool:
    """Tell whether the recipe's required section is the access-token variant."""
    required = self.config.required
    return isinstance(required, PowerBIRequiredAccessToken)
105
+
106
def _masked_client_secret(self) -> str:
    """Return the client secret from the masked section.

    Raises:
        ValueError: if the masked section is not the client-secret variant.
    """
    masked = self.config.masked
    if isinstance(masked, PowerBIMaskedClientSecret):
        return masked.client_secret
    raise ValueError("POWERBI SERVICE_PRINCIPAL auth requires masked.client_secret")
111
+
112
def _masked_access_token(self) -> str:
    """Return the access token from the masked section.

    Raises:
        ValueError: if the masked section is not the access-token variant.
    """
    masked = self.config.masked
    if isinstance(masked, PowerBIMaskedAccessToken):
        return masked.access_token
    raise ValueError("POWERBI ACCESS_TOKEN auth requires masked.access_token")
117
+
118
+ def _authority_url(self) -> str:
119
+ configured = self._connection_options().authority_url
120
+ base = str(configured) if configured is not None else self.DEFAULT_AUTHORITY_URL
121
+ return base.rstrip("/")
122
+
123
+ def _api_base_url(self) -> str:
124
+ configured = self._connection_options().api_base_url
125
+ base = str(configured) if configured is not None else self.DEFAULT_API_BASE_URL
126
+ return base.rstrip("/")
127
+
128
+ def _timeout_seconds(self) -> int:
129
+ timeout = self._connection_options().timeout_seconds
130
+ return int(timeout or 30)
131
+
132
def _token_endpoint(self) -> str:
    """Build the OAuth2 v2.0 token URL for the configured tenant.

    Raises:
        ValueError: if the recipe is not in service-principal mode.
    """
    required = self.config.required
    if isinstance(required, PowerBIRequiredServicePrincipal):
        authority = self._authority_url()
        return f"{authority}/{required.tenant_id}/oauth2/v2.0/token"
    raise ValueError("Token endpoint is available only for SERVICE_PRINCIPAL mode")
137
+
138
+ def _normalize_bearer_token(self, token: str) -> str:
139
+ cleaned = token.strip()
140
+ if cleaned.lower().startswith("bearer "):
141
+ return cleaned
142
+ return f"Bearer {cleaned}"
143
+
144
+ def _is_access_token_expired(self) -> bool:
145
+ if self._access_token_expiry is None:
146
+ return True
147
+ return self._access_token_expiry <= datetime.now(UTC)
148
+
149
def _acquire_service_principal_token(self) -> str:
    """Request a client-credentials token and record when it stops being usable.

    Returns:
        The access token normalised to carry a ``Bearer `` prefix.

    Raises:
        ValueError: if the recipe is not in service-principal mode or the
            response carries no usable ``access_token``.
        requests.HTTPError: propagated from ``raise_for_status`` on a non-2xx reply.
    """
    required = self.config.required
    if not isinstance(required, PowerBIRequiredServicePrincipal):
        raise ValueError("SERVICE_PRINCIPAL auth mode is required")

    form = {
        "grant_type": "client_credentials",
        "client_id": required.client_id,
        "client_secret": self._masked_client_secret(),
        "scope": self.API_SCOPE,
    }
    response = self.session.post(
        self._token_endpoint(),
        data=form,
        timeout=self._timeout_seconds(),
    )
    response.raise_for_status()

    token_body = response.json()
    access_token = token_body.get("access_token")
    if not (isinstance(access_token, str) and access_token.strip()):
        raise ValueError("PowerBI token response did not include access_token")

    # Treat the token as expired 300 seconds early so it is refreshed before
    # the reported lifetime runs out.
    lifetime = int(token_body.get("expires_in", 3600))
    usable_seconds = max(lifetime - 300, 0)
    self._access_token_expiry = datetime.now(UTC) + timedelta(seconds=usable_seconds)

    return self._normalize_bearer_token(access_token)
179
+
180
+ def _access_token_value(self) -> str:
181
+ if self._is_access_token_mode():
182
+ return self._normalize_bearer_token(self._masked_access_token())
183
+
184
+ if self._access_token and not self._is_access_token_expired():
185
+ return self._access_token
186
+
187
+ self._access_token = self._acquire_service_principal_token()
188
+ return self._access_token
189
+
190
+ def _request_json(
191
+ self,
192
+ method: str,
193
+ path_or_url: str,
194
+ *,
195
+ params: dict[str, Any] | None = None,
196
+ json_payload: dict[str, Any] | None = None,
197
+ ) -> dict[str, Any]:
198
+ url = (
199
+ path_or_url
200
+ if path_or_url.startswith("http://") or path_or_url.startswith("https://")
201
+ else f"{self._api_base_url()}/{path_or_url.lstrip('/')}"
202
+ )
203
+
204
+ headers = {
205
+ "Authorization": self._access_token_value(),
206
+ "Accept": "application/json",
207
+ }
208
+
209
+ try:
210
+ response = self.session.request(
211
+ method,
212
+ url,
213
+ headers=headers,
214
+ params=params,
215
+ json=json_payload,
216
+ timeout=self._timeout_seconds(),
217
+ )
218
+ response.raise_for_status()
219
+ except requests.RequestException as exc:
220
+ raise RuntimeError(f"PowerBI request failed for {url}: {exc}") from exc
221
+
222
+ if response.status_code == 204 or not response.text.strip():
223
+ return {}
224
+
225
+ try:
226
+ return response.json()
227
+ except ValueError as exc:
228
+ raise RuntimeError(f"PowerBI returned invalid JSON for {url}") from exc
229
+
230
+ def _paged_values(
231
+ self,
232
+ path: str,
233
+ *,
234
+ params: dict[str, Any] | None = None,
235
+ ) -> list[dict[str, Any]]:
236
+ collected: list[dict[str, Any]] = []
237
+
238
+ next_url: str | None = path
239
+ next_params = params
240
+ while next_url:
241
+ payload = self._request_json("get", next_url, params=next_params)
242
+ values = payload.get("value", [])
243
+ if isinstance(values, list):
244
+ for item in values:
245
+ if isinstance(item, dict):
246
+ collected.append(item)
247
+
248
+ potential_next = payload.get("@odata.nextLink")
249
+ next_url = potential_next if isinstance(potential_next, str) else None
250
+ next_params = None
251
+
252
+ return collected
253
+
254
+ def _parse_datetime(self, value: Any) -> datetime | None:
255
+ if isinstance(value, datetime):
256
+ return value if value.tzinfo else value.replace(tzinfo=UTC)
257
+
258
+ if isinstance(value, str):
259
+ cleaned = value.strip()
260
+ if not cleaned:
261
+ return None
262
+
263
+ normalized = cleaned.replace("Z", "+00:00")
264
+ try:
265
+ parsed = datetime.fromisoformat(normalized)
266
+ except ValueError:
267
+ return None
268
+ return parsed if parsed.tzinfo else parsed.replace(tzinfo=UTC)
269
+
270
+ return None
271
+
272
+ def _workspace_allowlist_ids(self) -> set[str]:
273
+ configured = self._scope_options().workspace_ids or []
274
+ return {item.strip() for item in configured if item and item.strip()}
275
+
276
+ def _workspace_allowlist_names(self) -> set[str]:
277
+ configured = self._scope_options().workspace_names or []
278
+ return {item.strip().lower() for item in configured if item and item.strip()}
279
+
280
+ def _workspace_allowed(self, workspace: dict[str, Any]) -> bool:
281
+ workspace_id = str(workspace.get("id") or "").strip()
282
+ workspace_name = str(workspace.get("name") or "").strip()
283
+ workspace_type = str(workspace.get("type") or "").strip().lower()
284
+
285
+ allow_ids = self._workspace_allowlist_ids()
286
+ allow_names = self._workspace_allowlist_names()
287
+
288
+ if allow_ids and workspace_id not in allow_ids:
289
+ return False
290
+ if allow_names and workspace_name.lower() not in allow_names:
291
+ return False
292
+
293
+ include_personal = bool(self._scope_options().include_personal_workspaces)
294
+ if not include_personal:
295
+ if workspace_type in {"personal", "personalgroup"}:
296
+ return False
297
+ if workspace_name.lower() in {"my workspace"}:
298
+ return False
299
+
300
+ return bool(workspace_id)
301
+
302
+ def _coerce_url(self, value: Any, fallback: str) -> str:
303
+ candidate = str(value or "").strip()
304
+ return candidate if candidate else fallback
305
+
306
+ def _workspace_raw_id(self, workspace_id: str) -> str:
307
+ return f"{workspace_id}_#_workspace"
308
+
309
+ def _dataset_raw_id(self, workspace_id: str, dataset_id: str) -> str:
310
+ return f"{workspace_id}_#_dataset_#_{dataset_id}"
311
+
312
+ def _report_raw_id(self, workspace_id: str, report_id: str) -> str:
313
+ return f"{workspace_id}_#_report_#_{report_id}"
314
+
315
+ def _dashboard_raw_id(self, workspace_id: str, dashboard_id: str) -> str:
316
+ return f"{workspace_id}_#_dashboard_#_{dashboard_id}"
317
+
318
+ def _workspace_external_url(self, workspace_id: str) -> str:
319
+ return f"https://app.powerbi.com/groups/{workspace_id}/list"
320
+
321
+ def _dataset_external_url(self, workspace_id: str, dataset_id: str) -> str:
322
+ return f"https://app.powerbi.com/groups/{workspace_id}/datasets/{dataset_id}/details"
323
+
324
+ def _report_external_url(self, workspace_id: str, report_id: str) -> str:
325
+ return f"https://app.powerbi.com/groups/{workspace_id}/reports/{report_id}"
326
+
327
+ def _dashboard_external_url(self, workspace_id: str, dashboard_id: str) -> str:
328
+ return f"https://app.powerbi.com/groups/{workspace_id}/dashboards/{dashboard_id}"
329
+
330
+ def _list_workspaces(self) -> list[dict[str, Any]]:
331
+ workspaces = self._paged_values("groups", params={"$top": 5000})
332
+ return [workspace for workspace in workspaces if self._workspace_allowed(workspace)]
333
+
334
+ def _list_datasets(self, workspace_id: str) -> list[dict[str, Any]]:
335
+ return self._paged_values(f"groups/{workspace_id}/datasets", params={"$top": 5000})
336
+
337
+ def _list_reports(self, workspace_id: str) -> list[dict[str, Any]]:
338
+ return self._paged_values(f"groups/{workspace_id}/reports", params={"$top": 5000})
339
+
340
+ def _list_dashboards(self, workspace_id: str) -> list[dict[str, Any]]:
341
+ return self._paged_values(f"groups/{workspace_id}/dashboards", params={"$top": 5000})
342
+
343
+ def _list_dataset_tables(
344
+ self,
345
+ workspace_id: str,
346
+ dataset_id: str,
347
+ ) -> list[dict[str, Any]]:
348
+ try:
349
+ return self._paged_values(
350
+ f"groups/{workspace_id}/datasets/{dataset_id}/tables",
351
+ params={"$top": 5000},
352
+ )
353
+ except Exception as exc:
354
+ logger.debug(
355
+ "Failed to list tables for dataset %s in workspace %s: %s",
356
+ dataset_id,
357
+ workspace_id,
358
+ exc,
359
+ )
360
+ return []
361
+
362
def _to_asset_ref(
    self,
    *,
    raw_id: str,
    kind: str,
    workspace_id: str,
    workspace_name: str,
    asset_id: str,
    name: str,
    external_url: str,
    metadata: dict[str, Any],
    linked_raw_ids: list[str] | None = None,
) -> PowerBIAssetRef:
    """Assemble a ``PowerBIAssetRef`` and resolve its timestamps.

    Timestamps are looked up first at the top level of ``metadata`` and then
    in the payload nested under the ``kind`` key (for example
    ``metadata["dataset"]``), because discovery stores the API payload there.
    Reading only the top level never found them, so every asset was stamped
    with the current time.

    Args:
        raw_id: Composite identifier of the asset.
        kind: Asset kind; also the metadata key holding the API payload.
        workspace_id: Id of the owning workspace.
        workspace_name: Display name of the owning workspace.
        asset_id: Id of the asset itself.
        name: Display name of the asset.
        external_url: Browser URL of the asset.
        metadata: Collected metadata, stored on the reference unchanged.
        linked_raw_ids: raw_id values of linked assets; copied into a new list.

    Returns:
        The reference. ``created_at`` falls back to the current UTC time and
        ``updated_at`` falls back to ``created_at``.
    """
    payloads: list[dict[str, Any]] = [metadata]
    nested = metadata.get(kind)
    if isinstance(nested, dict):
        payloads.append(nested)

    def first_timestamp(*field_names: str) -> datetime | None:
        # Top-level metadata wins over the nested payload; within one payload
        # the field names are tried in the order given.
        for payload in payloads:
            for field_name in field_names:
                parsed = self._parse_datetime(payload.get(field_name))
                if parsed is not None:
                    return parsed
        return None

    # "createdDate" is the creation field on dataset payloads.
    created_at = first_timestamp("createdDateTime", "createdDate") or datetime.now(UTC)
    updated_at = first_timestamp("modifiedDateTime", "lastUpdate") or created_at

    return PowerBIAssetRef(
        raw_id=raw_id,
        kind=kind,
        workspace_id=workspace_id,
        workspace_name=workspace_name,
        asset_id=asset_id,
        name=name,
        external_url=external_url,
        metadata=metadata,
        linked_raw_ids=list(linked_raw_ids or []),
        created_at=created_at,
        updated_at=updated_at,
    )
395
+
396
+ def _discover_assets(self) -> list[PowerBIAssetRef]:
397
+ extraction = self._extraction_options()
398
+ refs: list[PowerBIAssetRef] = []
399
+
400
+ for workspace in self._list_workspaces():
401
+ if self._aborted:
402
+ break
403
+
404
+ workspace_id = str(workspace.get("id") or "").strip()
405
+ workspace_name = str(workspace.get("name") or workspace_id)
406
+ if not workspace_id:
407
+ continue
408
+
409
+ workspace_raw_id = self._workspace_raw_id(workspace_id)
410
+ if extraction.extract_workspaces_to_containers is not False:
411
+ refs.append(
412
+ self._to_asset_ref(
413
+ raw_id=workspace_raw_id,
414
+ kind="workspace",
415
+ workspace_id=workspace_id,
416
+ workspace_name=workspace_name,
417
+ asset_id=workspace_id,
418
+ name=workspace_name,
419
+ external_url=self._workspace_external_url(workspace_id),
420
+ metadata={
421
+ "workspace": workspace,
422
+ "extract_workspaces_to_containers": bool(
423
+ extraction.extract_workspaces_to_containers
424
+ ),
425
+ },
426
+ )
427
+ )
428
+
429
+ datasets = self._list_datasets(workspace_id)
430
+ dataset_raw_by_id: dict[str, str] = {}
431
+ extract_schema = extraction.extract_dataset_schema is not False
432
+
433
+ for dataset in datasets:
434
+ dataset_id = str(dataset.get("id") or "").strip()
435
+ if not dataset_id:
436
+ continue
437
+
438
+ dataset_raw_id = self._dataset_raw_id(workspace_id, dataset_id)
439
+ dataset_raw_by_id[dataset_id] = dataset_raw_id
440
+
441
+ metadata: dict[str, Any] = {
442
+ "workspace": {
443
+ "id": workspace_id,
444
+ "name": workspace_name,
445
+ },
446
+ "dataset": dataset,
447
+ "extract_datasets_to_containers": bool(
448
+ extraction.extract_datasets_to_containers
449
+ ),
450
+ }
451
+ if extract_schema:
452
+ tables = self._list_dataset_tables(workspace_id, dataset_id)
453
+ if tables:
454
+ metadata["tables"] = tables
455
+
456
+ refs.append(
457
+ self._to_asset_ref(
458
+ raw_id=dataset_raw_id,
459
+ kind="dataset",
460
+ workspace_id=workspace_id,
461
+ workspace_name=workspace_name,
462
+ asset_id=dataset_id,
463
+ name=str(dataset.get("name") or dataset_id),
464
+ external_url=self._coerce_url(
465
+ dataset.get("webUrl"),
466
+ self._dataset_external_url(workspace_id, dataset_id),
467
+ ),
468
+ metadata=metadata,
469
+ linked_raw_ids=[workspace_raw_id],
470
+ )
471
+ )
472
+
473
+ if extraction.extract_reports is not False:
474
+ for report in self._list_reports(workspace_id):
475
+ report_id = str(report.get("id") or "").strip()
476
+ if not report_id:
477
+ continue
478
+
479
+ raw_id = self._report_raw_id(workspace_id, report_id)
480
+ linked_raw_ids = [workspace_raw_id]
481
+ dataset_id = str(report.get("datasetId") or "").strip()
482
+ if dataset_id and dataset_id in dataset_raw_by_id:
483
+ linked_raw_ids.append(dataset_raw_by_id[dataset_id])
484
+
485
+ refs.append(
486
+ self._to_asset_ref(
487
+ raw_id=raw_id,
488
+ kind="report",
489
+ workspace_id=workspace_id,
490
+ workspace_name=workspace_name,
491
+ asset_id=report_id,
492
+ name=str(report.get("name") or report_id),
493
+ external_url=self._coerce_url(
494
+ report.get("webUrl"),
495
+ self._report_external_url(workspace_id, report_id),
496
+ ),
497
+ metadata={
498
+ "workspace": {
499
+ "id": workspace_id,
500
+ "name": workspace_name,
501
+ },
502
+ "report": report,
503
+ },
504
+ linked_raw_ids=linked_raw_ids,
505
+ )
506
+ )
507
+
508
+ if extraction.extract_dashboards is not False:
509
+ for dashboard in self._list_dashboards(workspace_id):
510
+ dashboard_id = str(dashboard.get("id") or "").strip()
511
+ if not dashboard_id:
512
+ continue
513
+
514
+ raw_id = self._dashboard_raw_id(workspace_id, dashboard_id)
515
+ refs.append(
516
+ self._to_asset_ref(
517
+ raw_id=raw_id,
518
+ kind="dashboard",
519
+ workspace_id=workspace_id,
520
+ workspace_name=workspace_name,
521
+ asset_id=dashboard_id,
522
+ name=str(
523
+ dashboard.get("displayName")
524
+ or dashboard.get("name")
525
+ or dashboard_id
526
+ ),
527
+ external_url=self._coerce_url(
528
+ dashboard.get("webUrl"),
529
+ self._dashboard_external_url(workspace_id, dashboard_id),
530
+ ),
531
+ metadata={
532
+ "workspace": {
533
+ "id": workspace_id,
534
+ "name": workspace_name,
535
+ },
536
+ "dashboard": dashboard,
537
+ },
538
+ linked_raw_ids=[workspace_raw_id],
539
+ )
540
+ )
541
+
542
+ return refs
543
+
544
+ def _sampling_sort_datetime(self, ref: PowerBIAssetRef, field_name: str) -> datetime | None:
545
+ candidates = [
546
+ ref.metadata.get(field_name),
547
+ ref.metadata.get("report", {}).get(field_name),
548
+ ref.metadata.get("dataset", {}).get(field_name),
549
+ ref.metadata.get("dashboard", {}).get(field_name),
550
+ ref.metadata.get("workspace", {}).get(field_name),
551
+ ]
552
+ for value in candidates:
553
+ parsed = self._parse_datetime(value)
554
+ if parsed is not None:
555
+ return parsed
556
+ return None
557
+
558
def _sample_refs(self, refs: list[PowerBIAssetRef]) -> list[PowerBIAssetRef]:
    """Reduce the discovered references according to the sampling configuration.

    * ``ALL`` returns ``refs`` unchanged.
    * ``RANDOM`` returns a seeded (reproducible) random subset in original order.
    * Any other strategy orders by ``order_by_column`` (default
      ``modifiedDateTime``), newest first, refs with a value before refs
      without. If no ref carries the field and ``fallback_to_random`` is not
      explicitly False, the random subset is used instead.

    The subset size is ``rows_per_page`` (100 when unset). When it is at least
    ``len(refs)`` the random paths return ``refs`` as-is. The fallback path used
    to skip that check and raised ``ValueError`` from ``random.sample`` for
    short or empty lists.
    """
    sampling = self._sampling()
    if sampling.strategy == SamplingStrategy.ALL:
        return refs

    limit = int(sampling.rows_per_page or 100)

    def random_subset() -> list[PowerBIAssetRef]:
        # random.sample rejects k > population, so short-circuit first.
        if limit >= len(refs):
            return refs
        # Fixed seed keeps the sample stable between runs.
        generator = random.Random(0)
        sampled_indexes = sorted(generator.sample(range(len(refs)), k=limit))
        return [refs[index] for index in sampled_indexes]

    if sampling.strategy == SamplingStrategy.RANDOM:
        return random_subset()

    order_field = sampling.order_by_column or "modifiedDateTime"
    values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
    has_order_values = any(value is not None for value in values)

    if not has_order_values and sampling.fallback_to_random is not False:
        return random_subset()

    scored: list[tuple[bool, datetime, PowerBIAssetRef]] = []
    for ref, parsed in zip(refs, values, strict=False):
        # Refs lacking the order field are ranked by their resolved updated_at.
        effective = parsed or ref.updated_at
        scored.append((parsed is not None, effective, ref))

    scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return [item[2] for item in scored[:limit]]
589
+
590
def _asset_from_ref(
    self,
    ref: PowerBIAssetRef,
    *,
    links: list[str],
) -> SingleAssetScanResults:
    """Convert a discovered reference into a ``SingleAssetScanResults`` record.

    The checksum covers the kind, workspace, asset id, name and metadata.
    The timestamps are not part of it.
    """
    asset_hash = self.generate_hash_id(ref.raw_id)

    fingerprint_source = {
        "kind": ref.kind,
        "workspace_id": ref.workspace_id,
        "workspace_name": ref.workspace_name,
        "asset_id": ref.asset_id,
        "name": ref.name,
        "metadata": ref.metadata,
    }
    checksum = self.calculate_checksum(fingerprint_source)

    display_name = f"{ref.workspace_name} / {ref.kind} / {ref.name}"
    location = self.ensure_location(
        ref.external_url,
        fallback=self._workspace_external_url(ref.workspace_id),
    )

    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=checksum,
        name=display_name,
        external_url=location,
        links=links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=ref.created_at,
        updated_at=ref.updated_at,
        runner_id=self.runner_id,
    )
621
+
622
def test_connection(self) -> dict[str, Any]:
    """Probe the API by listing workspaces and report the outcome.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` ("SUCCESS" or
        "FAILURE") and a human-readable ``message``. Exceptions are captured in
        the message rather than raised.
    """
    logger.info("Testing connection to PowerBI...")
    outcome = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        reachable = self._list_workspaces()
        if self._is_service_principal_mode():
            auth_mode = "SERVICE_PRINCIPAL"
        else:
            auth_mode = "ACCESS_TOKEN"
    except Exception as exc:
        outcome["status"] = "FAILURE"
        outcome["message"] = f"Failed to connect to PowerBI: {exc}"
    else:
        outcome["status"] = "SUCCESS"
        outcome["message"] = (
            f"Successfully connected to PowerBI using {auth_mode}. "
            f"Reachable workspaces: {len(reachable)}."
        )

    return outcome
642
+
643
+ STREAM_DETECTIONS = True
644
+
645
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Discover, sample and emit assets in batches of at most ``BATCH_SIZE``.

    Every emitted asset is registered in ``_asset_lookup`` under its hash.
    Links are limited to assets that survived sampling. When the abort flag is
    set mid-run the generator stops without emitting the partial batch.
    """
    if self._aborted:
        return

    sampled = self._sample_refs(self._discover_assets())
    hash_for = {item.raw_id: self.generate_hash_id(item.raw_id) for item in sampled}

    pending: list[SingleAssetScanResults] = []
    for item in sampled:
        if self._aborted:
            return

        item_hash = hash_for[item.raw_id]
        self._asset_lookup[item_hash] = item

        resolved_links = []
        for linked in item.linked_raw_ids:
            if linked in hash_for:
                resolved_links.append(hash_for[linked])

        pending.append(self._asset_from_ref(item, links=resolved_links))

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
674
+
675
def generate_hash_id(self, asset_id: str) -> str:
    """Hash an asset id together with the configured source type value."""
    type_value = self._asset_type_value()
    return hash_id(type_value, asset_id)
677
+
678
+ def _format_asset_content(self, ref: PowerBIAssetRef) -> tuple[str, str]:
679
+ sampling = self._sampling()
680
+ lines: list[str] = [
681
+ f"workspace={ref.workspace_name}",
682
+ f"workspace_id={ref.workspace_id}",
683
+ f"kind={ref.kind}",
684
+ f"name={ref.name}",
685
+ f"sampling_strategy={sampling.strategy}",
686
+ "",
687
+ ]
688
+
689
+ if ref.kind == "dataset":
690
+ tables = ref.metadata.get("tables")
691
+ if isinstance(tables, list) and tables:
692
+ lines.append(f"dataset_tables={len(tables)}")
693
+ for table in tables[:20]:
694
+ if not isinstance(table, dict):
695
+ continue
696
+ table_name = str(table.get("name") or "")
697
+ columns = table.get("columns", [])
698
+ column_names = [
699
+ str(column.get("name"))
700
+ for column in columns
701
+ if isinstance(column, dict) and column.get("name")
702
+ ]
703
+ rendered_columns = ", ".join(column_names[:20])
704
+ lines.append(f"table={table_name}; columns={rendered_columns}")
705
+
706
+ if ref.kind == "report":
707
+ dataset_id = ref.metadata.get("report", {}).get("datasetId")
708
+ if dataset_id:
709
+ lines.append(f"dataset_id={dataset_id}")
710
+
711
+ text_content = "\n".join(lines)
712
+ raw_content = json.dumps(
713
+ {
714
+ "kind": ref.kind,
715
+ "workspace_id": ref.workspace_id,
716
+ "workspace_name": ref.workspace_name,
717
+ "asset_id": ref.asset_id,
718
+ "name": ref.name,
719
+ "metadata": ref.metadata,
720
+ },
721
+ ensure_ascii=False,
722
+ default=str,
723
+ )
724
+ return raw_content, text_content
725
+
726
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, or None if unknown.

    The id is tried as a cache key, then as a key of ``_asset_lookup``, and
    finally decoded (falling back to the id itself) and matched against the
    raw ids of the known references.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    ref = self._asset_lookup.get(asset_id)
    if ref is None:
        try:
            raw_id = unhash_id(asset_id)
        except Exception:
            # Not decodable: treat the id itself as the raw id.
            raw_id = asset_id

        raw_id = raw_id.removeprefix("POWERBI_#_")
        ref = next(
            (known for known in self._asset_lookup.values() if known.raw_id == raw_id),
            None,
        )

    if ref is None:
        return None

    rendered = self._format_asset_content(ref)
    self._content_cache[asset_id] = rendered
    return rendered
752
+
753
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set the finding's location to ``workspace/kind/name`` of the matching asset.

    Nothing happens when the asset hash is not in ``_asset_lookup``.
    ``text_content`` is accepted for interface compatibility and not used.
    """
    del text_content
    ref = self._asset_lookup.get(asset.hash)
    if ref:
        location_path = "{}/{}/{}".format(ref.workspace_name, ref.kind, ref.name)
        finding.location = Location(path=location_path)
765
+
766
def abort(self) -> None:
    """Log the abort request, then delegate to the base implementation."""
    logger.info("Aborting PowerBI extraction...")
    super().abort()
769
+
770
def cleanup(self) -> None:
    """Close the HTTP session; a failure is logged at debug level, never raised."""
    try:
        self.session.close()
    except Exception:
        # Best effort: cleanup must not mask the outcome of the run.
        logger.debug("Failed to close PowerBI session cleanly", exc_info=True)