classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,799 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import random
6
+ from collections.abc import AsyncGenerator
7
+ from contextlib import contextmanager
8
+ from dataclasses import dataclass
9
+ from datetime import UTC, datetime
10
+ from typing import Any
11
+ from urllib.parse import urljoin
12
+
13
+ from requests.adapters import HTTPAdapter
14
+ from urllib3 import Retry
15
+
16
+ from ...models.generated_input import (
17
+ SamplingConfig,
18
+ SamplingStrategy,
19
+ TableauInput,
20
+ TableauMaskedPersonalAccessToken,
21
+ TableauMaskedUsernamePassword,
22
+ TableauOptionalConnection,
23
+ TableauOptionalExtraction,
24
+ TableauOptionalScope,
25
+ TableauRequiredPersonalAccessToken,
26
+ TableauRequiredUsernamePassword,
27
+ )
28
+ from ...models.generated_single_asset_scan_results import (
29
+ AssetType as OutputAssetType,
30
+ )
31
+ from ...models.generated_single_asset_scan_results import (
32
+ DetectionResult,
33
+ Location,
34
+ SingleAssetScanResults,
35
+ )
36
+ from ...utils.hashing import hash_id, unhash_id
37
+ from ..base import BaseSource
38
+ from ..dependencies import require_module
39
+
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# HTTP status codes handed to urllib3's ``Retry(status_forcelist=...)`` when the
# Tableau session adapter is built: request timeout, rate limiting and the
# common 5xx gateway/server failures.
_RETRIABLE_STATUS_CODES = [
    408,
    429,
    500,
    502,
    503,
    504,
]
43
+
44
+
@dataclass(frozen=True)
class TableauAssetRef:
    """Immutable description of one discovered Tableau asset.

    Instances are created during discovery and kept in the source's lookup
    table so content and finding locations can be produced later without
    another round-trip to the server.
    """

    # Identity -----------------------------------------------------------
    # Connector-level identifier ("<site>_#_<kind>_#_<asset id>").
    raw_id: str
    # One of "project", "workbook" or "datasource".
    kind: str
    # Display name of the Tableau site the asset belongs to.
    site: str
    # Name of the owning project, when known.
    project_name: str | None
    # Identifier assigned by Tableau.
    asset_id: str
    # Human-readable asset name.
    name: str

    # Presentation / payload ---------------------------------------------
    # Absolute URL pointing at the asset in the Tableau UI.
    external_url: str
    # Free-form metadata collected during discovery (tags, owner, usage, ...).
    metadata: dict[str, Any]
    # raw_ids of related assets (currently the owning project).
    linked_raw_ids: list[str]

    # Timestamps ----------------------------------------------------------
    created_at: datetime
    updated_at: datetime
58
+
59
+
class TableauSource(BaseSource):
    """Source connector that inventories Tableau projects, workbooks and datasources.

    Discovery signs in through ``tableauserverclient``, lists the three asset
    kinds (honouring the optional project/workbook/datasource allowlists),
    applies the configured sampling strategy and emits one
    ``SingleAssetScanResults`` per asset. The scannable "content" of an asset
    is a text/JSON rendering of its metadata; no workbook or extract data is
    downloaded by this class.
    """

    source_type = "tableau"

    # NOTE(review): read by code outside this file — presumably tells the
    # pipeline to stream detections per batch; confirm against BaseSource.
    STREAM_DETECTIONS = True

    def __init__(
        self,
        recipe: dict[str, Any],
        source_id: str | None = None,
        runner_id: str | None = None,
    ) -> None:
        """Validate the recipe, load the optional Tableau client and set up caches.

        Raises:
            ValueError: when the ``required`` and ``masked`` auth sections of
                the recipe do not describe the same authentication mode.
        """
        super().__init__(recipe, source_id, runner_id)
        self.config = TableauInput.model_validate(recipe)
        self.runner_id = runner_id or "local-run"

        self._tsc = require_module(
            module_name="tableauserverclient",
            source_name="Tableau",
            uv_groups=["tableau"],
            detail="The Tableau connector is optional.",
        )
        # asset hash -> discovered reference, filled by extract_raw().
        self._asset_lookup: dict[str, TableauAssetRef] = {}
        # asset hash -> (raw_content, text_content), filled by fetch_content().
        self._content_cache: dict[str, tuple[str, str]] = {}
        # owner id -> resolved owner metadata, so each user is looked up once.
        self._owner_cache: dict[str, dict[str, Any]] = {}

        self._validate_auth_configuration()

    # ------------------------------------------------------------------
    # Configuration accessors
    # ------------------------------------------------------------------

    def _validate_auth_configuration(self) -> None:
        """Ensure ``required`` and ``masked`` agree on the authentication mode.

        Raises:
            ValueError: on a mismatch or an unsupported ``required`` type.
        """
        required = self.config.required
        masked = self.config.masked

        if isinstance(required, TableauRequiredUsernamePassword):
            if not isinstance(masked, TableauMaskedUsernamePassword):
                raise ValueError(
                    "TABLEAU USERNAME_PASSWORD auth requires masked.username and masked.password"
                )
            return

        if isinstance(required, TableauRequiredPersonalAccessToken):
            if not isinstance(masked, TableauMaskedPersonalAccessToken):
                raise ValueError("TABLEAU PERSONAL_ACCESS_TOKEN auth requires masked.token_value")
            return

        raise ValueError("Unsupported TABLEAU auth configuration")

    def _asset_type_value(self) -> str:
        """Return the recipe's ``type`` as a plain string (enum value if it is an enum)."""
        type_value = self.config.type
        return type_value.value if hasattr(type_value, "value") else str(type_value)

    def _sampling(self) -> SamplingConfig:
        """Return the sampling section of the recipe."""
        return self.config.sampling

    def _connection_options(self) -> TableauOptionalConnection:
        """Return the configured connection options, or defaults when absent."""
        if self.config.optional and self.config.optional.connection:
            return self.config.optional.connection
        return TableauOptionalConnection()

    def _scope_options(self) -> TableauOptionalScope:
        """Return the configured scope options, or defaults when absent."""
        if self.config.optional and self.config.optional.scope:
            return self.config.optional.scope
        return TableauOptionalScope()

    def _extraction_options(self) -> TableauOptionalExtraction:
        """Return the configured extraction options, or defaults when absent."""
        if self.config.optional and self.config.optional.extraction:
            return self.config.optional.extraction
        return TableauOptionalExtraction()

    def _connect_uri(self) -> str:
        """Return the server base URL without a trailing slash."""
        return str(self.config.required.connect_uri).rstrip("/")

    def _site(self) -> str:
        """Return the site identifier exactly as configured (may be empty)."""
        return self.config.required.site

    def _site_for_display(self) -> str:
        """Return the trimmed site identifier, or ``"default"`` when it is blank."""
        site = self._site().strip()
        return site if site else "default"

    def _timeout_seconds(self) -> int:
        """Return the HTTP timeout in seconds (30 when unset or zero)."""
        timeout = self._connection_options().timeout_seconds
        return int(timeout or 30)

    def _auth_mode(self) -> str:
        """Return the configured auth mode as a plain string."""
        mode = self.config.required.auth_mode
        return mode.value if hasattr(mode, "value") else str(mode)

    # ------------------------------------------------------------------
    # Server / session handling
    # ------------------------------------------------------------------

    def _request_options(self, page_number: int) -> Any:
        """Build a ``RequestOptions`` object asking for page ``page_number`` of 100 items."""
        request_options = self._tsc.RequestOptions()
        request_options.page_size = 100
        request_options.page_number = page_number
        # Compatibility across tableauserverclient versions.
        request_options.pagesize = 100
        request_options.pagenumber = page_number
        return request_options

    def _build_auth(self) -> Any:
        """Create the tableauserverclient auth object for the configured mode.

        Raises:
            ValueError: when the recipe's auth sections are inconsistent or
                the auth mode is not supported.
        """
        required = self.config.required
        masked = self.config.masked
        site = self._site()

        if isinstance(required, TableauRequiredUsernamePassword):
            if not isinstance(masked, TableauMaskedUsernamePassword):
                raise ValueError(
                    "TABLEAU USERNAME_PASSWORD auth requires masked.username and masked.password"
                )
            return self._tsc.TableauAuth(
                username=masked.username,
                password=masked.password,
                site_id=site,
            )

        if isinstance(required, TableauRequiredPersonalAccessToken):
            if not isinstance(masked, TableauMaskedPersonalAccessToken):
                raise ValueError("TABLEAU PERSONAL_ACCESS_TOKEN auth requires masked.token_value")
            return self._tsc.PersonalAccessTokenAuth(
                required.token_name,
                masked.token_value,
                site,
            )

        raise ValueError("Unsupported TABLEAU auth configuration")

    def _build_server(self) -> Any:
        """Create a ``Server`` object with TLS, timeout, proxy and retry settings applied."""
        connection_options = self._connection_options()
        ssl_verify = connection_options.ssl_verify
        if ssl_verify is None:
            # An unset option must not silently disable certificate checks.
            verify: Any = True
        elif isinstance(ssl_verify, str):
            # A string is passed through unchanged (requests treats it as a CA bundle path).
            verify = ssl_verify
        else:
            verify = bool(ssl_verify)

        http_options: dict[str, Any] = {
            "verify": verify,
            "timeout": self._timeout_seconds(),
        }

        server = self._tsc.Server(
            self._connect_uri(),
            use_server_version=True,
            http_options=http_options,
        )

        # ``_session`` is a private attribute of the client; only touch it when
        # this client version actually exposes one.
        session = getattr(server, "_session", None)
        if session is not None:
            session.trust_env = bool(connection_options.session_trust_env)

            configured_retries = connection_options.max_retries
            # Only fall back to 3 when unset, so an explicit 0 disables retries.
            retries = 3 if configured_retries is None else int(configured_retries)
            adapter = HTTPAdapter(
                max_retries=Retry(
                    total=retries,
                    backoff_factor=1,
                    status_forcelist=_RETRIABLE_STATUS_CODES,
                )
            )
            session.mount("http://", adapter)
            session.mount("https://", adapter)

        return server

    @contextmanager
    def _signed_in_server(self):
        """Yield a signed-in server and always attempt to sign out afterwards.

        Sign-out failures are logged at debug level and never mask an
        exception raised inside the ``with`` body.
        """
        server = self._build_server()
        auth = self._build_auth()
        server.auth.sign_in(auth)
        try:
            yield server
        finally:
            try:
                server.auth.sign_out()
            except Exception:
                logger.debug("Failed to sign out Tableau session cleanly", exc_info=True)

    @staticmethod
    def _pagination_int(pagination: Any, primary: str, legacy: str, default: int) -> int:
        """Read an integer pagination field under either attribute spelling.

        Falls back to ``default`` when neither attribute exists or the value
        is ``None``.
        """
        value = getattr(pagination, primary, None)
        if value is None:
            value = getattr(pagination, legacy, None)
        return default if value is None else int(value)

    def _paged_items(self, endpoint: Any) -> list[Any]:
        """Return every item exposed by ``endpoint``.

        Uses the client's ``Pager`` when available; if that is missing or
        raises, falls back to requesting pages one by one via ``endpoint.get``.
        """
        pager = getattr(self._tsc, "Pager", None)
        if pager is not None:
            try:
                return list(pager(endpoint))
            except Exception:
                logger.debug("Tableau Pager fallback to manual pagination", exc_info=True)

        items: list[Any] = []
        page_number = 1
        while True:
            response = endpoint.get(self._request_options(page_number))
            if not isinstance(response, tuple) or len(response) != 2:
                # Unpaginated response shape: take a bare list as the full result.
                if isinstance(response, list):
                    items.extend(response)
                break

            page_items, pagination = response
            if not page_items:
                # An empty page means there is nothing further to read; stopping
                # here also guarantees termination on inconsistent pagination data.
                break
            items.extend(page_items)

            total = self._pagination_int(
                pagination, "total_available", "totalAvailable", len(items)
            )
            page_size = self._pagination_int(
                pagination, "page_size", "pagesize", len(page_items) or 1
            )
            current_page = self._pagination_int(
                pagination, "page_number", "pagenumber", page_number
            )
            if page_size <= 0 or current_page * page_size >= total:
                break

            # Always move forward, even if the server echoes a stale page number.
            page_number = max(current_page, page_number) + 1

        return items

    # ------------------------------------------------------------------
    # Scope filtering
    # ------------------------------------------------------------------

    @staticmethod
    def _normalized_names(configured: Any) -> set[str]:
        """Return the trimmed, lower-cased, non-blank entries of ``configured``."""
        return {entry.strip().lower() for entry in configured or [] if entry and entry.strip()}

    @staticmethod
    def _name_allowed(name: str, allowlist: set[str]) -> bool:
        """Return True when ``allowlist`` is empty or contains ``name`` (case-insensitive)."""
        if not allowlist:
            return True
        return name.strip().lower() in allowlist

    def _project_allowlist(self) -> set[str]:
        """Return the normalised project-name allowlist (empty means no filter)."""
        return self._normalized_names(self._scope_options().project_names)

    def _workbook_allowlist(self) -> set[str]:
        """Return the normalised workbook-name allowlist (empty means no filter)."""
        return self._normalized_names(self._scope_options().workbook_names)

    def _datasource_allowlist(self) -> set[str]:
        """Return the normalised datasource-name allowlist (empty means no filter)."""
        return self._normalized_names(self._scope_options().datasource_names)

    def _project_allowed(self, project_name: str) -> bool:
        """Return whether ``project_name`` passes the project allowlist."""
        return self._name_allowed(project_name, self._project_allowlist())

    def _workbook_allowed(self, workbook_name: str) -> bool:
        """Return whether ``workbook_name`` passes the workbook allowlist."""
        return self._name_allowed(workbook_name, self._workbook_allowlist())

    def _datasource_allowed(self, datasource_name: str) -> bool:
        """Return whether ``datasource_name`` passes the datasource allowlist."""
        return self._name_allowed(datasource_name, self._datasource_allowlist())

    # ------------------------------------------------------------------
    # Small value helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _text_attr(item: Any, attribute: str) -> str:
        """Return ``item.<attribute>`` as a stripped string ("" when missing or falsy)."""
        return str(getattr(item, attribute, "") or "").strip()

    def _coerce_external_url(self, value: Any, fallback: str) -> str:
        """Return ``value`` as an absolute URL.

        Blank values yield ``fallback``; values that are not already absolute
        http(s) URLs are resolved against the server base URL.
        """
        candidate = str(value or "").strip()
        if not candidate:
            return fallback
        if candidate.startswith(("http://", "https://")):
            return candidate
        return urljoin(f"{self._connect_uri()}/", candidate)

    def _parse_datetime(self, value: Any) -> datetime | None:
        """Convert a datetime or ISO-8601 string to an aware datetime.

        Naive values are labelled as UTC. Returns ``None`` for blank,
        unparseable or unsupported input.
        """
        if isinstance(value, datetime):
            return value if value.tzinfo else value.replace(tzinfo=UTC)

        if isinstance(value, str):
            cleaned = value.strip()
            if not cleaned:
                return None
            normalized = cleaned.replace("Z", "+00:00")
            try:
                parsed = datetime.fromisoformat(normalized)
            except ValueError:
                return None
            return parsed if parsed.tzinfo else parsed.replace(tzinfo=UTC)

        return None

    def _project_raw_id(self, project_id: str) -> str:
        """Return the connector-level raw id for a project."""
        return f"{self._site_for_display()}_#_project_#_{project_id}"

    def _workbook_raw_id(self, workbook_id: str) -> str:
        """Return the connector-level raw id for a workbook."""
        return f"{self._site_for_display()}_#_workbook_#_{workbook_id}"

    def _datasource_raw_id(self, datasource_id: str) -> str:
        """Return the connector-level raw id for a datasource."""
        return f"{self._site_for_display()}_#_datasource_#_{datasource_id}"

    def _project_fallback_url(self, project_id: str) -> str:
        """Return a UI URL for a project, built from the base URL and site."""
        return f"{self._connect_uri()}/#/site/{self._site_for_display()}/projects/{project_id}"

    def _workbook_fallback_url(self, workbook_id: str) -> str:
        """Return a UI URL for a workbook, built from the base URL and site."""
        return f"{self._connect_uri()}/#/site/{self._site_for_display()}/workbooks/{workbook_id}"

    def _datasource_fallback_url(self, datasource_id: str) -> str:
        """Return a UI URL for a datasource, built from the base URL and site."""
        return (
            f"{self._connect_uri()}/#/site/{self._site_for_display()}/datasources/{datasource_id}"
        )

    # ------------------------------------------------------------------
    # Metadata enrichment
    # ------------------------------------------------------------------

    def _resolve_owner_metadata(self, server: Any, owner_id: str | None) -> dict[str, Any] | None:
        """Look up an owner's name, full name and email, caching the result per owner id.

        Returns ``None`` when ``owner_id`` is empty. When the lookup fails the
        cached (and returned) value contains only the id.
        """
        if not owner_id:
            return None
        if owner_id in self._owner_cache:
            return self._owner_cache[owner_id]

        try:
            user = server.users.get_by_id(owner_id)
        except Exception:
            logger.debug("Unable to resolve Tableau owner for %s", owner_id, exc_info=True)
            # Cache the failure too, so the same user is not requested again.
            self._owner_cache[owner_id] = {"id": owner_id}
            return self._owner_cache[owner_id]

        owner = {
            "id": owner_id,
            "name": self._text_attr(user, "name") or None,
            "full_name": self._text_attr(user, "full_name") or None,
            "email": self._text_attr(user, "email") or None,
        }
        self._owner_cache[owner_id] = owner
        return owner

    def _extract_tags(self, endpoint: Any, item: Any) -> list[str]:
        """Return the sorted, non-blank tag labels of ``item``.

        Returns an empty list when tag ingestion is disabled. Accepts tags
        given either as plain strings or as objects with a ``name`` attribute.
        """
        if not bool(self._extraction_options().ingest_tags):
            return []

        try:
            endpoint.populate_tags(item)
        except Exception:
            # Best effort: fall through and use whatever tags the item already has.
            logger.debug("Unable to populate Tableau tags for %s", item, exc_info=True)

        labels: set[str] = set()
        for tag in getattr(item, "tags", []) or []:
            # NOTE(review): tableauserverclient appears to expose tags as plain
            # strings; the attribute form is kept for other shapes — confirm.
            raw_label = tag if isinstance(tag, str) else getattr(tag, "name", "")
            label = str(raw_label or "").strip()
            if label:
                labels.add(label)
        # Sorted so metadata (and anything derived from it) does not depend on
        # the iteration order of the underlying tag collection.
        return sorted(labels)

    def _to_asset_ref(
        self,
        *,
        raw_id: str,
        kind: str,
        asset_id: str,
        name: str,
        project_name: str | None,
        external_url: str,
        metadata: dict[str, Any],
        linked_raw_ids: list[str] | None = None,
    ) -> TableauAssetRef:
        """Build a ``TableauAssetRef``, deriving timestamps from ``metadata``.

        ``created_at`` falls back to the current UTC time and ``updated_at``
        falls back to ``created_at`` when the metadata carries no usable value.
        """
        created_at = self._parse_datetime(metadata.get("created_at")) or datetime.now(UTC)
        updated_at = self._parse_datetime(metadata.get("updated_at")) or created_at
        return TableauAssetRef(
            raw_id=raw_id,
            kind=kind,
            site=self._site_for_display(),
            project_name=project_name,
            asset_id=asset_id,
            name=name,
            external_url=external_url,
            metadata=metadata,
            linked_raw_ids=list(linked_raw_ids or []),
            created_at=created_at,
            updated_at=updated_at,
        )

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    def _discover_project_refs(
        self,
        server: Any,
        project_raw_by_id: dict[str, str],
        project_name_by_id: dict[str, str],
    ) -> list[TableauAssetRef]:
        """List allowed projects and record their raw ids / names in the given maps."""
        refs: list[TableauAssetRef] = []
        for project in self._paged_items(server.projects):
            if self._aborted:
                break

            project_id = self._text_attr(project, "id")
            project_name = self._text_attr(project, "name")
            if not project_id or not project_name:
                continue
            if not self._project_allowed(project_name):
                continue

            raw_id = self._project_raw_id(project_id)
            project_raw_by_id[project_id] = raw_id
            project_name_by_id[project_id] = project_name
            refs.append(
                self._to_asset_ref(
                    raw_id=raw_id,
                    kind="project",
                    asset_id=project_id,
                    name=project_name,
                    project_name=project_name,
                    external_url=self._project_fallback_url(project_id),
                    metadata={
                        "site": self._site_for_display(),
                        "project_id": project_id,
                        "project_name": project_name,
                    },
                )
            )
        return refs

    def _discover_content_refs(
        self,
        server: Any,
        endpoint: Any,
        *,
        kind: str,
        project_raw_by_id: dict[str, str],
        project_name_by_id: dict[str, str],
    ) -> list[TableauAssetRef]:
        """List allowed workbooks or datasources from ``endpoint``.

        ``kind`` must be ``"workbook"`` or ``"datasource"``; it selects the
        name filter, raw-id scheme and fallback URL used for each item.
        """
        name_allowed, raw_id_for, fallback_url_for = {
            "workbook": (
                self._workbook_allowed,
                self._workbook_raw_id,
                self._workbook_fallback_url,
            ),
            "datasource": (
                self._datasource_allowed,
                self._datasource_raw_id,
                self._datasource_fallback_url,
            ),
        }[kind]
        extraction = self._extraction_options()

        refs: list[TableauAssetRef] = []
        for item in self._paged_items(endpoint):
            if self._aborted:
                break

            item_id = self._text_attr(item, "id")
            item_name = self._text_attr(item, "name")
            if not item_id or not item_name or not name_allowed(item_name):
                continue

            project_id = self._text_attr(item, "project_id")
            project_name = self._text_attr(item, "project_name") or None
            if project_name and not self._project_allowed(project_name):
                continue
            if project_id and not project_name:
                # The item did not carry its project name; reuse the one seen
                # while listing projects, if any.
                project_name = project_name_by_id.get(project_id)

            metadata: dict[str, Any] = {
                "site": self._site_for_display(),
                "project_id": project_id or None,
                "project_name": project_name,
                "created_at": getattr(item, "created_at", None),
                "updated_at": getattr(item, "updated_at", None),
            }
            if extraction.ingest_tags:
                metadata["tags"] = self._extract_tags(endpoint, item)
            if extraction.ingest_owner:
                owner_id = self._text_attr(item, "owner_id") or None
                metadata["owner"] = self._resolve_owner_metadata(server, owner_id)
            if extraction.extract_usage_stats:
                metadata["usage"] = {"total_views": int(getattr(item, "total_views", 0) or 0)}

            linked: list[str] = []
            if project_id and project_id in project_raw_by_id:
                linked.append(project_raw_by_id[project_id])

            refs.append(
                self._to_asset_ref(
                    raw_id=raw_id_for(item_id),
                    kind=kind,
                    asset_id=item_id,
                    name=item_name,
                    project_name=project_name,
                    external_url=self._coerce_external_url(
                        getattr(item, "webpage_url", None),
                        fallback_url_for(item_id),
                    ),
                    metadata=metadata,
                    linked_raw_ids=linked,
                )
            )
        return refs

    def _discover_assets(self) -> list[TableauAssetRef]:
        """Sign in and list all in-scope projects, workbooks and datasources.

        Workbooks and datasources are included unless explicitly disabled in
        the scope options. The result is sorted by kind, lower-cased name and
        asset id.
        """
        refs: list[TableauAssetRef] = []
        scope = self._scope_options()

        with self._signed_in_server() as server:
            project_raw_by_id: dict[str, str] = {}
            project_name_by_id: dict[str, str] = {}
            refs.extend(
                self._discover_project_refs(server, project_raw_by_id, project_name_by_id)
            )

            if scope.include_workbooks is not False:
                refs.extend(
                    self._discover_content_refs(
                        server,
                        server.workbooks,
                        kind="workbook",
                        project_raw_by_id=project_raw_by_id,
                        project_name_by_id=project_name_by_id,
                    )
                )

            if scope.include_datasources is not False:
                refs.extend(
                    self._discover_content_refs(
                        server,
                        server.datasources,
                        kind="datasource",
                        project_raw_by_id=project_raw_by_id,
                        project_name_by_id=project_name_by_id,
                    )
                )

        refs.sort(key=lambda ref: (ref.kind, ref.name.lower(), ref.asset_id))
        return refs

    # ------------------------------------------------------------------
    # Sampling
    # ------------------------------------------------------------------

    def _sampling_sort_datetime(self, ref: TableauAssetRef, field_name: str) -> datetime | None:
        """Return the first parseable timestamp for ordering ``ref``.

        Tries ``field_name`` (as given, lower- and upper-cased) in the asset
        metadata, then ``updated_at`` and ``created_at``.
        """
        candidates = [
            ref.metadata.get(field_name),
            ref.metadata.get(field_name.lower()),
            ref.metadata.get(field_name.upper()),
            ref.metadata.get("updated_at"),
            ref.metadata.get("created_at"),
        ]
        for candidate in candidates:
            parsed = self._parse_datetime(candidate)
            if parsed is not None:
                return parsed
        return None

    @staticmethod
    def _random_subset(refs: list[TableauAssetRef], limit: int) -> list[TableauAssetRef]:
        """Return a reproducible random selection of at most ``limit`` refs.

        The input list itself is returned when it already fits within
        ``limit``. The fixed seed keeps repeated runs on the same subset, and
        the picked items keep their original relative order.
        """
        if limit >= len(refs):
            return refs
        generator = random.Random(0)
        sampled_indexes = sorted(generator.sample(range(len(refs)), k=limit))
        return [refs[index] for index in sampled_indexes]

    def _sample_refs(self, refs: list[TableauAssetRef]) -> list[TableauAssetRef]:
        """Apply the configured sampling strategy to the discovered refs.

        * ``ALL`` returns everything.
        * ``RANDOM`` returns a seeded random subset of ``rows_per_page`` items
          (100 when unset).
        * Any other strategy returns the most recent items by
          ``order_by_column`` (default ``updated_at``). If no asset has a
          usable timestamp and ``fallback_to_random`` is not disabled, a random
          subset is returned instead.
        """
        sampling = self._sampling()
        if sampling.strategy == SamplingStrategy.ALL:
            return refs

        limit = int(sampling.rows_per_page or 100)
        if sampling.strategy == SamplingStrategy.RANDOM:
            return self._random_subset(refs, limit)

        order_field = sampling.order_by_column or "updated_at"
        values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
        has_order_values = any(value is not None for value in values)

        if not has_order_values and sampling.fallback_to_random is not False:
            # Shares the size guard with the RANDOM branch, so a limit larger
            # than the population (or an empty discovery) no longer raises.
            return self._random_subset(refs, limit)

        scored: list[tuple[bool, datetime, TableauAssetRef]] = []
        for ref, parsed in zip(refs, values, strict=False):
            effective = parsed or ref.updated_at
            scored.append((parsed is not None, effective, ref))

        # Assets with a real ordering value come first, newest first.
        scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
        return [item[2] for item in scored[:limit]]

    # ------------------------------------------------------------------
    # Output construction
    # ------------------------------------------------------------------

    def _asset_from_ref(
        self,
        ref: TableauAssetRef,
        *,
        links: list[str],
    ) -> SingleAssetScanResults:
        """Convert ``ref`` into a ``SingleAssetScanResults`` with the given link hashes."""
        asset_hash = self.generate_hash_id(ref.raw_id)
        checksum_payload = {
            "kind": ref.kind,
            "site": ref.site,
            "project_name": ref.project_name,
            "asset_id": ref.asset_id,
            "name": ref.name,
            "metadata": ref.metadata,
        }

        return SingleAssetScanResults(
            hash=asset_hash,
            checksum=self.calculate_checksum(checksum_payload),
            name=f"{ref.site} / {ref.kind} / {ref.name}",
            external_url=self.ensure_location(ref.external_url, fallback=self._connect_uri()),
            links=links,
            asset_type=OutputAssetType.TXT,
            source_id=self.source_id,
            created_at=ref.created_at,
            updated_at=ref.updated_at,
            runner_id=self.runner_id,
        )

    def test_connection(self) -> dict[str, Any]:
        """Sign in, list projects and report the outcome.

        Never raises: any failure is reported as ``status == "FAILURE"`` with
        the exception text in ``message``.
        """
        logger.info("Testing connection to Tableau...")
        result = {
            "timestamp": datetime.now(UTC).isoformat(),
            "source_type": self.recipe.get("type"),
        }

        try:
            with self._signed_in_server() as server:
                projects = self._paged_items(server.projects)
            result["status"] = "SUCCESS"
            result["message"] = (
                f"Successfully connected to Tableau using {self._auth_mode()}. "
                f"Reachable projects: {len(projects)}."
            )
        except Exception as exc:
            result["status"] = "FAILURE"
            result["message"] = f"Failed to connect to Tableau: {exc}"

        return result

    async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
        """Discover, sample and yield assets in batches of ``BATCH_SIZE``.

        Every yielded asset is also registered in the internal lookup so that
        ``fetch_content`` and ``enrich_finding_location`` can resolve it.
        Stops without yielding the pending batch when the run is aborted.
        """
        if self._aborted:
            return

        refs = self._sample_refs(self._discover_assets())
        hash_by_raw = {ref.raw_id: self.generate_hash_id(ref.raw_id) for ref in refs}

        batch: list[SingleAssetScanResults] = []
        for ref in refs:
            if self._aborted:
                return

            asset_hash = hash_by_raw[ref.raw_id]
            self._asset_lookup[asset_hash] = ref

            # Only link to assets that survived sampling.
            linked_hashes = [
                hash_by_raw[linked_raw_id]
                for linked_raw_id in ref.linked_raw_ids
                if linked_raw_id in hash_by_raw
            ]
            batch.append(self._asset_from_ref(ref, links=linked_hashes))

            if len(batch) >= self.BATCH_SIZE:
                yield batch
                batch = []

        if batch:
            yield batch

    def generate_hash_id(self, asset_id: str) -> str:
        """Return the hash id for ``asset_id``, namespaced by the recipe's asset type."""
        return hash_id(self._asset_type_value(), asset_id)

    def _format_asset_content(self, ref: TableauAssetRef) -> tuple[str, str]:
        """Render ``ref`` as ``(raw_content, text_content)``.

        ``raw_content`` is a JSON document of the asset's identity and
        metadata; ``text_content`` is a ``key=value`` listing of the same
        information plus optional tags, owner and view count.
        """
        sampling = self._sampling()
        lines = [
            f"site={ref.site}",
            f"kind={ref.kind}",
            f"name={ref.name}",
            f"project_name={ref.project_name or ''}",
            f"sampling_strategy={sampling.strategy}",
            "",
        ]

        tags = ref.metadata.get("tags")
        if isinstance(tags, list) and tags:
            lines.append(f"tags={', '.join(str(tag) for tag in tags)}")

        owner = ref.metadata.get("owner")
        if isinstance(owner, dict):
            owner_name = owner.get("name") or owner.get("email") or owner.get("id")
            if owner_name:
                lines.append(f"owner={owner_name}")

        usage = ref.metadata.get("usage")
        if isinstance(usage, dict):
            total_views = usage.get("total_views")
            if total_views is not None:
                lines.append(f"total_views={total_views}")

        text_content = "\n".join(lines)
        raw_content = json.dumps(
            {
                "kind": ref.kind,
                "site": ref.site,
                "project_name": ref.project_name,
                "asset_id": ref.asset_id,
                "name": ref.name,
                "metadata": ref.metadata,
            },
            ensure_ascii=False,
            # Metadata may hold datetime objects; render them via str().
            default=str,
        )
        return raw_content, text_content

    async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
        """Return ``(raw_content, text_content)`` for a previously discovered asset.

        ``asset_id`` is normally the asset hash. If it is not a known hash it
        is decoded (or used verbatim when decoding fails) and matched against
        the raw ids of known assets. Returns ``None`` when nothing matches.
        """
        cached = self._content_cache.get(asset_id)
        if cached is not None:
            return cached

        ref = self._asset_lookup.get(asset_id)
        if ref is None:
            try:
                decoded = unhash_id(asset_id)
            except Exception:
                decoded = asset_id

            decoded = decoded.removeprefix("TABLEAU_#_")

            for known_ref in self._asset_lookup.values():
                if known_ref.raw_id == decoded:
                    ref = known_ref
                    break

        if ref is None:
            return None

        content = self._format_asset_content(ref)
        self._content_cache[asset_id] = content
        return content

    def enrich_finding_location(
        self,
        finding: DetectionResult,
        asset: SingleAssetScanResults,
        text_content: str,
    ) -> None:
        """Set ``finding.location`` to ``site/[project/]kind/name`` for a known asset.

        Does nothing when the asset hash is not in the lookup table.
        ``text_content`` is accepted for interface compatibility and unused.
        """
        _ = text_content
        ref = self._asset_lookup.get(asset.hash)
        if not ref:
            return

        project_prefix = f"{ref.project_name}/" if ref.project_name else ""
        finding.location = Location(path=f"{ref.site}/{project_prefix}{ref.kind}/{ref.name}")

    def abort(self) -> None:
        """Log the abort request and delegate to the base class."""
        logger.info("Aborting Tableau extraction...")
        super().abort()

    def cleanup(self) -> None:
        """Drop all cached content, asset references and owner lookups."""
        self._content_cache.clear()
        self._asset_lookup.clear()
        self._owner_cache.clear()