classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,799 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import random
|
|
6
|
+
from collections.abc import AsyncGenerator
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from typing import Any
|
|
11
|
+
from urllib.parse import urljoin
|
|
12
|
+
|
|
13
|
+
from requests.adapters import HTTPAdapter
|
|
14
|
+
from urllib3 import Retry
|
|
15
|
+
|
|
16
|
+
from ...models.generated_input import (
|
|
17
|
+
SamplingConfig,
|
|
18
|
+
SamplingStrategy,
|
|
19
|
+
TableauInput,
|
|
20
|
+
TableauMaskedPersonalAccessToken,
|
|
21
|
+
TableauMaskedUsernamePassword,
|
|
22
|
+
TableauOptionalConnection,
|
|
23
|
+
TableauOptionalExtraction,
|
|
24
|
+
TableauOptionalScope,
|
|
25
|
+
TableauRequiredPersonalAccessToken,
|
|
26
|
+
TableauRequiredUsernamePassword,
|
|
27
|
+
)
|
|
28
|
+
from ...models.generated_single_asset_scan_results import (
|
|
29
|
+
AssetType as OutputAssetType,
|
|
30
|
+
)
|
|
31
|
+
from ...models.generated_single_asset_scan_results import (
|
|
32
|
+
DetectionResult,
|
|
33
|
+
Location,
|
|
34
|
+
SingleAssetScanResults,
|
|
35
|
+
)
|
|
36
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
37
|
+
from ..base import BaseSource
|
|
38
|
+
from ..dependencies import require_module
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
_RETRIABLE_STATUS_CODES = [408, 429, 500, 502, 503, 504]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
class TableauAssetRef:
    """Immutable description of one discovered Tableau asset.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction; the ``metadata`` dict and ``linked_raw_ids`` list are
    still ordinary mutable containers.
    """

    # Source-local identifier, built as "<site>_#_<kind>_#_<asset id>".
    raw_id: str
    # Asset category: "project", "workbook" or "datasource".
    kind: str
    # Display name of the Tableau site the asset was found on.
    site: str
    # Name of the owning project, when it is known.
    project_name: str | None
    # Identifier of the asset as reported by Tableau.
    asset_id: str
    # Human-readable asset name.
    name: str
    # Link to the asset in the Tableau web UI.
    external_url: str
    # Free-form details (site, project, timestamps, tags, owner, usage).
    metadata: dict[str, Any]
    # raw_id values of related assets (the owning project, if discovered).
    linked_raw_ids: list[str]
    # Timezone-aware creation and last-update timestamps.
    created_at: datetime
    updated_at: datetime
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class TableauSource(BaseSource):
|
|
61
|
+
source_type = "tableau"
|
|
62
|
+
|
|
63
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the Tableau client library and set up caches.

    Args:
        recipe: Raw source configuration; validated into a ``TableauInput``.
        source_id: Forwarded to the base class.
        runner_id: Forwarded to the base class; "local-run" is used on this
            instance when no value is given.

    Raises:
        ValueError: If the required and masked auth settings do not match
            (raised by ``_validate_auth_configuration``).
    """
    super().__init__(recipe, source_id, runner_id)

    self.config = TableauInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # tableauserverclient is an optional dependency, resolved at runtime.
    self._tsc = require_module(
        module_name="tableauserverclient",
        source_name="Tableau",
        uv_groups=["tableau"],
        detail="The Tableau connector is optional.",
    )

    # Per-instance caches; all three are emptied again by cleanup().
    self._asset_lookup: dict[str, TableauAssetRef] = {}
    self._content_cache: dict[str, tuple[str, str]] = {}
    self._owner_cache: dict[str, dict[str, Any]] = {}

    self._validate_auth_configuration()
|
|
84
|
+
|
|
85
|
+
def _validate_auth_configuration(self) -> None:
    """Check that the masked credentials match the selected auth mode.

    Raises:
        ValueError: If the masked section has the wrong shape for the
            configured auth mode, or the auth mode is not supported.
    """
    required = self.config.required
    masked = self.config.masked

    if isinstance(required, TableauRequiredUsernamePassword):
        expected_masked = TableauMaskedUsernamePassword
        problem = "TABLEAU USERNAME_PASSWORD auth requires masked.username and masked.password"
    elif isinstance(required, TableauRequiredPersonalAccessToken):
        expected_masked = TableauMaskedPersonalAccessToken
        problem = "TABLEAU PERSONAL_ACCESS_TOKEN auth requires masked.token_value"
    else:
        raise ValueError("Unsupported TABLEAU auth configuration")

    if not isinstance(masked, expected_masked):
        raise ValueError(problem)
|
|
102
|
+
|
|
103
|
+
def _asset_type_value(self) -> str:
|
|
104
|
+
type_value = self.config.type
|
|
105
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
106
|
+
|
|
107
|
+
def _sampling(self) -> SamplingConfig:
|
|
108
|
+
return self.config.sampling
|
|
109
|
+
|
|
110
|
+
def _connection_options(self) -> TableauOptionalConnection:
|
|
111
|
+
if self.config.optional and self.config.optional.connection:
|
|
112
|
+
return self.config.optional.connection
|
|
113
|
+
return TableauOptionalConnection()
|
|
114
|
+
|
|
115
|
+
def _scope_options(self) -> TableauOptionalScope:
|
|
116
|
+
if self.config.optional and self.config.optional.scope:
|
|
117
|
+
return self.config.optional.scope
|
|
118
|
+
return TableauOptionalScope()
|
|
119
|
+
|
|
120
|
+
def _extraction_options(self) -> TableauOptionalExtraction:
|
|
121
|
+
if self.config.optional and self.config.optional.extraction:
|
|
122
|
+
return self.config.optional.extraction
|
|
123
|
+
return TableauOptionalExtraction()
|
|
124
|
+
|
|
125
|
+
def _connect_uri(self) -> str:
|
|
126
|
+
return str(self.config.required.connect_uri).rstrip("/")
|
|
127
|
+
|
|
128
|
+
def _site(self) -> str:
|
|
129
|
+
return self.config.required.site
|
|
130
|
+
|
|
131
|
+
def _site_for_display(self) -> str:
|
|
132
|
+
site = self._site().strip()
|
|
133
|
+
return site if site else "default"
|
|
134
|
+
|
|
135
|
+
def _timeout_seconds(self) -> int:
|
|
136
|
+
timeout = self._connection_options().timeout_seconds
|
|
137
|
+
return int(timeout or 30)
|
|
138
|
+
|
|
139
|
+
def _request_options(self, page_number: int):
|
|
140
|
+
request_options = self._tsc.RequestOptions()
|
|
141
|
+
request_options.page_size = 100
|
|
142
|
+
request_options.page_number = page_number
|
|
143
|
+
# Compatibility across tableauserverclient versions.
|
|
144
|
+
request_options.pagesize = 100
|
|
145
|
+
request_options.pagenumber = page_number
|
|
146
|
+
return request_options
|
|
147
|
+
|
|
148
|
+
def _build_auth(self) -> Any:
    """Create the tableauserverclient credential object for the configured mode.

    Returns:
        A ``TableauAuth`` for username/password auth, or a
        ``PersonalAccessTokenAuth`` for token auth.

    Raises:
        ValueError: If the masked credentials do not match the auth mode,
            or the auth mode is not supported.
    """
    required, masked = self.config.required, self.config.masked
    site = self._site()

    if isinstance(required, TableauRequiredUsernamePassword):
        if isinstance(masked, TableauMaskedUsernamePassword):
            return self._tsc.TableauAuth(
                username=masked.username,
                password=masked.password,
                site_id=site,
            )
        raise ValueError(
            "TABLEAU USERNAME_PASSWORD auth requires masked.username and masked.password"
        )

    if isinstance(required, TableauRequiredPersonalAccessToken):
        if isinstance(masked, TableauMaskedPersonalAccessToken):
            return self._tsc.PersonalAccessTokenAuth(
                required.token_name,
                masked.token_value,
                site,
            )
        raise ValueError("TABLEAU PERSONAL_ACCESS_TOKEN auth requires masked.token_value")

    raise ValueError("Unsupported TABLEAU auth configuration")
|
|
174
|
+
|
|
175
|
+
def _build_server(self) -> Any:
    """Create a tableauserverclient ``Server`` configured for this source.

    The server is created with TLS verification and timeout options taken
    from the connection settings. When the client exposes its underlying
    HTTP session, ``trust_env`` is applied and a retrying adapter is
    mounted for both URL schemes.

    Returns:
        The (not yet signed-in) server object.
    """
    options = self._connection_options()

    ssl_verify = options.ssl_verify
    if ssl_verify is None:
        # An unset option must not silently switch certificate checks off
        # (bool(None) would evaluate to False).
        verify: Any = True
    elif isinstance(ssl_verify, str):
        # Strings are passed through unchanged (e.g. a CA bundle path).
        verify = ssl_verify
    else:
        verify = bool(ssl_verify)

    http_options: dict[str, Any] = {
        "verify": verify,
        "timeout": self._timeout_seconds(),
    }
    server = self._tsc.Server(
        self._connect_uri(),
        use_server_version=True,
        http_options=http_options,
    )

    # ``_session`` is a private attribute of the client; skip all session
    # tuning when it is missing instead of failing on the mount calls.
    session = getattr(server, "_session", None)
    if session is None:
        logger.debug("Tableau server object exposes no HTTP session; retries not configured")
        return server

    session.trust_env = bool(options.session_trust_env)

    adapter = HTTPAdapter(
        max_retries=Retry(
            total=int(options.max_retries or 3),
            backoff_factor=1,
            status_forcelist=_RETRIABLE_STATUS_CODES,
        )
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return server
|
|
203
|
+
|
|
204
|
+
@contextmanager
|
|
205
|
+
def _signed_in_server(self):
|
|
206
|
+
server = self._build_server()
|
|
207
|
+
auth = self._build_auth()
|
|
208
|
+
server.auth.sign_in(auth)
|
|
209
|
+
try:
|
|
210
|
+
yield server
|
|
211
|
+
finally:
|
|
212
|
+
try:
|
|
213
|
+
server.auth.sign_out()
|
|
214
|
+
except Exception:
|
|
215
|
+
logger.debug("Failed to sign out Tableau session cleanly", exc_info=True)
|
|
216
|
+
|
|
217
|
+
def _paged_items(self, endpoint: Any) -> list[Any]:
|
|
218
|
+
pager = getattr(self._tsc, "Pager", None)
|
|
219
|
+
if pager is not None:
|
|
220
|
+
try:
|
|
221
|
+
return list(pager(endpoint))
|
|
222
|
+
except Exception:
|
|
223
|
+
logger.debug("Tableau Pager fallback to manual pagination", exc_info=True)
|
|
224
|
+
|
|
225
|
+
items: list[Any] = []
|
|
226
|
+
page_number = 1
|
|
227
|
+
while True:
|
|
228
|
+
request_options = self._request_options(page_number)
|
|
229
|
+
response = endpoint.get(request_options)
|
|
230
|
+
if not isinstance(response, tuple) or len(response) != 2:
|
|
231
|
+
if isinstance(response, list):
|
|
232
|
+
items.extend(response)
|
|
233
|
+
break
|
|
234
|
+
|
|
235
|
+
page_items, pagination = response
|
|
236
|
+
if page_items:
|
|
237
|
+
items.extend(page_items)
|
|
238
|
+
|
|
239
|
+
total = int(
|
|
240
|
+
getattr(
|
|
241
|
+
pagination,
|
|
242
|
+
"total_available",
|
|
243
|
+
getattr(pagination, "totalAvailable", len(items)),
|
|
244
|
+
)
|
|
245
|
+
)
|
|
246
|
+
page_size = int(
|
|
247
|
+
getattr(
|
|
248
|
+
pagination,
|
|
249
|
+
"page_size",
|
|
250
|
+
getattr(pagination, "pagesize", len(page_items) or 1),
|
|
251
|
+
)
|
|
252
|
+
)
|
|
253
|
+
current_page = int(
|
|
254
|
+
getattr(
|
|
255
|
+
pagination,
|
|
256
|
+
"page_number",
|
|
257
|
+
getattr(pagination, "pagenumber", page_number),
|
|
258
|
+
)
|
|
259
|
+
)
|
|
260
|
+
if current_page * page_size >= total:
|
|
261
|
+
break
|
|
262
|
+
|
|
263
|
+
page_number = current_page + 1
|
|
264
|
+
|
|
265
|
+
return items
|
|
266
|
+
|
|
267
|
+
def _project_allowlist(self) -> set[str]:
|
|
268
|
+
configured = self._scope_options().project_names or []
|
|
269
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
270
|
+
|
|
271
|
+
def _workbook_allowlist(self) -> set[str]:
|
|
272
|
+
configured = self._scope_options().workbook_names or []
|
|
273
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
274
|
+
|
|
275
|
+
def _datasource_allowlist(self) -> set[str]:
|
|
276
|
+
configured = self._scope_options().datasource_names or []
|
|
277
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
278
|
+
|
|
279
|
+
def _project_allowed(self, project_name: str) -> bool:
|
|
280
|
+
allowlist = self._project_allowlist()
|
|
281
|
+
if not allowlist:
|
|
282
|
+
return True
|
|
283
|
+
return project_name.strip().lower() in allowlist
|
|
284
|
+
|
|
285
|
+
def _workbook_allowed(self, workbook_name: str) -> bool:
|
|
286
|
+
allowlist = self._workbook_allowlist()
|
|
287
|
+
if not allowlist:
|
|
288
|
+
return True
|
|
289
|
+
return workbook_name.strip().lower() in allowlist
|
|
290
|
+
|
|
291
|
+
def _datasource_allowed(self, datasource_name: str) -> bool:
|
|
292
|
+
allowlist = self._datasource_allowlist()
|
|
293
|
+
if not allowlist:
|
|
294
|
+
return True
|
|
295
|
+
return datasource_name.strip().lower() in allowlist
|
|
296
|
+
|
|
297
|
+
def _coerce_external_url(self, value: Any, fallback: str) -> str:
|
|
298
|
+
candidate = str(value or "").strip()
|
|
299
|
+
if not candidate:
|
|
300
|
+
return fallback
|
|
301
|
+
if candidate.startswith("http://") or candidate.startswith("https://"):
|
|
302
|
+
return candidate
|
|
303
|
+
return urljoin(f"{self._connect_uri()}/", candidate)
|
|
304
|
+
|
|
305
|
+
def _parse_datetime(self, value: Any) -> datetime | None:
|
|
306
|
+
if isinstance(value, datetime):
|
|
307
|
+
return value if value.tzinfo else value.replace(tzinfo=UTC)
|
|
308
|
+
|
|
309
|
+
if isinstance(value, str):
|
|
310
|
+
cleaned = value.strip()
|
|
311
|
+
if not cleaned:
|
|
312
|
+
return None
|
|
313
|
+
normalized = cleaned.replace("Z", "+00:00")
|
|
314
|
+
try:
|
|
315
|
+
parsed = datetime.fromisoformat(normalized)
|
|
316
|
+
except ValueError:
|
|
317
|
+
return None
|
|
318
|
+
return parsed if parsed.tzinfo else parsed.replace(tzinfo=UTC)
|
|
319
|
+
|
|
320
|
+
return None
|
|
321
|
+
|
|
322
|
+
def _project_raw_id(self, project_id: str) -> str:
|
|
323
|
+
return f"{self._site_for_display()}_#_project_#_{project_id}"
|
|
324
|
+
|
|
325
|
+
def _workbook_raw_id(self, workbook_id: str) -> str:
|
|
326
|
+
return f"{self._site_for_display()}_#_workbook_#_{workbook_id}"
|
|
327
|
+
|
|
328
|
+
def _datasource_raw_id(self, datasource_id: str) -> str:
|
|
329
|
+
return f"{self._site_for_display()}_#_datasource_#_{datasource_id}"
|
|
330
|
+
|
|
331
|
+
def _project_fallback_url(self, project_id: str) -> str:
|
|
332
|
+
return f"{self._connect_uri()}/#/site/{self._site_for_display()}/projects/{project_id}"
|
|
333
|
+
|
|
334
|
+
def _workbook_fallback_url(self, workbook_id: str) -> str:
|
|
335
|
+
return f"{self._connect_uri()}/#/site/{self._site_for_display()}/workbooks/{workbook_id}"
|
|
336
|
+
|
|
337
|
+
def _datasource_fallback_url(self, datasource_id: str) -> str:
|
|
338
|
+
return (
|
|
339
|
+
f"{self._connect_uri()}/#/site/{self._site_for_display()}/datasources/{datasource_id}"
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
def _resolve_owner_metadata(self, server: Any, owner_id: str | None) -> dict[str, Any] | None:
|
|
343
|
+
if not owner_id:
|
|
344
|
+
return None
|
|
345
|
+
if owner_id in self._owner_cache:
|
|
346
|
+
return self._owner_cache[owner_id]
|
|
347
|
+
|
|
348
|
+
try:
|
|
349
|
+
user = server.users.get_by_id(owner_id)
|
|
350
|
+
except Exception:
|
|
351
|
+
logger.debug("Unable to resolve Tableau owner for %s", owner_id, exc_info=True)
|
|
352
|
+
self._owner_cache[owner_id] = {"id": owner_id}
|
|
353
|
+
return self._owner_cache[owner_id]
|
|
354
|
+
|
|
355
|
+
owner = {
|
|
356
|
+
"id": owner_id,
|
|
357
|
+
"name": str(getattr(user, "name", "") or "").strip() or None,
|
|
358
|
+
"full_name": str(getattr(user, "full_name", "") or "").strip() or None,
|
|
359
|
+
"email": str(getattr(user, "email", "") or "").strip() or None,
|
|
360
|
+
}
|
|
361
|
+
self._owner_cache[owner_id] = owner
|
|
362
|
+
return owner
|
|
363
|
+
|
|
364
|
+
def _extract_tags(self, endpoint: Any, item: Any) -> list[str]:
|
|
365
|
+
if not bool(self._extraction_options().ingest_tags):
|
|
366
|
+
return []
|
|
367
|
+
|
|
368
|
+
try:
|
|
369
|
+
endpoint.populate_tags(item)
|
|
370
|
+
except Exception:
|
|
371
|
+
logger.debug("Unable to populate Tableau tags for %s", item, exc_info=True)
|
|
372
|
+
|
|
373
|
+
tags: list[str] = []
|
|
374
|
+
for tag in getattr(item, "tags", []) or []:
|
|
375
|
+
label = str(getattr(tag, "name", "") or "").strip()
|
|
376
|
+
if label:
|
|
377
|
+
tags.append(label)
|
|
378
|
+
return tags
|
|
379
|
+
|
|
380
|
+
def _to_asset_ref(
    self,
    *,
    raw_id: str,
    kind: str,
    asset_id: str,
    name: str,
    project_name: str | None,
    external_url: str,
    metadata: dict[str, Any],
    linked_raw_ids: list[str] | None = None,
) -> TableauAssetRef:
    """Assemble a ``TableauAssetRef`` from discovered asset details.

    Timestamps are read from ``metadata["created_at"]`` and
    ``metadata["updated_at"]``. A missing or unparsable creation time
    falls back to the current UTC time, and a missing update time falls
    back to the creation time.
    """
    created = self._parse_datetime(metadata.get("created_at"))
    if not created:
        created = datetime.now(UTC)

    updated = self._parse_datetime(metadata.get("updated_at"))
    if not updated:
        updated = created

    # Copy the links so the ref does not share the caller's list.
    links = list(linked_raw_ids or [])

    return TableauAssetRef(
        raw_id=raw_id,
        kind=kind,
        site=self._site_for_display(),
        project_name=project_name,
        asset_id=asset_id,
        name=name,
        external_url=external_url,
        metadata=metadata,
        linked_raw_ids=links,
        created_at=created,
        updated_at=updated,
    )
|
|
407
|
+
|
|
408
|
+
def _discover_assets(self) -> list[TableauAssetRef]:
    """Enumerate projects, workbooks and datasources visible to this source.

    Projects are always listed, subject to the project allowlist.
    Workbooks and datasources are listed unless the matching
    ``include_*`` scope flag is explicitly False. Discovery stops early
    once the source has been aborted.

    Returns:
        Asset references sorted by kind, lower-cased name and asset id.
    """
    refs: list[TableauAssetRef] = []
    scope = self._scope_options()

    with self._signed_in_server() as server:
        project_raw_by_id: dict[str, str] = {}
        project_name_by_id: dict[str, str] = {}

        for project in self._paged_items(server.projects):
            if self._aborted:
                break

            project_id = str(getattr(project, "id", "") or "").strip()
            project_name = str(getattr(project, "name", "") or "").strip()
            if not project_id or not project_name:
                continue
            if not self._project_allowed(project_name):
                continue

            raw_id = self._project_raw_id(project_id)
            project_raw_by_id[project_id] = raw_id
            project_name_by_id[project_id] = project_name
            refs.append(
                self._to_asset_ref(
                    raw_id=raw_id,
                    kind="project",
                    asset_id=project_id,
                    name=project_name,
                    project_name=project_name,
                    external_url=self._project_fallback_url(project_id),
                    metadata={
                        "site": self._site_for_display(),
                        "project_id": project_id,
                        "project_name": project_name,
                    },
                )
            )

        if scope.include_workbooks is not False:
            refs.extend(
                self._discover_content_assets(
                    server,
                    server.workbooks,
                    kind="workbook",
                    name_allowed=self._workbook_allowed,
                    raw_id_for=self._workbook_raw_id,
                    fallback_url_for=self._workbook_fallback_url,
                    project_raw_by_id=project_raw_by_id,
                    project_name_by_id=project_name_by_id,
                )
            )

        if scope.include_datasources is not False:
            refs.extend(
                self._discover_content_assets(
                    server,
                    server.datasources,
                    kind="datasource",
                    name_allowed=self._datasource_allowed,
                    raw_id_for=self._datasource_raw_id,
                    fallback_url_for=self._datasource_fallback_url,
                    project_raw_by_id=project_raw_by_id,
                    project_name_by_id=project_name_by_id,
                )
            )

    refs.sort(key=lambda ref: (ref.kind, ref.name.lower(), ref.asset_id))
    return refs

def _discover_content_assets(
    self,
    server: Any,
    endpoint: Any,
    *,
    kind: str,
    name_allowed: Any,
    raw_id_for: Any,
    fallback_url_for: Any,
    project_raw_by_id: dict[str, str],
    project_name_by_id: dict[str, str],
) -> list[TableauAssetRef]:
    """List the workbooks or datasources of one endpoint as asset refs.

    Args:
        server: Signed-in server, used for owner lookups.
        endpoint: ``server.workbooks`` or ``server.datasources``.
        kind: Asset kind stored on each ref ("workbook" or "datasource").
        name_allowed: Callable deciding whether an item name is in scope.
        raw_id_for: Callable building the raw asset id from the item id.
        fallback_url_for: Callable building a URL when the item has none.
        project_raw_by_id: Raw ids of the in-scope projects, by project id.
        project_name_by_id: Names of the in-scope projects, by project id.

    Returns:
        Refs for all in-scope items; empty once the source is aborted.
    """
    found: list[TableauAssetRef] = []
    if self._aborted:
        # Do not fetch another endpoint after an abort request.
        return found

    extraction = self._extraction_options()
    projects_restricted = bool(self._project_allowlist())

    for item in self._paged_items(endpoint):
        if self._aborted:
            break

        item_id = str(getattr(item, "id", "") or "").strip()
        item_name = str(getattr(item, "name", "") or "").strip()
        if not item_id or not item_name or not name_allowed(item_name):
            continue

        project_id = str(getattr(item, "project_id", "") or "").strip()
        project_name = str(getattr(item, "project_name", "") or "").strip() or None
        if project_id and not project_name:
            project_name = project_name_by_id.get(project_id)

        if project_name:
            if not self._project_allowed(project_name):
                continue
        elif project_id and projects_restricted:
            # The item belongs to a project that is not among the allowed
            # ones (its id is unknown to the in-scope project map), so it
            # must not slip past the project allowlist.
            continue

        tags = self._extract_tags(endpoint, item)
        owner_id = str(getattr(item, "owner_id", "") or "").strip() or None

        metadata: dict[str, Any] = {
            "site": self._site_for_display(),
            "project_id": project_id or None,
            "project_name": project_name,
            "created_at": getattr(item, "created_at", None),
            "updated_at": getattr(item, "updated_at", None),
        }
        if extraction.ingest_tags:
            metadata["tags"] = tags
        if extraction.ingest_owner:
            metadata["owner"] = self._resolve_owner_metadata(server, owner_id)
        if extraction.extract_usage_stats:
            metadata["usage"] = {"total_views": int(getattr(item, "total_views", 0) or 0)}

        # Link the item to its project when that project was discovered.
        linked = [project_raw_by_id[project_id]] if project_id in project_raw_by_id else []

        found.append(
            self._to_asset_ref(
                raw_id=raw_id_for(item_id),
                kind=kind,
                asset_id=item_id,
                name=item_name,
                project_name=project_name,
                external_url=self._coerce_external_url(
                    getattr(item, "webpage_url", None),
                    fallback_url_for(item_id),
                ),
                metadata=metadata,
                linked_raw_ids=linked,
            )
        )

    return found
|
|
574
|
+
|
|
575
|
+
def _sampling_sort_datetime(self, ref: TableauAssetRef, field_name: str) -> datetime | None:
|
|
576
|
+
candidates = [
|
|
577
|
+
ref.metadata.get(field_name),
|
|
578
|
+
ref.metadata.get(field_name.lower()),
|
|
579
|
+
ref.metadata.get(field_name.upper()),
|
|
580
|
+
ref.metadata.get("updated_at"),
|
|
581
|
+
ref.metadata.get("created_at"),
|
|
582
|
+
]
|
|
583
|
+
for candidate in candidates:
|
|
584
|
+
parsed = self._parse_datetime(candidate)
|
|
585
|
+
if parsed is not None:
|
|
586
|
+
return parsed
|
|
587
|
+
return None
|
|
588
|
+
|
|
589
|
+
def _sample_refs(self, refs: list[TableauAssetRef]) -> list[TableauAssetRef]:
    """Reduce ``refs`` according to the configured sampling strategy.

    * ALL returns every ref.
    * RANDOM returns up to ``rows_per_page`` refs (default 100), picked
      with a fixed seed so repeated runs select the same assets, and kept
      in their original order.
    * Any other strategy returns the ``rows_per_page`` most recent refs,
      ordered by ``order_by_column`` (default "updated_at"). If no ref
      has a usable timestamp and ``fallback_to_random`` is not False, the
      RANDOM selection is used instead.

    Args:
        refs: Discovered asset references.

    Returns:
        The selected references; ``refs`` itself when nothing is dropped
        by the ALL or random selection.
    """
    sampling = self._sampling()
    if sampling.strategy == SamplingStrategy.ALL:
        return refs

    limit = int(sampling.rows_per_page or 100)

    def _random_subset() -> list[TableauAssetRef]:
        # random.sample raises ValueError when asked for more elements
        # than exist, so a limit covering everything (including an empty
        # input) short-circuits to the full list.
        if limit >= len(refs):
            return refs
        picked = sorted(random.Random(0).sample(range(len(refs)), k=limit))
        return [refs[index] for index in picked]

    if sampling.strategy == SamplingStrategy.RANDOM:
        return _random_subset()

    order_field = sampling.order_by_column or "updated_at"
    values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
    has_order_values = any(value is not None for value in values)

    if not has_order_values and sampling.fallback_to_random is not False:
        return _random_subset()

    # Refs with a real ordering value rank ahead of refs that only have
    # the ref's own updated_at as a stand-in; newest first in each group.
    scored: list[tuple[bool, datetime, TableauAssetRef]] = [
        (parsed is not None, parsed or ref.updated_at, ref)
        for ref, parsed in zip(refs, values, strict=False)
    ]
    scored.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)
    return [entry[2] for entry in scored[:limit]]
|
|
620
|
+
|
|
621
|
+
def _asset_from_ref(
    self,
    ref: TableauAssetRef,
    *,
    links: list[str],
) -> SingleAssetScanResults:
    """Convert an asset reference into a ``SingleAssetScanResults`` record.

    Args:
        ref: The discovered asset.
        links: Hash ids of related assets to store on the result.

    Returns:
        A scan result whose checksum covers the ref's kind, site, project
        name, asset id, name and metadata.
    """
    asset_hash = self.generate_hash_id(ref.raw_id)

    checksum_fields = ("kind", "site", "project_name", "asset_id", "name", "metadata")
    checksum_payload = {field: getattr(ref, field) for field in checksum_fields}
    checksum = self.calculate_checksum(checksum_payload)

    display_name = f"{ref.site} / {ref.kind} / {ref.name}"
    location = self.ensure_location(ref.external_url, fallback=self._connect_uri())

    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=checksum,
        name=display_name,
        external_url=location,
        links=links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=ref.created_at,
        updated_at=ref.updated_at,
        runner_id=self.runner_id,
    )
|
|
649
|
+
|
|
650
|
+
def _auth_mode(self) -> str:
|
|
651
|
+
mode = self.config.required.auth_mode
|
|
652
|
+
return mode.value if hasattr(mode, "value") else str(mode)
|
|
653
|
+
|
|
654
|
+
def test_connection(self) -> dict[str, Any]:
    """Sign in to Tableau, list projects and report the outcome.

    No exception escapes: any failure is reported in the result.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status``
        ("SUCCESS" or "FAILURE") and a human-readable ``message``.
    """
    logger.info("Testing connection to Tableau...")
    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        with self._signed_in_server() as server:
            reachable = len(self._paged_items(server.projects))
            outcome = (
                "SUCCESS",
                f"Successfully connected to Tableau using {self._auth_mode()}. "
                f"Reachable projects: {reachable}.",
            )
    except Exception as exc:
        outcome = ("FAILURE", f"Failed to connect to Tableau: {exc}")

    result["status"], result["message"] = outcome
    return result
|
|
674
|
+
|
|
675
|
+
STREAM_DETECTIONS = True
|
|
676
|
+
|
|
677
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Discover, sample and yield Tableau assets in batches.

    Each yielded list holds at most ``BATCH_SIZE`` results. Every emitted
    asset is also recorded in ``_asset_lookup`` under its hash. Links are
    kept only when the linked asset is part of the sampled set. If the
    source is aborted, the generator ends without yielding the batch in
    progress.
    """
    if self._aborted:
        return

    refs = self._sample_refs(self._discover_assets())

    hash_by_raw: dict[str, str] = {}
    for ref in refs:
        hash_by_raw[ref.raw_id] = self.generate_hash_id(ref.raw_id)

    pending: list[SingleAssetScanResults] = []
    for ref in refs:
        if self._aborted:
            return

        asset_hash = hash_by_raw[ref.raw_id]
        self._asset_lookup[asset_hash] = ref

        links = [hash_by_raw[raw] for raw in ref.linked_raw_ids if raw in hash_by_raw]
        pending.append(self._asset_from_ref(ref, links=links))

        if len(pending) < self.BATCH_SIZE:
            continue
        yield pending
        pending = []

    if pending:
        yield pending
|
|
705
|
+
|
|
706
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Return the hash id for ``asset_id`` under this source's asset type."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
|
|
708
|
+
|
|
709
|
+
def _format_asset_content(self, ref: TableauAssetRef) -> tuple[str, str]:
|
|
710
|
+
sampling = self._sampling()
|
|
711
|
+
lines = [
|
|
712
|
+
f"site={ref.site}",
|
|
713
|
+
f"kind={ref.kind}",
|
|
714
|
+
f"name={ref.name}",
|
|
715
|
+
f"project_name={ref.project_name or ''}",
|
|
716
|
+
f"sampling_strategy={sampling.strategy}",
|
|
717
|
+
"",
|
|
718
|
+
]
|
|
719
|
+
|
|
720
|
+
tags = ref.metadata.get("tags")
|
|
721
|
+
if isinstance(tags, list) and tags:
|
|
722
|
+
lines.append(f"tags={', '.join(str(tag) for tag in tags)}")
|
|
723
|
+
|
|
724
|
+
owner = ref.metadata.get("owner")
|
|
725
|
+
if isinstance(owner, dict):
|
|
726
|
+
owner_name = owner.get("name") or owner.get("email") or owner.get("id")
|
|
727
|
+
if owner_name:
|
|
728
|
+
lines.append(f"owner={owner_name}")
|
|
729
|
+
|
|
730
|
+
usage = ref.metadata.get("usage")
|
|
731
|
+
if isinstance(usage, dict):
|
|
732
|
+
total_views = usage.get("total_views")
|
|
733
|
+
if total_views is not None:
|
|
734
|
+
lines.append(f"total_views={total_views}")
|
|
735
|
+
|
|
736
|
+
text_content = "\n".join(lines)
|
|
737
|
+
raw_content = json.dumps(
|
|
738
|
+
{
|
|
739
|
+
"kind": ref.kind,
|
|
740
|
+
"site": ref.site,
|
|
741
|
+
"project_name": ref.project_name,
|
|
742
|
+
"asset_id": ref.asset_id,
|
|
743
|
+
"name": ref.name,
|
|
744
|
+
"metadata": ref.metadata,
|
|
745
|
+
},
|
|
746
|
+
ensure_ascii=False,
|
|
747
|
+
default=str,
|
|
748
|
+
)
|
|
749
|
+
return raw_content, text_content
|
|
750
|
+
|
|
751
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for a previously extracted asset.

    The asset is looked up by hash id first. If that fails, the id is
    decoded with ``unhash_id`` (or used as-is when decoding fails), a
    leading "TABLEAU_#_" is dropped, and the result is matched against
    the raw ids of known assets. Results are cached per ``asset_id``.

    Returns:
        The formatted content, or None when the asset is unknown.
    """
    if cached := self._content_cache.get(asset_id):
        return cached

    ref = self._asset_lookup.get(asset_id)
    if ref is None:
        try:
            decoded = unhash_id(asset_id)
        except Exception:
            decoded = asset_id
        decoded = decoded.removeprefix("TABLEAU_#_")

        ref = next(
            (known for known in self._asset_lookup.values() if known.raw_id == decoded),
            None,
        )

    if ref is None:
        return None

    content = self._format_asset_content(ref)
    self._content_cache[asset_id] = content
    return content
|
|
777
|
+
|
|
778
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to a path describing the asset.

    The path is "<site>/<project>/<kind>/<name>"; the project segment is
    left out when the asset has no project name. The finding is left
    untouched when the asset hash is unknown.

    Args:
        finding: Detection result to update in place.
        asset: Scan result whose ``hash`` identifies the asset.
        text_content: Accepted for interface compatibility; not used.
    """
    _ = text_content

    ref = self._asset_lookup.get(asset.hash)
    if not ref:
        return

    if ref.project_name:
        middle = f"{ref.project_name}/{ref.kind}"
    else:
        middle = f"{ref.kind}"
    finding.location = Location(path=f"{ref.site}/{middle}/{ref.name}")
|
|
791
|
+
|
|
792
|
+
def abort(self) -> None:
    """Log the abort request and delegate to the base class ``abort``."""
    logger.info("Aborting Tableau extraction...")
    super().abort()
|
|
795
|
+
|
|
796
|
+
def cleanup(self) -> None:
    """Empty the content, asset-lookup and owner caches of this instance."""
    for cache in (self._content_cache, self._asset_lookup, self._owner_cache):
        cache.clear()
|