acryl-datahub 1.2.0.2rc2__py3-none-any.whl → 1.2.0.2rc3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (39)
  1. {acryl_datahub-1.2.0.2rc2.dist-info → acryl_datahub-1.2.0.2rc3.dist-info}/METADATA +2569 -2567
  2. {acryl_datahub-1.2.0.2rc2.dist-info → acryl_datahub-1.2.0.2rc3.dist-info}/RECORD +39 -31
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +13 -1
  5. datahub/ingestion/autogenerated/capability_summary.json +97 -6
  6. datahub/ingestion/source/aws/glue.py +8 -0
  7. datahub/ingestion/source/cassandra/cassandra.py +5 -7
  8. datahub/ingestion/source/common/subtypes.py +2 -0
  9. datahub/ingestion/source/datahub/datahub_source.py +3 -0
  10. datahub/ingestion/source/delta_lake/source.py +1 -0
  11. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  12. datahub/ingestion/source/grafana/field_utils.py +307 -0
  13. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  14. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  15. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  16. datahub/ingestion/source/grafana/lineage.py +202 -0
  17. datahub/ingestion/source/grafana/models.py +120 -0
  18. datahub/ingestion/source/grafana/report.py +91 -0
  19. datahub/ingestion/source/grafana/types.py +16 -0
  20. datahub/ingestion/source/hex/hex.py +8 -0
  21. datahub/ingestion/source/looker/looker_source.py +9 -0
  22. datahub/ingestion/source/looker/lookml_source.py +8 -0
  23. datahub/ingestion/source/mongodb.py +11 -1
  24. datahub/ingestion/source/redshift/redshift.py +8 -1
  25. datahub/ingestion/source/s3/source.py +9 -1
  26. datahub/ingestion/source/sql/athena.py +8 -2
  27. datahub/ingestion/source/sql/clickhouse.py +9 -0
  28. datahub/ingestion/source/sql_queries.py +2 -2
  29. datahub/metadata/_internal_schema_classes.py +18 -3
  30. datahub/metadata/schema.avsc +10 -1
  31. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +10 -1
  32. datahub/sdk/dataset.py +44 -0
  33. datahub/sdk/search_filters.py +34 -14
  34. datahub/sql_parsing/sql_parsing_aggregator.py +5 -0
  35. datahub/telemetry/telemetry.py +4 -1
  36. {acryl_datahub-1.2.0.2rc2.dist-info → acryl_datahub-1.2.0.2rc3.dist-info}/WHEEL +0 -0
  37. {acryl_datahub-1.2.0.2rc2.dist-info → acryl_datahub-1.2.0.2rc3.dist-info}/entry_points.txt +0 -0
  38. {acryl_datahub-1.2.0.2rc2.dist-info → acryl_datahub-1.2.0.2rc3.dist-info}/licenses/LICENSE +0 -0
  39. {acryl_datahub-1.2.0.2rc2.dist-info → acryl_datahub-1.2.0.2rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/grafana/entity_mcp_builder.py
@@ -0,0 +1,272 @@
+from typing import Dict, List, Optional, Tuple
+
+from datahub.emitter.mce_builder import (
+    make_chart_urn,
+    make_dashboard_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_dataset_urn_with_platform_instance,
+    make_tag_urn,
+    make_user_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.grafana.models import Dashboard, Panel
+from datahub.ingestion.source.grafana.types import CHART_TYPE_MAPPINGS
+from datahub.metadata.schema_classes import (
+    ChangeAuditStampsClass,
+    ChartInfoClass,
+    DashboardInfoClass,
+    DataPlatformInstanceClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+    StatusClass,
+    TagAssociationClass,
+)
+
+
+def build_chart_mcps(
+    panel: Panel,
+    dashboard: Dashboard,
+    platform: str,
+    platform_instance: Optional[str],
+    env: str,
+    base_url: str,
+    ingest_tags: bool,
+) -> Tuple[Optional[str], str, List[MetadataChangeProposalWrapper]]:
+    """Build chart metadata change proposals"""
+    ds_urn = None
+    mcps = []
+
+    chart_urn = make_chart_urn(
+        platform,
+        f"{dashboard.uid}.{panel.id}",
+        platform_instance,
+    )
+
+    # Platform instance aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(platform),
+                instance=make_dataplatform_instance_urn(
+                    platform=platform,
+                    instance=platform_instance,
+                )
+                if platform_instance
+                else None,
+            ),
+        )
+    )
+
+    # Status aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=StatusClass(removed=False),
+        )
+    )
+
+    # Get input datasets
+    input_datasets = []
+    if panel.datasource_ref:
+        ds_type = panel.datasource_ref.type or "unknown"
+        ds_uid = panel.datasource_ref.uid or "unknown"
+
+        # Add Grafana dataset
+        dataset_name = f"{ds_type}.{ds_uid}.{panel.id}"
+        ds_urn = make_dataset_urn_with_platform_instance(
+            platform=platform,
+            name=dataset_name,
+            platform_instance=platform_instance,
+            env=env,
+        )
+        input_datasets.append(ds_urn)
+
+    # Chart info aspect
+    title = panel.title or f"Panel {panel.id}"
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=ChartInfoClass(
+                type=CHART_TYPE_MAPPINGS.get(panel.type) if panel.type else None,
+                description=panel.description,
+                title=title,
+                lastModified=ChangeAuditStampsClass(),
+                chartUrl=f"{base_url}/d/{dashboard.uid}?viewPanel={panel.id}",
+                customProperties=_build_custom_properties(panel),
+                inputs=input_datasets,
+            ),
+        )
+    )
+
+    # Tags aspect
+    if dashboard.tags and ingest_tags:
+        tags = []
+        for tag in dashboard.tags:
+            if ":" in tag:
+                key, value = tag.split(":", 1)
+                tag_urn = make_tag_urn(f"{key}.{value}")
+            else:
+                tag_urn = make_tag_urn(tag)
+            tags.append(TagAssociationClass(tag=tag_urn))
+
+        if tags:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=chart_urn,
+                    aspect=GlobalTagsClass(tags=tags),
+                )
+            )
+
+    return ds_urn, chart_urn, mcps
+
+
+def build_dashboard_mcps(
+    dashboard: Dashboard,
+    platform: str,
+    platform_instance: Optional[str],
+    chart_urns: List[str],
+    base_url: str,
+    ingest_owners: bool,
+    ingest_tags: bool,
+) -> Tuple[str, List[MetadataChangeProposalWrapper]]:
+    """Build dashboard metadata change proposals"""
+    mcps = []
+    dashboard_urn = make_dashboard_urn(platform, dashboard.uid, platform_instance)
+
+    # Platform instance aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(platform),
+                instance=make_dataplatform_instance_urn(
+                    platform=platform,
+                    instance=platform_instance,
+                )
+                if platform_instance
+                else None,
+            ),
+        )
+    )
+
+    # Dashboard info aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=DashboardInfoClass(
+                description=dashboard.description,
+                title=dashboard.title,
+                charts=chart_urns,
+                lastModified=ChangeAuditStampsClass(),
+                dashboardUrl=f"{base_url}/d/{dashboard.uid}",
+                customProperties=_build_dashboard_properties(dashboard),
+            ),
+        )
+    )
+
+    # Ownership aspect
+    if dashboard.uid and ingest_owners:
+        owner = _build_ownership(dashboard)
+        if owner:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=dashboard_urn,
+                    aspect=owner,
+                )
+            )
+
+    # Tags aspect
+    if dashboard.tags and ingest_tags:
+        tags = [TagAssociationClass(tag=make_tag_urn(tag)) for tag in dashboard.tags]
+        if tags:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=dashboard_urn,
+                    aspect=GlobalTagsClass(tags=tags),
+                )
+            )
+
+    # Status aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=StatusClass(removed=False),
+        )
+    )
+
+    return dashboard_urn, mcps
+
+
+def _build_custom_properties(panel: Panel) -> Dict[str, str]:
+    """Build custom properties for chart"""
+    props = {}
+
+    if panel.type:
+        props["type"] = panel.type
+
+    if panel.datasource_ref:
+        props["datasourceType"] = panel.datasource_ref.type or ""
+        props["datasourceUid"] = panel.datasource_ref.uid or ""
+
+    for key in [
+        "description",
+        "format",
+        "pluginVersion",
+        "repeatDirection",
+        "maxDataPoints",
+    ]:
+        value = getattr(panel, key, None)
+        if value:
+            props[key] = str(value)
+
+    if panel.query_targets:
+        props["targetsCount"] = str(len(panel.query_targets))
+
+    return props
+
+
+def _build_dashboard_properties(dashboard: Dashboard) -> Dict[str, str]:
+    """Build custom properties for dashboard"""
+    props = {}
+
+    if dashboard.timezone:
+        props["timezone"] = dashboard.timezone
+
+    if dashboard.schema_version:
+        props["schema_version"] = dashboard.schema_version
+
+    if dashboard.version:
+        props["version"] = dashboard.version
+
+    if dashboard.refresh:
+        props["refresh"] = dashboard.refresh
+
+    return props
+
+
+def _build_ownership(dashboard: Dashboard) -> Optional[OwnershipClass]:
+    """Build ownership information"""
+    owners = []
+
+    if dashboard.uid:
+        owners.append(
+            OwnerClass(
+                owner=make_user_urn(dashboard.uid),
+                type=OwnershipTypeClass.TECHNICAL_OWNER,
+            )
+        )
+
+    if dashboard.created_by:
+        owner_id = dashboard.created_by.split("@")[0]
+        owners.append(
+            OwnerClass(
+                owner=make_user_urn(owner_id),
+                type=OwnershipTypeClass.DATAOWNER,
+            )
+        )
+
+    return OwnershipClass(owners=owners) if owners else None
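
Taken together, the builders in this new module emit one MCP per aspect (platform instance, status, chart/dashboard info, tags, ownership) and hand the URNs back to the caller for lineage wiring. The sketch below shows how a caller might drive the two builders; the `Panel`/`Dashboard` keyword arguments are assumptions for illustration only, since the actual model definitions live in `models.py`, which is a separate file in this release.

# Minimal sketch, assuming hypothetical Panel/Dashboard constructor kwargs.
from datahub.ingestion.source.grafana.entity_mcp_builder import (
    build_chart_mcps,
    build_dashboard_mcps,
)
from datahub.ingestion.source.grafana.models import Dashboard, Panel

panel = Panel(id="1", title="Error rate", type="graph")  # hypothetical kwargs
dashboard = Dashboard(uid="abc123", title="Service health", tags=["team:core"])  # hypothetical kwargs

# Build per-panel chart MCPs; the returned dataset URN (if any) feeds lineage.
ds_urn, chart_urn, chart_mcps = build_chart_mcps(
    panel=panel,
    dashboard=dashboard,
    platform="grafana",
    platform_instance=None,
    env="PROD",
    base_url="https://grafana.example.com",  # assumed instance URL
    ingest_tags=True,
)

# Build the dashboard MCPs, linking in the chart URNs produced above.
dashboard_urn, dash_mcps = build_dashboard_mcps(
    dashboard=dashboard,
    platform="grafana",
    platform_instance=None,
    chart_urns=[chart_urn],
    base_url="https://grafana.example.com",
    ingest_owners=True,
    ingest_tags=True,
)

for mcp in chart_mcps + dash_mcps:
    print(mcp.entityUrn, type(mcp.aspect).__name__)

Note the ownership convention the module encodes: `_build_ownership` maps `dashboard.uid` to a TECHNICAL_OWNER user URN and the portion of `created_by` before any `@` to a DATAOWNER.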
datahub/ingestion/source/grafana/field_utils.py
@@ -0,0 +1,307 @@
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.grafana.models import Panel
+from datahub.metadata.schema_classes import (
+    NumberTypeClass,
+    SchemaFieldClass,
+    SchemaFieldDataTypeClass,
+    StringTypeClass,
+    TimeTypeClass,
+)
+from datahub.sql_parsing.sqlglot_lineage import (
+    create_lineage_sql_parsed_result,
+    infer_output_schema,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def extract_sql_column_fields(target: Dict[str, Any]) -> List[SchemaFieldClass]:
+    """Extract fields from SQL-style columns."""
+    fields = []
+    for col in target.get("sql", {}).get("columns", []):
+        for param in col.get("parameters", []):
+            if param.get("type") == "column" and param.get("name"):
+                field_type: Union[NumberTypeClass, StringTypeClass, TimeTypeClass] = (
+                    TimeTypeClass()
+                    if col["type"] == "time"
+                    else NumberTypeClass()
+                    if col["type"] == "number"
+                    else StringTypeClass()
+                )
+                fields.append(
+                    SchemaFieldClass(
+                        fieldPath=param["name"],
+                        type=SchemaFieldDataTypeClass(type=field_type),
+                        nativeDataType=col["type"],
+                    )
+                )
+    return fields
+
+
+def extract_prometheus_fields(target: Dict[str, Any]) -> List[SchemaFieldClass]:
+    """Extract fields from Prometheus expressions."""
+    expr = target.get("expr")
+    if expr:
+        legend = target.get("legendFormat", expr)
+        return [
+            SchemaFieldClass(
+                fieldPath=legend,
+                type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
+                nativeDataType="prometheus_metric",
+            )
+        ]
+    return []
+
+
+def extract_raw_sql_fields(
+    target: Dict[str, Any],
+    panel: Optional[Panel] = None,
+    connection_to_platform_map: Optional[Dict[str, Any]] = None,
+    graph: Optional[DataHubGraph] = None,
+    report: Optional[Any] = None,
+) -> List[SchemaFieldClass]:
+    """Extract fields from raw SQL queries using DataHub's SQL parsing."""
+    raw_sql = target.get("rawSql", "")
+    if not raw_sql:
+        return []
+
+    # Determine upstream platform and environment from datasource mapping
+    platform = "unknown"
+    env = "PROD"
+    default_db = None
+    default_schema = None
+    platform_instance = None
+    schema_aware = False
+
+    if panel and panel.datasource_ref and connection_to_platform_map:
+        ds_type = panel.datasource_ref.type or "unknown"
+        ds_uid = panel.datasource_ref.uid or "unknown"
+
+        # Try to find mapping by datasource UID first, then by type
+        platform_config = connection_to_platform_map.get(
+            ds_uid
+        ) or connection_to_platform_map.get(ds_type)
+
+        if platform_config:
+            platform = platform_config.platform
+            env = getattr(platform_config, "env", env)
+            default_db = getattr(platform_config, "database", None)
+            default_schema = getattr(platform_config, "database_schema", None)
+            platform_instance = getattr(platform_config, "platform_instance", None)
+
+            # Enable schema-aware parsing if we have platform mapping and graph access
+            if graph and platform != "unknown":
+                schema_aware = True
+
+    # Track SQL parsing attempt
+    if report:
+        report.report_sql_parsing_attempt()
+
+    try:
+        # Use DataHub's standard SQL parsing approach
+        sql_parsing_result = create_lineage_sql_parsed_result(
+            query=raw_sql,
+            default_db=default_db,
+            default_schema=default_schema,
+            platform=platform,
+            platform_instance=platform_instance,
+            env=env,
+            schema_aware=schema_aware,
+            graph=graph,
+        )
+
+        # Extract the output schema from the parsing result
+        output_schema = infer_output_schema(sql_parsing_result)
+
+        if output_schema:
+            if report:
+                report.report_sql_parsing_success()
+            return output_schema
+        else:
+            # If sqlglot parsing succeeds but no schema is inferred,
+            # fall back to basic parsing
+            logger.debug(f"No schema inferred from SQL: {raw_sql}")
+            fallback_result = _extract_raw_sql_fields_fallback(target)
+            if fallback_result and report:
+                report.report_sql_parsing_success()
+            elif report:
+                report.report_sql_parsing_failure()
+            return fallback_result
+
+    except Exception as e:
+        logger.debug(f"Failed to parse SQL with DataHub parser: {raw_sql}, error: {e}")
+        if report:
+            report.report_sql_parsing_failure()
+        # Fallback to basic parsing for backwards compatibility
+        return _extract_raw_sql_fields_fallback(target)
+
+
+def _extract_raw_sql_fields_fallback(target: Dict[str, Any]) -> List[SchemaFieldClass]:
+    """Fallback basic SQL parsing for when sqlglot fails."""
+    raw_sql = target.get("rawSql", "").lower()
+    if not raw_sql:
+        return []
+
+    try:
+        sql = raw_sql.lower()
+        select_start = sql.index("select") + 6  # len("select")
+        from_start = sql.index("from")
+        select_part = sql[select_start:from_start].strip()
+
+        # Split by comma, handling nested parentheses
+        columns = []
+        current_column = ""
+        paren_count = 0
+
+        for char in select_part:
+            if char == "," and paren_count == 0:
+                if current_column.strip():
+                    columns.append(current_column.strip())
+                current_column = ""
+            else:
+                if char == "(":
+                    paren_count += 1
+                elif char == ")":
+                    paren_count -= 1
+                current_column += char
+
+        if current_column.strip():
+            columns.append(current_column.strip())
+
+        # For each column, extract the alias if it exists
+        fields = []
+        for col in columns:
+            # Check for alias with 'AS' keyword
+            if " as " in col:
+                field_name = col.split(" as ")[-1].strip()
+            else:
+                # If no alias, use the last part after last space
+                # This handles both simple columns and function calls without alias
+                field_name = col.split()[-1].strip()
+
+            # Clean up any remaining quotes or parentheses
+            field_name = field_name.strip("\"'()")
+
+            fields.append(
+                SchemaFieldClass(
+                    fieldPath=field_name,
+                    type=SchemaFieldDataTypeClass(type=StringTypeClass()),
+                    nativeDataType="sql_column",
+                )
+            )
+
+        return fields
+
+    except (IndexError, ValueError, StopIteration) as e:
+        logger.warning(f"Failed to parse SQL: {target.get('rawSql')}: {e}")
+        return []
+
+
+def extract_fields_from_panel(
+    panel: Panel,
+    connection_to_platform_map: Optional[Dict[str, Any]] = None,
+    graph: Optional[DataHubGraph] = None,
+    report: Optional[Any] = None,
+) -> List[SchemaFieldClass]:
+    """Extract all fields from a panel."""
+    fields = []
+    fields.extend(
+        extract_fields_from_targets(
+            panel.query_targets, panel, connection_to_platform_map, graph, report
+        )
+    )
+    fields.extend(get_fields_from_field_config(panel.field_config))
+    fields.extend(get_fields_from_transformations(panel.transformations))
+
+    # Track schema field extraction
+    if report:
+        if fields:
+            report.report_schema_fields_extracted()
+        else:
+            report.report_no_schema_fields()
+
+    return fields
+
+
+def extract_fields_from_targets(
+    targets: List[Dict[str, Any]],
+    panel: Optional[Panel] = None,
+    connection_to_platform_map: Optional[Dict[str, Any]] = None,
+    graph: Optional[DataHubGraph] = None,
+    report: Optional[Any] = None,
+) -> List[SchemaFieldClass]:
+    """Extract fields from panel targets."""
+    fields = []
+    for target in targets:
+        fields.extend(extract_sql_column_fields(target))
+        fields.extend(extract_prometheus_fields(target))
+        fields.extend(
+            extract_raw_sql_fields(
+                target, panel, connection_to_platform_map, graph, report
+            )
+        )
+        fields.extend(extract_time_format_fields(target))
+    return fields
+
+
+def extract_time_format_fields(target: Dict[str, Any]) -> List[SchemaFieldClass]:
+    """Extract fields from time series and table formats."""
+    if target.get("format") in {"time_series", "table"}:
+        return [
+            SchemaFieldClass(
+                fieldPath="time",
+                type=SchemaFieldDataTypeClass(type=TimeTypeClass()),
+                nativeDataType="timestamp",
+            )
+        ]
+    return []
+
+
+def get_fields_from_field_config(
+    field_config: Dict[str, Any],
+) -> List[SchemaFieldClass]:
+    """Extract fields from field configuration."""
+    fields = []
+    defaults = field_config.get("defaults", {})
+    unit = defaults.get("unit")
+    if unit:
+        fields.append(
+            SchemaFieldClass(
+                fieldPath=f"value_{unit}",
+                type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
+                nativeDataType="value",
+            )
+        )
+    for override in field_config.get("overrides", []):
+        if override.get("matcher", {}).get("id") == "byName":
+            field_name = override.get("matcher", {}).get("options")
+            if field_name:
+                fields.append(
+                    SchemaFieldClass(
+                        fieldPath=field_name,
+                        type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
+                        nativeDataType="metric",
+                    )
+                )
+    return fields
+
+
+def get_fields_from_transformations(
+    transformations: List[Dict[str, Any]],
+) -> List[SchemaFieldClass]:
+    """Extract fields from transformations."""
+    fields = []
+    for transform in transformations:
+        if transform.get("type") == "organize":
+            for field_name in transform.get("options", {}).get("indexByName", {}):
+                fields.append(
+                    SchemaFieldClass(
+                        fieldPath=field_name,
+                        type=SchemaFieldDataTypeClass(type=StringTypeClass()),
+                        nativeDataType="transformed",
+                    )
+                )
+    return fields
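
The extractors in this file each accept a plain Grafana target dict, so they can be exercised in isolation. A small sketch under that assumption follows; the sample targets mirror the key shapes the functions inspect (`expr`/`legendFormat`, `rawSql`, `format`), and the field paths noted in the comments follow from the fallback logic rather than from a verified run.

from datahub.ingestion.source.grafana.field_utils import (
    extract_prometheus_fields,
    extract_raw_sql_fields,
    extract_time_format_fields,
)

# A Prometheus target: legendFormat becomes the field path ("req_rate").
prom_target = {
    "expr": "rate(http_requests_total[5m])",
    "legendFormat": "req_rate",
}

# A raw-SQL target: sqlglot-based parsing runs first; with no panel or
# datasource mapping the platform stays "unknown", so the string-scanning
# fallback would likely yield the aliases ("time", "hits") as string fields.
sql_target = {
    "rawSql": "SELECT ts AS time, count(*) AS hits FROM logs",
    "format": "table",
}

fields = (
    extract_prometheus_fields(prom_target)
    + extract_raw_sql_fields(sql_target)
    + extract_time_format_fields(sql_target)  # implicit "time" column for table/time_series
)
for f in fields:
    print(f.fieldPath, f.nativeDataType)

The layered design means a panel still gets approximate, string-typed fields from the paren-aware `SELECT`-clause scan when the schema-aware sqlglot path has no datasource mapping or graph to work with.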