acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (107)
  1. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
  2. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
  3. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
  59. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  62. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  63. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  64. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  65. datahub/ingestion/source/sql/mssql/source.py +8 -4
  66. datahub/ingestion/source/sql/oracle.py +51 -4
  67. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  68. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  69. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  70. datahub/ingestion/source/superset.py +291 -35
  71. datahub/ingestion/source/usage/usage_common.py +0 -65
  72. datahub/ingestion/source/vertexai/__init__.py +0 -0
  73. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  74. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  75. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  76. datahub/metadata/_schema_classes.py +472 -1
  77. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  80. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  81. datahub/metadata/schema.avsc +313 -2
  82. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  83. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  84. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  85. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  86. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  87. datahub/metadata/schemas/Deprecation.avsc +2 -0
  88. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  89. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  90. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  91. datahub/metadata/schemas/Siblings.avsc +2 -0
  92. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  93. datahub/sdk/__init__.py +1 -0
  94. datahub/sdk/dataset.py +122 -0
  95. datahub/sdk/entity.py +99 -3
  96. datahub/sdk/entity_client.py +27 -3
  97. datahub/sdk/main_client.py +24 -1
  98. datahub/sdk/search_client.py +81 -8
  99. datahub/sdk/search_filters.py +94 -37
  100. datahub/sql_parsing/split_statements.py +17 -3
  101. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  102. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  103. datahub/testing/mcp_diff.py +1 -18
  104. datahub/utilities/threaded_iterator_executor.py +16 -3
  105. datahub/ingestion/source/vertexai.py +0 -697
  106. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  107. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py

@@ -3,7 +3,7 @@ import logging
  from dataclasses import dataclass, field
  from datetime import datetime
  from functools import lru_cache
- from typing import Any, Dict, Iterable, List, Optional
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

  import dateutil.parser as dp
  import requests
@@ -11,6 +11,7 @@ from pydantic import BaseModel
  from pydantic.class_validators import root_validator, validator
  from pydantic.fields import Field

+ import datahub.emitter.mce_builder as builder
  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.source_common import (
  EnvConfigMixin,
@@ -23,8 +24,10 @@ from datahub.emitter.mce_builder import (
  make_dataset_urn,
  make_dataset_urn_with_platform_instance,
  make_domain_urn,
+ make_schema_field_urn,
  make_user_urn,
  )
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.decorators import (
@@ -49,6 +52,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
  )
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
  ChangeAuditStamps,
+ InputField,
+ InputFields,
  Status,
  TimeStamp,
  )
@@ -59,11 +64,16 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
  )
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+ BooleanTypeClass,
+ DateTypeClass,
  MySqlDDL,
  NullType,
+ NullTypeClass,
+ NumberTypeClass,
  SchemaField,
  SchemaFieldDataType,
  SchemaMetadata,
+ StringTypeClass,
  )
  from datahub.metadata.schema_classes import (
  AuditStampClass,
@@ -72,6 +82,9 @@ from datahub.metadata.schema_classes import (
  DashboardInfoClass,
  DatasetLineageTypeClass,
  DatasetPropertiesClass,
+ FineGrainedLineageClass,
+ FineGrainedLineageDownstreamTypeClass,
+ FineGrainedLineageUpstreamTypeClass,
  GlobalTagsClass,
  OwnerClass,
  OwnershipClass,
@@ -80,6 +93,10 @@ from datahub.metadata.schema_classes import (
  UpstreamClass,
  UpstreamLineageClass,
  )
+ from datahub.sql_parsing.sqlglot_lineage import (
+ SqlParsingResult,
+ create_lineage_sql_parsed_result,
+ )
  from datahub.utilities import config_clean
  from datahub.utilities.lossy_collections import LossyList
  from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -105,9 +122,17 @@ chart_type_from_viz_type = {
  "box_plot": ChartTypeClass.BAR,
  }

-
  platform_without_databases = ["druid"]

+ FIELD_TYPE_MAPPING = {
+ "INT": NumberTypeClass,
+ "STRING": StringTypeClass,
+ "FLOAT": NumberTypeClass,
+ "DATETIME": DateTypeClass,
+ "BOOLEAN": BooleanTypeClass,
+ "SQL": StringTypeClass,
+ }
+

  @dataclass
  class SupersetSourceReport(StaleEntityRemovalSourceReport):
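For context, a minimal sketch (not part of the diff) of how the new mapping is resolved later in build_input_fields; the column type strings are hypothetical values as a Superset API might return them, and unmapped types fall back to NullTypeClass:

    type_class = FIELD_TYPE_MAPPING.get("float".upper(), NullTypeClass)         # -> NumberTypeClass
    type_class = FIELD_TYPE_MAPPING.get("VARCHAR(255)".upper(), NullTypeClass)  # unmapped -> NullTypeClass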
@@ -181,6 +206,10 @@ class SupersetConfig(
  provider: str = Field(default="db", description="Superset provider.")
  options: Dict = Field(default={}, description="")

+ timeout: int = Field(
+ default=10, description="Timeout of single API call to superset."
+ )
+
  # TODO: Check and remove this if no longer needed.
  # Config database_alias is removed from sql sources.
  database_alias: Dict[str, str] = Field(
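A minimal sketch of how the new option might be set in a source config dict; only the `connect_uri` and `timeout` keys are taken from the code above, the values are hypothetical:

    source_config = {
        "connect_uri": "http://localhost:8088",  # hypothetical Superset instance
        "timeout": 30,  # overrides the new 10-second default applied to each API call
    }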
@@ -285,13 +314,16 @@ class SupersetSource(StatefulIngestionSourceBase):
  }
  )

- # Test the connection
  test_response = requests_session.get(
- f"{self.config.connect_uri}/api/v1/dashboard/"
+ f"{self.config.connect_uri}/api/v1/dashboard/",
+ timeout=self.config.timeout,
  )
- if test_response.status_code == 200:
- pass
- # TODO(Gabe): how should we message about this error?
+ if test_response.status_code != 200:
+ # throw an error and terminate ingestion,
+ # cannot proceed without access token
+ logger.error(
+ f"Failed to log in to Superset with status: {test_response.status_code}"
+ )
  return requests_session

  def paginate_entity_api_results(self, entity_type, page_size=100):
@@ -302,6 +334,7 @@ class SupersetSource(StatefulIngestionSourceBase):
  response = self.session.get(
  f"{self.config.connect_uri}/api/v1/{entity_type}",
  params={"q": f"(page:{current_page},page_size:{page_size})"},
+ timeout=self.config.timeout,
  )

  if response.status_code != 200:
@@ -339,10 +372,11 @@ class SupersetSource(StatefulIngestionSourceBase):
  def get_dataset_info(self, dataset_id: int) -> dict:
  dataset_response = self.session.get(
  f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+ timeout=self.config.timeout,
  )
  if dataset_response.status_code != 200:
  logger.warning(f"Failed to get dataset info: {dataset_response.text}")
- dataset_response.raise_for_status()
+ return {}
  return dataset_response.json()

  def get_datasource_urn_from_id(
@@ -393,8 +427,9 @@ class SupersetSource(StatefulIngestionSourceBase):
  )

  modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  title = dashboard_data.get("dashboard_title", "")
  # note: the API does not currently supply created_by usernames due to a bug
@@ -494,7 +529,119 @@ class SupersetSource(StatefulIngestionSourceBase):
  entity_urn=dashboard_snapshot.urn,
  )

- def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
+ def build_input_fields(
+ self,
+ chart_columns: List[Tuple[str, str, str]],
+ datasource_urn: Union[str, None],
+ ) -> List[InputField]:
+ input_fields: List[InputField] = []
+
+ for column in chart_columns:
+ col_name, col_type, description = column
+ if not col_type or not datasource_urn:
+ continue
+
+ type_class = FIELD_TYPE_MAPPING.get(
+ col_type.upper(), NullTypeClass
+ ) # gets the type mapping
+
+ input_fields.append(
+ InputField(
+ schemaFieldUrn=builder.make_schema_field_urn(
+ parent_urn=str(datasource_urn),
+ field_path=col_name,
+ ),
+ schemaField=SchemaField(
+ fieldPath=col_name,
+ type=SchemaFieldDataType(type=type_class()), # type: ignore
+ description=(description if description != "null" else ""),
+ nativeDataType=col_type,
+ globalTags=None,
+ nullable=True,
+ ),
+ )
+ )
+
+ return input_fields
+
+ def construct_chart_cll(
+ self,
+ chart_data: dict,
+ datasource_urn: Union[str, None],
+ datasource_id: Union[Any, int],
+ ) -> List[InputField]:
+ column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
+ "all_columns", []
+ )
+
+ # the second field represents whether its a SQL expression,
+ # false being just regular column and true being SQL col
+ chart_column_data: List[Tuple[str, bool]] = [
+ (column, False)
+ if isinstance(column, str)
+ else (column.get("label", ""), True)
+ for column in column_data
+ ]
+
+ dataset_columns: List[Tuple[str, str, str]] = []
+
+ # parses the superset dataset's column info, to build type and description info
+ if datasource_id:
+ dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+ dataset_column_info = dataset_info.get("columns", [])
+
+ for column in dataset_column_info:
+ col_name = column.get("column_name", "")
+ col_type = column.get("type", "")
+ col_description = column.get("description", "")
+
+ # if missing column name or column type, cannot construct the column,
+ # so we skip this column, missing description is fine
+ if col_name == "" or col_type == "":
+ logger.info(f"could not construct column lineage for {column}")
+ continue
+
+ dataset_columns.append((col_name, col_type, col_description))
+ else:
+ # if no datasource id, cannot build cll, just return
+ logger.warning(
+ "no datasource id was found, cannot build column level lineage"
+ )
+ return []
+
+ chart_columns: List[Tuple[str, str, str]] = []
+ for chart_col in chart_column_data:
+ chart_col_name, is_sql = chart_col
+ if is_sql:
+ chart_columns.append(
+ (
+ chart_col_name,
+ "SQL",
+ "",
+ )
+ )
+ continue
+
+ # find matching upstream column
+ for dataset_col in dataset_columns:
+ dataset_col_name, dataset_col_type, dataset_col_description = (
+ dataset_col
+ )
+ if dataset_col_name == chart_col_name:
+ chart_columns.append(
+ (chart_col_name, dataset_col_type, dataset_col_description)
+ ) # column name, column type, description
+ break
+
+ # if no matching upstream column was found
+ if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+ chart_columns.append((chart_col_name, "", ""))
+
+ return self.build_input_fields(chart_columns, datasource_urn)
+
+ def construct_chart_from_chart_data(
+ self, chart_data: dict
+ ) -> Iterable[MetadataWorkUnit]:
  chart_urn = make_chart_urn(
  platform=self.platform,
  name=str(chart_data["id"]),
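A minimal sketch (not part of the diff) of the column tuples that construct_chart_cll assembles and hands to build_input_fields; `source` is a hypothetical SupersetSource instance, `datasource_urn` an assumed dataset URN, and the column names and descriptions are made up:

    chart_columns = [
        ("order_date", "DATETIME", "Date the order was placed"),  # matched to an upstream dataset column
        ("revenue_per_user", "SQL", ""),                          # SQL-expression column from form_data
    ]
    input_fields = source.build_input_fields(chart_columns, datasource_urn)
    # one InputField per column, typed via FIELD_TYPE_MAPPING (DateTypeClass and StringTypeClass here)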
@@ -506,8 +653,9 @@ class SupersetSource(StatefulIngestionSourceBase):
  )

  modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  title = chart_data.get("slice_name", "")

@@ -581,6 +729,18 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
  chart_snapshot.aspects.append(chart_info)

+ input_fields = self.construct_chart_cll(
+ chart_data, datasource_urn, datasource_id
+ )
+
+ if input_fields:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=chart_urn,
+ aspect=InputFields(
+ fields=sorted(input_fields, key=lambda x: x.schemaFieldUrn)
+ ),
+ ).as_workunit()
+
  chart_owners_list = self.build_owner_urn(chart_data)
  owners_info = OwnershipClass(
  owners=[
@@ -593,7 +753,14 @@ class SupersetSource(StatefulIngestionSourceBase):
  lastModified=last_modified,
  )
  chart_snapshot.aspects.append(owners_info)
- return chart_snapshot
+ yield MetadataWorkUnit(
+ id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+ )
+
+ yield from self._get_domain_wu(
+ title=chart_data.get("slice_name", ""),
+ entity_urn=chart_urn,
+ )

  def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
  for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
@@ -623,20 +790,12 @@ class SupersetSource(StatefulIngestionSourceBase):
  f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
  )

- chart_snapshot = self.construct_chart_from_chart_data(chart_data)
-
- mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+ yield from self.construct_chart_from_chart_data(chart_data)
  except Exception as e:
  self.report.warning(
  f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
  )
  continue
- # Emit the chart
- yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
- yield from self._get_domain_wu(
- title=chart_data.get("slice_name", ""),
- entity_urn=chart_snapshot.urn,
- )

  def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
  schema_fields: List[SchemaField] = []
@@ -680,6 +839,88 @@ class SupersetSource(StatefulIngestionSourceBase):
  env=self.config.env,
  )

+ def generate_virtual_dataset_lineage(
+ self,
+ parsed_query_object: SqlParsingResult,
+ datasource_urn: str,
+ ) -> UpstreamLineageClass:
+ cll = (
+ parsed_query_object.column_lineage
+ if parsed_query_object.column_lineage is not None
+ else []
+ )
+
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+ for cll_info in cll:
+ downstream = (
+ [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+ if cll_info.downstream and cll_info.downstream.column
+ else []
+ )
+ upstreams = [
+ make_schema_field_urn(column_ref.table, column_ref.column)
+ for column_ref in cll_info.upstreams
+ ]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=[
+ UpstreamClass(
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ dataset=input_table_urn,
+ )
+ for input_table_urn in parsed_query_object.in_tables
+ ],
+ fineGrainedLineages=fine_grained_lineages,
+ )
+ return upstream_lineage
+
+ def generate_physical_dataset_lineage(
+ self,
+ dataset_response: dict,
+ upstream_dataset: str,
+ datasource_urn: str,
+ ) -> UpstreamLineageClass:
+ # To generate column level lineage, we can manually decode the metadata
+ # to produce the ColumnLineageInfo
+ columns = dataset_response.get("result", {}).get("columns", [])
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+ for column in columns:
+ column_name = column.get("column_name", "")
+ if not column_name:
+ continue
+
+ downstream = [make_schema_field_urn(datasource_urn, column_name)]
+ upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=[
+ UpstreamClass(
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ dataset=upstream_dataset,
+ )
+ ],
+ fineGrainedLineages=fine_grained_lineages,
+ )
+ return upstream_lineage
+
  def construct_dataset_from_dataset_data(
  self, dataset_data: dict
  ) -> DatasetSnapshot:
@@ -692,14 +933,23 @@ class SupersetSource(StatefulIngestionSourceBase):
  dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"

  modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)

  upstream_warehouse_platform = (
  dataset_response.get("result", {}).get("database", {}).get("backend")
  )
+ upstream_warehouse_db_name = (
+ dataset_response.get("result", {}).get("database", {}).get("database_name")
+ )
+
+ # if we have rendered sql, we always use that and defualt back to regular sql
+ sql = dataset_response.get("result", {}).get(
+ "rendered_sql"
+ ) or dataset_response.get("result", {}).get("sql")

  # Preset has a way of naming their platforms differently than
  # how datahub names them, so map the platform name to the correct naming
@@ -712,22 +962,28 @@ class SupersetSource(StatefulIngestionSourceBase):
  if upstream_warehouse_platform in warehouse_naming:
  upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]

- # TODO: Categorize physical vs virtual upstream dataset
- # mark all upstream dataset as physical for now, in the future we would ideally like
- # to differentiate physical vs virtual upstream datasets
- tag_urn = f"urn:li:tag:{self.platform}:physical"
  upstream_dataset = self.get_datasource_urn_from_id(
  dataset_response, upstream_warehouse_platform
  )
- upstream_lineage = UpstreamLineageClass(
- upstreams=[
- UpstreamClass(
- type=DatasetLineageTypeClass.TRANSFORMED,
- dataset=upstream_dataset,
- properties={"externalUrl": dataset_url},
- )
- ]
- )
+
+ # Sometimes the field will be null instead of not existing
+ if sql == "null" or not sql:
+ tag_urn = f"urn:li:tag:{self.platform}:physical"
+ upstream_lineage = self.generate_physical_dataset_lineage(
+ dataset_response, upstream_dataset, datasource_urn
+ )
+ else:
+ tag_urn = f"urn:li:tag:{self.platform}:virtual"
+ parsed_query_object = create_lineage_sql_parsed_result(
+ query=sql,
+ default_db=upstream_warehouse_db_name,
+ platform=upstream_warehouse_platform,
+ platform_instance=None,
+ env=self.config.env,
+ )
+ upstream_lineage = self.generate_virtual_dataset_lineage(
+ parsed_query_object, datasource_urn
+ )

  dataset_info = DatasetPropertiesClass(
  name=dataset.table_name,
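A minimal sketch (not part of the diff) of the virtual-dataset branch above; the query string, database name, and env value are hypothetical, while the call signature mirrors the code in the hunk:

    parsed_query_object = create_lineage_sql_parsed_result(
        query="SELECT id, amount FROM public.orders",  # hypothetical SQL backing the virtual dataset
        default_db="analytics",                        # hypothetical warehouse database name
        platform="postgres",
        platform_instance=None,
        env="PROD",
    )
    # parsed_query_object.in_tables drives the table-level upstreams and
    # parsed_query_object.column_lineage the fine-grained lineage in generate_virtual_dataset_lineage()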
datahub/ingestion/source/usage/usage_common.py

@@ -12,11 +12,9 @@ from typing import (
  Optional,
  Tuple,
  TypeVar,
- Union,
  )

  import pydantic
- from deprecated import deprecated
  from pydantic.fields import Field

  import datahub.emitter.mce_builder as builder
@@ -28,19 +26,13 @@ from datahub.configuration.time_window_config import (
  )
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.ingestion.api.workunit import MetadataWorkUnit
- from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetUsageStatistics
  from datahub.metadata.schema_classes import (
- CalendarIntervalClass,
  DatasetFieldUsageCountsClass,
  DatasetUsageStatisticsClass,
  DatasetUserUsageCountsClass,
  TimeWindowSizeClass,
- UsageAggregationClass,
- WindowDurationClass,
  )
  from datahub.utilities.sql_formatter import format_sql_query, trim_query
- from datahub.utilities.urns.dataset_urn import DatasetUrn
- from datahub.utilities.urns.urn import guess_entity_type

  logger = logging.getLogger(__name__)

@@ -295,60 +287,3 @@ class UsageAggregator(Generic[ResourceType]):
  user_urn_builder=user_urn_builder,
  queries_character_limit=self.config.queries_character_limit,
  )
-
-
- @deprecated
- def convert_usage_aggregation_class(
- obj: UsageAggregationClass,
- ) -> MetadataChangeProposalWrapper:
- # Legacy usage aggregation only supported dataset usage stats
- if guess_entity_type(obj.resource) == DatasetUrn.ENTITY_TYPE:
- aspect = DatasetUsageStatistics(
- timestampMillis=obj.bucket,
- eventGranularity=TimeWindowSizeClass(
- unit=convert_window_to_interval(obj.duration)
- ),
- uniqueUserCount=obj.metrics.uniqueUserCount,
- totalSqlQueries=obj.metrics.totalSqlQueries,
- topSqlQueries=obj.metrics.topSqlQueries,
- userCounts=(
- [
- DatasetUserUsageCountsClass(
- user=u.user, count=u.count, userEmail=u.userEmail
- )
- for u in obj.metrics.users
- if u.user is not None
- ]
- if obj.metrics.users
- else None
- ),
- fieldCounts=(
- [
- DatasetFieldUsageCountsClass(fieldPath=f.fieldName, count=f.count)
- for f in obj.metrics.fields
- ]
- if obj.metrics.fields
- else None
- ),
- )
- return MetadataChangeProposalWrapper(entityUrn=obj.resource, aspect=aspect)
- else:
- raise Exception(
- f"Skipping unsupported usage aggregation - invalid entity type: {obj}"
- )
-
-
- @deprecated
- def convert_window_to_interval(window: Union[str, WindowDurationClass]) -> str:
- if window == WindowDurationClass.YEAR:
- return CalendarIntervalClass.YEAR
- elif window == WindowDurationClass.MONTH:
- return CalendarIntervalClass.MONTH
- elif window == WindowDurationClass.WEEK:
- return CalendarIntervalClass.WEEK
- elif window == WindowDurationClass.DAY:
- return CalendarIntervalClass.DAY
- elif window == WindowDurationClass.HOUR:
- return CalendarIntervalClass.HOUR
- else:
- raise Exception(f"Unsupported window duration: {window}")