acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (65)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2486 -2487
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +64 -49
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/ingestion/api/source.py +6 -2
  9. datahub/ingestion/api/source_helpers.py +6 -2
  10. datahub/ingestion/extractor/schema_util.py +1 -0
  11. datahub/ingestion/source/common/data_platforms.py +23 -0
  12. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  13. datahub/ingestion/source/common/subtypes.py +15 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  15. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  16. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  17. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  18. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  19. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  20. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  21. datahub/ingestion/source/hex/__init__.py +0 -0
  22. datahub/ingestion/source/hex/api.py +394 -0
  23. datahub/ingestion/source/hex/constants.py +3 -0
  24. datahub/ingestion/source/hex/hex.py +167 -0
  25. datahub/ingestion/source/hex/mapper.py +372 -0
  26. datahub/ingestion/source/hex/model.py +68 -0
  27. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  28. datahub/ingestion/source/mlflow.py +198 -7
  29. datahub/ingestion/source/mode.py +11 -1
  30. datahub/ingestion/source/openapi.py +69 -34
  31. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  32. datahub/ingestion/source/s3/source.py +11 -0
  33. datahub/ingestion/source/slack/slack.py +399 -82
  34. datahub/ingestion/source/superset.py +138 -22
  35. datahub/ingestion/source/vertexai/__init__.py +0 -0
  36. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  37. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  38. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  39. datahub/metadata/_schema_classes.py +472 -1
  40. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  42. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  43. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  44. datahub/metadata/schema.avsc +311 -2
  45. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  46. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  47. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  48. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  49. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  50. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  51. datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
  52. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  53. datahub/metadata/schemas/Siblings.avsc +2 -0
  54. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  55. datahub/sdk/dataset.py +122 -0
  56. datahub/sdk/entity.py +99 -3
  57. datahub/sdk/entity_client.py +27 -3
  58. datahub/sdk/main_client.py +22 -0
  59. datahub/sdk/search_filters.py +4 -4
  60. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  61. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  62. datahub/testing/mcp_diff.py +1 -18
  63. datahub/ingestion/source/vertexai.py +0 -697
  64. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
  65. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py

@@ -23,6 +23,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_schema_field_urn,
     make_user_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -72,6 +73,9 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -80,6 +84,10 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -181,6 +189,10 @@ class SupersetConfig(
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")

+    timeout: int = Field(
+        default=10, description="Timeout of single API call to superset."
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
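
For orientation, a minimal sketch of how the new timeout option would be set; the values shown (and the use of parse_obj) are illustrative assumptions, not taken from this diff:

    # Hedged sketch: exercising the timeout field added to SupersetConfig above.
    # The connect_uri value is an assumption for illustration only.
    from datahub.ingestion.source.superset import SupersetConfig

    config = SupersetConfig.parse_obj(
        {
            "connect_uri": "http://localhost:8088",
            "timeout": 30,  # seconds per Superset API request; the default is 10
        }
    )
    assert config.timeout == 30
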
@@ -285,13 +297,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             }
         )

-        # Test the connection
         test_response = requests_session.get(
-            f"{self.config.connect_uri}/api/v1/dashboard/"
+            f"{self.config.connect_uri}/api/v1/dashboard/",
+            timeout=self.config.timeout,
         )
-        if test_response.status_code == 200:
-            pass
-            # TODO(Gabe): how should we message about this error?
+        if test_response.status_code != 200:
+            # throw an error and terminate ingestion,
+            # cannot proceed without access token
+            logger.error(
+                f"Failed to log in to Superset with status: {test_response.status_code}"
+            )
         return requests_session

     def paginate_entity_api_results(self, entity_type, page_size=100):
@@ -302,6 +317,7 @@ class SupersetSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
+                timeout=self.config.timeout,
             )

             if response.status_code != 200:
@@ -339,10 +355,11 @@ class SupersetSource(StatefulIngestionSourceBase):
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
             f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+            timeout=self.config.timeout,
         )
         if dataset_response.status_code != 200:
             logger.warning(f"Failed to get dataset info: {dataset_response.text}")
-            dataset_response.raise_for_status()
+            return {}
         return dataset_response.json()

     def get_datasource_urn_from_id(
@@ -393,8 +410,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
+            dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
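
The old default of the literal string "now" is not a value dateutil can parse, so a missing changed_on_utc would previously raise; the new code substitutes a formatted current timestamp. A minimal sketch of the fallback path (the empty payload is an assumption for illustration):

    # Hedged sketch: when "changed_on_utc" is absent, the formatted current
    # time is parsed instead of the unparseable literal string "now".
    from datetime import datetime
    import dateutil.parser as dp

    dashboard_data: dict = {}  # simulated payload with no "changed_on_utc"
    now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
    modified_ts = int(
        dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
    )
    print(modified_ts)  # epoch milliseconds of roughly the current time
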
@@ -506,8 +524,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
+            dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")

@@ -680,6 +699,88 @@ class SupersetSource(StatefulIngestionSourceBase):
             env=self.config.env,
         )

+    def generate_virtual_dataset_lineage(
+        self,
+        parsed_query_object: SqlParsingResult,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        cll = (
+            parsed_query_object.column_lineage
+            if parsed_query_object.column_lineage is not None
+            else []
+        )
+
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for cll_info in cll:
+            downstream = (
+                [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+                if cll_info.downstream and cll_info.downstream.column
+                else []
+            )
+            upstreams = [
+                make_schema_field_urn(column_ref.table, column_ref.column)
+                for column_ref in cll_info.upstreams
+            ]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=input_table_urn,
+                )
+                for input_table_urn in parsed_query_object.in_tables
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
+    def generate_physical_dataset_lineage(
+        self,
+        dataset_response: dict,
+        upstream_dataset: str,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        # To generate column level lineage, we can manually decode the metadata
+        # to produce the ColumnLineageInfo
+        columns = dataset_response.get("result", {}).get("columns", [])
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for column in columns:
+            column_name = column.get("column_name", "")
+            if not column_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, column_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                )
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
     def construct_dataset_from_dataset_data(
         self, dataset_data: dict
     ) -> DatasetSnapshot:
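
Both new helpers key column-level lineage on schemaField URNs built with make_schema_field_urn. A rough illustration of what those entries reference; the platform, table, and column names here are made up:

    # Hedged sketch: how the fine-grained lineage entries above reference columns.
    from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn

    upstream_table = make_dataset_urn(platform="postgres", name="public.orders", env="PROD")
    field_urn = make_schema_field_urn(upstream_table, "order_id")
    print(field_urn)
    # urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,public.orders,PROD),order_id)
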
@@ -692,14 +793,23 @@ class SupersetSource(StatefulIngestionSourceBase):
         dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+            dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)

         upstream_warehouse_platform = (
             dataset_response.get("result", {}).get("database", {}).get("backend")
         )
+        upstream_warehouse_db_name = (
+            dataset_response.get("result", {}).get("database", {}).get("database_name")
+        )
+
+        # if we have rendered sql, we always use that and fall back to regular sql
+        sql = dataset_response.get("result", {}).get(
+            "rendered_sql"
+        ) or dataset_response.get("result", {}).get("sql")

         # Preset has a way of naming their platforms differently than
         # how datahub names them, so map the platform name to the correct naming
@@ -712,22 +822,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         if upstream_warehouse_platform in warehouse_naming:
             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]

-        # TODO: Categorize physical vs virtual upstream dataset
-        # mark all upstream dataset as physical for now, in the future we would ideally like
-        # to differentiate physical vs virtual upstream datasets
-        tag_urn = f"urn:li:tag:{self.platform}:physical"
         upstream_dataset = self.get_datasource_urn_from_id(
             dataset_response, upstream_warehouse_platform
         )
-        upstream_lineage = UpstreamLineageClass(
-            upstreams=[
-                UpstreamClass(
-                    type=DatasetLineageTypeClass.TRANSFORMED,
-                    dataset=upstream_dataset,
-                    properties={"externalUrl": dataset_url},
-                )
-            ]
-        )
+
+        # Sometimes the field will be null instead of not existing
+        if sql == "null" or not sql:
+            tag_urn = f"urn:li:tag:{self.platform}:physical"
+            upstream_lineage = self.generate_physical_dataset_lineage(
+                dataset_response, upstream_dataset, datasource_urn
+            )
+        else:
+            tag_urn = f"urn:li:tag:{self.platform}:virtual"
+            parsed_query_object = create_lineage_sql_parsed_result(
+                query=sql,
+                default_db=upstream_warehouse_db_name,
+                platform=upstream_warehouse_platform,
+                platform_instance=None,
+                env=self.config.env,
+            )
+            upstream_lineage = self.generate_virtual_dataset_lineage(
+                parsed_query_object, datasource_urn
+            )

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
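
A rough sketch of what the virtual-dataset branch does with a dataset's SQL definition; the query, database name, and platform here are illustrative assumptions, and the result fields consumed are in_tables and column_lineage, as in generate_virtual_dataset_lineage above:

    # Hedged sketch: parsing a virtual dataset's SQL the same way the new branch does.
    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    parsed = create_lineage_sql_parsed_result(
        query="SELECT order_id, SUM(amount) AS total FROM public.orders GROUP BY order_id",
        default_db="analytics",
        platform="postgres",
        platform_instance=None,
        env="PROD",
    )
    print(parsed.in_tables)       # upstream dataset URNs consumed by the query
    print(parsed.column_lineage)  # per-column refs fed into generate_virtual_dataset_lineage
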