acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic (further details were linked from the original page).

Files changed (84)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2562 -2476
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +46 -6
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  11. datahub/ingestion/source/common/subtypes.py +3 -0
  12. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  13. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  14. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  15. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  16. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  17. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  18. datahub/ingestion/source/excel/__init__.py +0 -0
  19. datahub/ingestion/source/excel/config.py +92 -0
  20. datahub/ingestion/source/excel/excel_file.py +539 -0
  21. datahub/ingestion/source/excel/profiling.py +308 -0
  22. datahub/ingestion/source/excel/report.py +49 -0
  23. datahub/ingestion/source/excel/source.py +662 -0
  24. datahub/ingestion/source/excel/util.py +18 -0
  25. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  26. datahub/ingestion/source/openapi.py +1 -1
  27. datahub/ingestion/source/powerbi/config.py +33 -0
  28. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  29. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  31. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  32. datahub/ingestion/source/redshift/config.py +9 -6
  33. datahub/ingestion/source/redshift/lineage.py +386 -687
  34. datahub/ingestion/source/redshift/redshift.py +19 -106
  35. datahub/ingestion/source/s3/source.py +65 -59
  36. datahub/ingestion/source/snowflake/constants.py +2 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  39. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  40. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  42. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
  43. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  44. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
  46. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  47. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  48. datahub/ingestion/source/sql/mssql/source.py +62 -3
  49. datahub/ingestion/source/sql_queries.py +24 -2
  50. datahub/ingestion/source/state/checkpoint.py +3 -28
  51. datahub/ingestion/source/unity/config.py +74 -9
  52. datahub/ingestion/source/unity/proxy.py +167 -5
  53. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  54. datahub/ingestion/source/unity/proxy_types.py +24 -0
  55. datahub/ingestion/source/unity/report.py +5 -0
  56. datahub/ingestion/source/unity/source.py +111 -1
  57. datahub/ingestion/source/usage/usage_common.py +1 -0
  58. datahub/metadata/_internal_schema_classes.py +573 -517
  59. datahub/metadata/_urns/urn_defs.py +1748 -1748
  60. datahub/metadata/schema.avsc +18564 -18484
  61. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  62. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  63. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  64. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  65. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  66. datahub/metadata/schemas/Ownership.avsc +69 -0
  67. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  68. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  69. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  70. datahub/metadata/schemas/__init__.py +3 -3
  71. datahub/sdk/chart.py +36 -22
  72. datahub/sdk/dashboard.py +38 -62
  73. datahub/sdk/lineage_client.py +6 -26
  74. datahub/sdk/main_client.py +7 -3
  75. datahub/sdk/search_filters.py +16 -0
  76. datahub/specific/aspect_helpers/siblings.py +73 -0
  77. datahub/specific/dataset.py +2 -0
  78. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  79. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  80. datahub/upgrade/upgrade.py +14 -2
  81. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  82. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
  83. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
  84. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
@@ -337,3 +337,27 @@ class Notebook:
337
337
  "upstreams": frozenset([*notebook.upstreams, upstream]),
338
338
  }
339
339
  )
340
+
341
+
342
+ @dataclass
343
+ class Model:
344
+ id: str
345
+ name: str
346
+ schema_name: str
347
+ catalog_name: str
348
+ description: Optional[str]
349
+ created_at: Optional[datetime]
350
+ updated_at: Optional[datetime]
351
+
352
+
353
+ @dataclass
354
+ class ModelVersion:
355
+ id: str
356
+ name: str
357
+ model: Model
358
+ version: str
359
+ aliases: Optional[List[str]]
360
+ description: Optional[str]
361
+ created_at: Optional[datetime]
362
+ updated_at: Optional[datetime]
363
+ created_by: Optional[str]
@@ -31,6 +31,10 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
31
31
  tables: EntityFilterReport = EntityFilterReport.field(type="table/view")
32
32
  table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile")
33
33
  notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook")
34
+ ml_models: EntityFilterReport = EntityFilterReport.field(type="ml_model")
35
+ ml_model_versions: EntityFilterReport = EntityFilterReport.field(
36
+ type="ml_model_version"
37
+ )
34
38
 
35
39
  hive_metastore_catalog_found: Optional[bool] = None
36
40
 
@@ -64,6 +68,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
64
68
  num_catalogs_missing_name: int = 0
65
69
  num_schemas_missing_name: int = 0
66
70
  num_tables_missing_name: int = 0
71
+ num_ml_models_missing_name: int = 0
67
72
  num_columns_missing_name: int = 0
68
73
  num_queries_missing_info: int = 0
69
74
 
@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
12
12
  make_dataset_urn_with_platform_instance,
13
13
  make_domain_urn,
14
14
  make_group_urn,
15
+ make_ml_model_group_urn,
15
16
  make_schema_field_urn,
16
17
  make_ts_millis,
17
18
  make_user_urn,
@@ -26,6 +27,7 @@ from datahub.emitter.mcp_builder import (
26
27
  UnitySchemaKey,
27
28
  UnitySchemaKeyWithMetastore,
28
29
  add_dataset_to_container,
30
+ add_entity_to_container,
29
31
  gen_containers,
30
32
  )
31
33
  from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
@@ -87,6 +89,8 @@ from datahub.ingestion.source.unity.proxy_types import (
87
89
  CustomCatalogType,
88
90
  HiveTableType,
89
91
  Metastore,
92
+ Model,
93
+ ModelVersion,
90
94
  Notebook,
91
95
  NotebookId,
92
96
  Schema,
@@ -121,6 +125,7 @@ from datahub.metadata.schema_classes import (
121
125
  DatasetLineageTypeClass,
122
126
  DatasetPropertiesClass,
123
127
  DomainsClass,
128
+ MLModelPropertiesClass,
124
129
  MySqlDDLClass,
125
130
  NullTypeClass,
126
131
  OwnerClass,
@@ -134,7 +139,8 @@ from datahub.metadata.schema_classes import (
134
139
  UpstreamClass,
135
140
  UpstreamLineageClass,
136
141
  )
137
- from datahub.metadata.urns import TagUrn
142
+ from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
143
+ from datahub.sdk import MLModel, MLModelGroup
138
144
  from datahub.sql_parsing.schema_resolver import SchemaResolver
139
145
  from datahub.sql_parsing.sqlglot_lineage import (
140
146
  SqlParsingResult,
@@ -182,6 +188,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
182
188
  - metastores
183
189
  - schemas
184
190
  - tables and column lineage
191
+ - model and model versions
185
192
  """
186
193
 
187
194
  config: UnityCatalogSourceConfig
@@ -250,6 +257,18 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
250
257
  else:
251
258
  self.platform_resource_repository = None
252
259
 
260
+ if self.config._forced_disable_tag_extraction:
261
+ self.report.report_warning(
262
+ "Some features disabled because of configuration conflicts",
263
+ "Tag Extraction is disabled due to missing warehouse_id in config",
264
+ )
265
+
266
+ if self.config._forced_disable_hive_metastore_extraction:
267
+ self.report.report_warning(
268
+ "Some features disabled because of configuration conflicts",
269
+ "Hive Metastore Extraction is disabled due to missing warehouse_id in config",
270
+ )
271
+
253
272
  # Include platform resource repository in report for automatic cache statistics
254
273
  if self.config.include_tags and self.platform_resource_repository:
255
274
  self.report.tag_urn_resolver_cache = self.platform_resource_repository
@@ -512,6 +531,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
512
531
  yield from self.gen_schema_containers(schema)
513
532
  try:
514
533
  yield from self.process_tables(schema)
534
+ yield from self.process_ml_models(schema)
515
535
  except Exception as e:
516
536
  logger.exception(f"Error parsing schema {schema}")
517
537
  self.report.report_warning(
@@ -665,6 +685,69 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
665
685
  )
666
686
  ]
667
687
 
688
+ def process_ml_models(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
689
+ for ml_model in self.unity_catalog_api_proxy.ml_models(
690
+ schema=schema, max_results=self.config.ml_model_max_results
691
+ ):
692
+ yield from self.process_ml_model(ml_model, schema)
693
+ ml_model_urn = self.gen_ml_model_urn(ml_model.id)
694
+ for ml_model_version in self.unity_catalog_api_proxy.ml_model_versions(
695
+ ml_model, include_aliases=self.config.include_ml_model_aliases
696
+ ):
697
+ yield from self.process_ml_model_version(
698
+ ml_model_urn, ml_model_version, schema
699
+ )
700
+
701
+ def process_ml_model(
702
+ self, ml_model: Model, schema: Schema
703
+ ) -> Iterable[MetadataWorkUnit]:
704
+ ml_model_group = MLModelGroup(
705
+ id=ml_model.id,
706
+ name=ml_model.name,
707
+ platform=self.platform,
708
+ platform_instance=schema.name,
709
+ env=self.config.env,
710
+ description=ml_model.description,
711
+ created=ml_model.created_at,
712
+ last_modified=ml_model.updated_at,
713
+ )
714
+ yield from ml_model_group.as_workunits()
715
+ yield from self.add_model_to_schema_container(str(ml_model_group.urn), schema)
716
+ self.report.ml_models.processed(ml_model.id)
717
+
718
+ def process_ml_model_version(
719
+ self, ml_model_urn: str, ml_model_version: ModelVersion, schema: Schema
720
+ ) -> Iterable[MetadataWorkUnit]:
721
+ extra_aspects = []
722
+ if ml_model_version.created_at is not None:
723
+ created_time = int(ml_model_version.created_at.timestamp() * 1000)
724
+ created_actor = (
725
+ f"urn:li:platformResource:{ml_model_version.created_by}"
726
+ if ml_model_version.created_by
727
+ else None
728
+ )
729
+ extra_aspects.append(
730
+ MLModelPropertiesClass(
731
+ created=TimeStampClass(time=created_time, actor=created_actor),
732
+ )
733
+ )
734
+
735
+ ml_model = MLModel(
736
+ id=ml_model_version.id,
737
+ name=ml_model_version.name,
738
+ version=str(ml_model_version.version),
739
+ aliases=ml_model_version.aliases,
740
+ description=ml_model_version.description,
741
+ model_group=ml_model_urn,
742
+ platform=self.platform,
743
+ last_modified=ml_model_version.updated_at,
744
+ extra_aspects=extra_aspects,
745
+ )
746
+
747
+ yield from ml_model.as_workunits()
748
+ yield from self.add_model_version_to_schema_container(str(ml_model.urn), schema)
749
+ self.report.ml_model_versions.processed(ml_model_version.id)
750
+
668
751
  def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
669
752
  # Calculate datetime filters for lineage
670
753
  lineage_start_time = None
@@ -802,6 +885,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
802
885
  env=self.config.env,
803
886
  )
804
887
 
888
+ def gen_ml_model_urn(self, name: str) -> str:
889
+ return make_ml_model_group_urn(
890
+ platform=self.platform,
891
+ group_name=name,
892
+ env=self.config.env,
893
+ )
894
+
805
895
  def gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str:
806
896
  notebook_id = notebook.id if isinstance(notebook, Notebook) else notebook
807
897
  return NotebookKey(
@@ -973,6 +1063,26 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
973
1063
  dataset_urn=dataset_urn,
974
1064
  )
975
1065
 
1066
+ def add_model_to_schema_container(
1067
+ self, model_urn: str, schema: Schema
1068
+ ) -> Iterable[MetadataWorkUnit]:
1069
+ schema_container_key = self.gen_schema_key(schema)
1070
+ yield from add_entity_to_container(
1071
+ container_key=schema_container_key,
1072
+ entity_type=MlModelGroupUrn.ENTITY_TYPE,
1073
+ entity_urn=model_urn,
1074
+ )
1075
+
1076
+ def add_model_version_to_schema_container(
1077
+ self, model_version_urn: str, schema: Schema
1078
+ ) -> Iterable[MetadataWorkUnit]:
1079
+ schema_container_key = self.gen_schema_key(schema)
1080
+ yield from add_entity_to_container(
1081
+ container_key=schema_container_key,
1082
+ entity_type=MlModelUrn.ENTITY_TYPE,
1083
+ entity_urn=model_version_urn,
1084
+ )
1085
+
976
1086
  def _get_catalog_tags(
977
1087
  self, catalog: str, schema: str, table: str
978
1088
  ) -> List[UnityCatalogTag]:
@@ -268,6 +268,7 @@ class UsageAggregator(Generic[ResourceType]):
268
268
  user,
269
269
  query,
270
270
  fields,
271
+ user_email_pattern=self.config.user_email_pattern,
271
272
  count=count,
272
273
  )
273
274