acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (156)
  1. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
  2. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
  3. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +41 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  60. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  61. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  62. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  63. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  64. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  65. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  66. datahub/ingestion/source/ge_data_profiler.py +76 -28
  67. datahub/ingestion/source/hex/api.py +26 -1
  68. datahub/ingestion/source/identity/azure_ad.py +1 -1
  69. datahub/ingestion/source/identity/okta.py +1 -14
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  72. datahub/ingestion/source/mlflow.py +11 -1
  73. datahub/ingestion/source/mock_data/__init__.py +0 -0
  74. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  75. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  76. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  77. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  78. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  79. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  80. datahub/ingestion/source/preset.py +2 -2
  81. datahub/ingestion/source/redshift/redshift.py +17 -0
  82. datahub/ingestion/source/redshift/usage.py +4 -3
  83. datahub/ingestion/source/s3/report.py +4 -2
  84. datahub/ingestion/source/s3/source.py +367 -115
  85. datahub/ingestion/source/salesforce.py +6 -3
  86. datahub/ingestion/source/sigma/sigma.py +6 -1
  87. datahub/ingestion/source/slack/slack.py +2 -1
  88. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  89. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  90. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  91. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  92. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  93. datahub/ingestion/source/sql/athena.py +119 -12
  94. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  95. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  96. datahub/ingestion/source/sql/mssql/source.py +24 -15
  97. datahub/ingestion/source/sql/oracle.py +1 -1
  98. datahub/ingestion/source/sql/sql_common.py +11 -0
  99. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  100. datahub/ingestion/source/sql/teradata.py +997 -235
  101. datahub/ingestion/source/sql/vertica.py +10 -6
  102. datahub/ingestion/source/sql_queries.py +2 -2
  103. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  104. datahub/ingestion/source/superset.py +57 -2
  105. datahub/ingestion/source/tableau/tableau.py +57 -37
  106. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  107. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  108. datahub/ingestion/source/unity/proxy.py +4 -3
  109. datahub/ingestion/source/unity/source.py +56 -30
  110. datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
  111. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  112. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  113. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  114. datahub/metadata/_internal_schema_classes.py +1253 -536
  115. datahub/metadata/_urns/urn_defs.py +1797 -1685
  116. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  117. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  118. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  119. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  120. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  121. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  122. datahub/metadata/schema.avsc +16614 -16538
  123. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  124. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  125. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  126. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  127. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  128. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  129. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  130. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  131. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  132. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  133. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  134. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  135. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  136. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  137. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  138. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  139. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  140. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  141. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  142. datahub/sdk/datajob.py +39 -15
  143. datahub/sdk/lineage_client.py +2 -0
  144. datahub/sdk/main_client.py +14 -2
  145. datahub/sdk/search_client.py +4 -3
  146. datahub/specific/dataproduct.py +4 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  148. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  149. datahub/telemetry/telemetry.py +17 -11
  150. datahub/upgrade/upgrade.py +46 -13
  151. datahub/utilities/server_config_util.py +8 -0
  152. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  153. datahub/utilities/stats_collections.py +4 -0
  154. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  155. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  156. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
52
52
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
53
53
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
54
54
  from datahub.metadata.schema_classes import (
55
- ChangeTypeClass,
56
55
  DatasetPropertiesClass,
57
56
  SubTypesClass,
58
57
  ViewPropertiesClass,
@@ -601,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
601
600
  yield dpi_aspect
602
601
 
603
602
  yield MetadataChangeProposalWrapper(
604
- entityType="dataset",
605
- changeType=ChangeTypeClass.UPSERT,
606
603
  entityUrn=dataset_urn,
607
- aspectName="subTypes",
608
604
  aspect=SubTypesClass(typeNames=[self.table_subtype]),
609
605
  ).as_workunit()
610
606
 
@@ -810,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
810
806
 
811
807
  # Add views subtype
812
808
  yield MetadataChangeProposalWrapper(
813
- entityType="dataset",
814
- changeType=ChangeTypeClass.UPSERT,
815
809
  entityUrn=dataset_urn,
816
- aspectName="subTypes",
817
810
  aspect=SubTypesClass(typeNames=[self.view_subtype]),
818
811
  ).as_workunit()
819
812
 
@@ -824,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
824
817
  viewLogic=dataset.view_definition if dataset.view_definition else "",
825
818
  )
826
819
  yield MetadataChangeProposalWrapper(
827
- entityType="dataset",
828
- changeType=ChangeTypeClass.UPSERT,
829
820
  entityUrn=dataset_urn,
830
- aspectName="viewProperties",
831
821
  aspect=view_properties_aspect,
832
822
  ).as_workunit()
833
823
 
@@ -27,6 +27,7 @@ from datahub.ingestion.api.decorators import (
27
27
  from datahub.ingestion.api.source import StructuredLogLevel
28
28
  from datahub.ingestion.api.source_helpers import auto_workunit
29
29
  from datahub.ingestion.api.workunit import MetadataWorkUnit
30
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
30
31
  from datahub.ingestion.source.sql.mssql.job_models import (
31
32
  JobStep,
32
33
  MSSQLDataFlow,
@@ -177,10 +178,18 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
177
178
  @capability(
178
179
  SourceCapability.LINEAGE_COARSE,
179
180
  "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
181
+ subtype_modifier=[
182
+ SourceCapabilityModifier.STORED_PROCEDURE,
183
+ SourceCapabilityModifier.VIEW,
184
+ ],
180
185
  )
181
186
  @capability(
182
187
  SourceCapability.LINEAGE_FINE,
183
188
  "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
189
+ subtype_modifier=[
190
+ SourceCapabilityModifier.STORED_PROCEDURE,
191
+ SourceCapabilityModifier.VIEW,
192
+ ],
184
193
  )
185
194
  class SQLServerSource(SQLAlchemySource):
186
195
  """
@@ -936,25 +945,25 @@ class SQLServerSource(SQLAlchemySource):
936
945
  url = self.config.get_sql_alchemy_url()
937
946
  logger.debug(f"sql_alchemy_url={url}")
938
947
  engine = create_engine(url, **self.config.options)
939
- with engine.connect() as conn:
940
- if self.config.database and self.config.database != "":
941
- inspector = inspect(conn)
942
- yield inspector
943
- else:
948
+
949
+ if self.config.database and self.config.database != "":
950
+ inspector = inspect(engine)
951
+ yield inspector
952
+ else:
953
+ with engine.begin() as conn:
944
954
  databases = conn.execute(
945
955
  "SELECT name FROM master.sys.databases WHERE name NOT IN \
946
956
  ('master', 'model', 'msdb', 'tempdb', 'Resource', \
947
957
  'distribution' , 'reportserver', 'reportservertempdb'); "
948
- )
949
- for db in databases:
950
- if self.config.database_pattern.allowed(db["name"]):
951
- url = self.config.get_sql_alchemy_url(current_db=db["name"])
952
- with create_engine(
953
- url, **self.config.options
954
- ).connect() as conn:
955
- inspector = inspect(conn)
956
- self.current_database = db["name"]
957
- yield inspector
958
+ ).fetchall()
959
+
960
+ for db in databases:
961
+ if self.config.database_pattern.allowed(db["name"]):
962
+ url = self.config.get_sql_alchemy_url(current_db=db["name"])
963
+ engine = create_engine(url, **self.config.options)
964
+ inspector = inspect(engine)
965
+ self.current_database = db["name"]
966
+ yield inspector
958
967
 
959
968
  def get_identifier(
960
969
  self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
@@ -441,7 +441,7 @@ class OracleInspectorObjectWrapper:
441
441
  "\nac.constraint_name,"
442
442
  "\nac.constraint_type,"
443
443
  "\nacc.column_name AS local_column,"
444
- "\nac.r_table_name AS remote_table,"
444
+ "\nac.table_name AS remote_table,"
445
445
  "\nrcc.column_name AS remote_column,"
446
446
  "\nac.r_owner AS remote_owner,"
447
447
  "\nacc.position AS loc_pos,"
@@ -54,6 +54,7 @@ from datahub.ingestion.source.common.data_reader import DataReader
54
54
  from datahub.ingestion.source.common.subtypes import (
55
55
  DatasetContainerSubTypes,
56
56
  DatasetSubTypes,
57
+ SourceCapabilityModifier,
57
58
  )
58
59
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
59
60
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
@@ -291,6 +292,10 @@ class ProfileMetadata:
291
292
  SourceCapability.CONTAINERS,
292
293
  "Enabled by default",
293
294
  supported=True,
295
+ subtype_modifier=[
296
+ SourceCapabilityModifier.DATABASE,
297
+ SourceCapabilityModifier.SCHEMA,
298
+ ],
294
299
  )
295
300
  @capability(
296
301
  SourceCapability.DESCRIPTIONS,
@@ -305,10 +310,12 @@ class ProfileMetadata:
305
310
  @capability(
306
311
  SourceCapability.LINEAGE_COARSE,
307
312
  "Enabled by default to get lineage for views via `include_view_lineage`",
313
+ subtype_modifier=[SourceCapabilityModifier.VIEW],
308
314
  )
309
315
  @capability(
310
316
  SourceCapability.LINEAGE_FINE,
311
317
  "Enabled by default to get lineage for views via `include_view_column_lineage`",
318
+ subtype_modifier=[SourceCapabilityModifier.VIEW],
312
319
  )
313
320
  @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
314
321
  @capability(
@@ -586,6 +593,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
586
593
  )
587
594
 
588
595
  # Generate workunit for aggregated SQL parsing results
596
+ yield from self._generate_aggregator_workunits()
597
+
598
+ def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
599
+ """Generate work units from SQL parsing aggregator. Can be overridden by subclasses."""
589
600
  for mcp in self.aggregator.gen_metadata():
590
601
  yield mcp.as_workunit()
591
602
 
@@ -57,10 +57,11 @@ class GenericProfiler:
57
57
  platform: Optional[str] = None,
58
58
  profiler_args: Optional[Dict] = None,
59
59
  ) -> Iterable[MetadataWorkUnit]:
60
+ # We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
60
61
  ge_profile_requests: List[GEProfilerRequest] = [
61
62
  cast(GEProfilerRequest, request)
62
63
  for request in requests
63
- if not request.profile_table_level_only
64
+ if not request.profile_table_level_only or request.table.rows_count == 0
64
65
  ]
65
66
  table_level_profile_requests: List[TableProfilerRequest] = [
66
67
  request for request in requests if request.profile_table_level_only