acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/graph/links.py
@@ -0,0 +1,53 @@
+from typing import Optional
+
+import datahub.metadata.urns as urns
+from datahub.utilities.urns.urn import guess_entity_type
+
+_url_prefixes = {
+    # Atypical mappings.
+    urns.DataJobUrn.ENTITY_TYPE: "tasks",
+    urns.DataFlowUrn.ENTITY_TYPE: "pipelines",
+    urns.CorpUserUrn.ENTITY_TYPE: "user",
+    urns.CorpGroupUrn.ENTITY_TYPE: "group",
+    # Normal mappings - matches the entity type.
+    urns.ChartUrn.ENTITY_TYPE: "chart",
+    urns.ContainerUrn.ENTITY_TYPE: "container",
+    urns.DataProductUrn.ENTITY_TYPE: "dataProduct",
+    urns.DatasetUrn.ENTITY_TYPE: "dataset",
+    urns.DashboardUrn.ENTITY_TYPE: "dashboard",
+    urns.DomainUrn.ENTITY_TYPE: "domain",
+    urns.GlossaryNodeUrn.ENTITY_TYPE: "glossaryNode",
+    urns.GlossaryTermUrn.ENTITY_TYPE: "glossaryTerm",
+    urns.TagUrn.ENTITY_TYPE: "tag",
+}
+
+
+def make_url_for_urn(
+    frontend_base_url: str,
+    entity_urn: str,
+    *,
+    tab: Optional[str] = None,
+) -> str:
+    """Build the public-facing URL for an entity urn.
+
+    Args:
+        frontend_url: The public-facing base url of the frontend.
+        entity_urn: The urn of the entity to get the url for.
+        tab: The tab to deep link into. If not provided, the default tab for the entity will be shown.
+
+    Returns:
+        The public-facing url for the entity.
+
+    Examples:
+        >>> make_url_for_urn("https://demo.datahub.com", "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992", tab="Contents")
+        'https://demo.datahub.com/container/urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992/Contents'
+        >>> make_url_for_urn("https://demo.datahub.com", "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)")
+        'https://demo.datahub.com/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.actuating,PROD)/'
+    """
+    entity_type = guess_entity_type(entity_urn)
+
+    url_prefix = _url_prefixes.get(entity_type, entity_type)
+    url = f"{frontend_base_url}/{url_prefix}/{entity_urn}/"
+    if tab:
+        url += f"{tab}"
+    return url
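For orientation, here is a minimal usage sketch of the new make_url_for_urn helper added above; the base URL and container URN are the placeholder values from its docstring, and the dashboard URN is a made-up placeholder:

from datahub.ingestion.graph.links import make_url_for_urn

# Deep link into the "Contents" tab of a container.
url = make_url_for_urn(
    "https://demo.datahub.com",
    "urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992",
    tab="Contents",
)
print(url)
# https://demo.datahub.com/container/urn:li:container:b41c14bc5cb3ccfbb0433c8cbdef2992/Contents

# Entity types without an entry in _url_prefixes fall back to the entity type
# itself as the URL path segment.
print(make_url_for_urn("https://demo.datahub.com", "urn:li:dashboard:(looker,dashboards.1)"))
# https://demo.datahub.com/dashboard/urn:li:dashboard:(looker,dashboards.1)/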
datahub/ingestion/run/pipeline.py
@@ -31,6 +31,7 @@ from datahub.ingestion.api.source import Extractor, Source
 from datahub.ingestion.api.transform import Transformer
 from datahub.ingestion.extractor.extractor_registry import extractor_registry
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.reporting.reporting_provider_registry import (
     reporting_provider_registry,
 )
@@ -136,9 +137,8 @@ class CliReport(Report):


 def _make_default_rest_sink(ctx: PipelineContext) -> DatahubRestSink:
-    graph = get_default_graph()
+    graph = get_default_graph(ClientMode.INGESTION)
     sink_config = graph._make_rest_sink_config()
-
     return DatahubRestSink(ctx, sink_config)

@@ -175,6 +175,7 @@ class Pipeline:
         self.graph: Optional[DataHubGraph] = None
         with _add_init_error_context("connect to DataHub"):
             if self.config.datahub_api:
+                self.config.datahub_api.client_mode = ClientMode.INGESTION
                 self.graph = exit_stack.enter_context(
                     DataHubGraph(self.config.datahub_api)
                 )
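The two pipeline.py hunks above wire ClientMode.INGESTION into the graph client. A condensed sketch of the same call sequence that _make_default_rest_sink uses; the idea that the mode is used to attribute traffic by client type is an inference, not something stated in the diff:

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.graph.config import ClientMode

# Build a graph client tagged as ingestion traffic, then derive a REST sink
# config from it, exactly as _make_default_rest_sink does in the hunk above.
graph = get_default_graph(ClientMode.INGESTION)
sink_config = graph._make_rest_sink_config()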
@@ -555,18 +556,20 @@
     def raise_from_status(self, raise_warnings: bool = False) -> None:
         if self.source.get_report().failures:
             raise PipelineExecutionError(
-                "Source reported errors", self.source.get_report()
+                "Source reported errors", self.source.get_report().failures
             )
         if self.sink.get_report().failures:
-            raise PipelineExecutionError("Sink reported errors", self.sink.get_report())
+            raise PipelineExecutionError(
+                "Sink reported errors", self.sink.get_report().failures
+            )
         if raise_warnings:
             if self.source.get_report().warnings:
                 raise PipelineExecutionError(
-                    "Source reported warnings", self.source.get_report()
+                    "Source reported warnings", self.source.get_report().warnings
                 )
             if self.sink.get_report().warnings:
                 raise PipelineExecutionError(
-                    "Sink reported warnings", self.sink.get_report()
+                    "Sink reported warnings", self.sink.get_report().warnings
                 )

     def log_ingestion_stats(self) -> None:
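A caller-side sketch of the revised raise_from_status behavior; `pipeline` is assumed to be an already-constructed datahub Pipeline, and the exception is caught broadly here to avoid guessing its import path:

# Run an ingestion pipeline, then surface failures (and optionally warnings) as exceptions.
pipeline.run()
try:
    pipeline.raise_from_status(raise_warnings=True)
except Exception as err:
    # With this change the exception payload is only the list of failure or
    # warning entries, not the whole source/sink report object.
    print(err.args)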
datahub/ingestion/run/pipeline_config.py
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional
 from pydantic import Field, validator

 from datahub.configuration.common import ConfigModel, DynamicTypedConfig
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig

 logger = logging.getLogger(__name__)
datahub/ingestion/sink/datahub_rest.py
@@ -34,7 +34,7 @@ from datahub.ingestion.api.sink (
     WriteCallback,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.graph.client import DatahubClientConfig
+from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeEvent,
     MetadataChangeProposal,
@@ -140,11 +140,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 f"💥 Failed to connect to DataHub with {repr(self.emitter)}"
             ) from exc

-        self.report.gms_version = (
-            gms_config.get("versions", {})
-            .get("acryldata/datahub", {})
-            .get("version", None)
-        )
+        self.report.gms_version = gms_config.service_version
         self.report.mode = self.config.mode
         self.report.max_threads = self.config.max_threads
         logger.debug("Setting env variables to override config")
@@ -180,6 +176,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             disable_ssl_verification=config.disable_ssl_verification,
             openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
             default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
+            client_mode=config.client_mode,
+            datahub_component=config.datahub_component,
         )

     @property
@@ -190,6 +188,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         # https://github.com/psf/requests/issues/1871#issuecomment-32751346
         thread_local = self._emitter_thread_local
         if not hasattr(thread_local, "emitter"):
+            self.config.client_mode = ClientMode.INGESTION
             thread_local.emitter = DatahubRestSink._make_emitter(self.config)
         return thread_local.emitter

datahub/ingestion/source/apply/datahub_apply.py
@@ -18,6 +18,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, Sour
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.schema_classes import (
     DomainsClass,
     GlossaryTermAssociationClass,
@@ -48,7 +49,7 @@ def apply_association_to_container(
     """
     urns: List[str] = [container_urn]
     if not graph:
-        graph = get_default_graph()
+        graph = get_default_graph(ClientMode.INGESTION)
     logger.info(f"Using {graph}")
     urns.extend(
         graph.get_urns_by_filter(
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
             textwrap.dedent(
                 f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+                (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
             )
         )

datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
            ):
                return

-            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
-                with BigQueryQueriesExtractor(
-                    connection=self.config.get_bigquery_client(),
-                    schema_api=self.bq_schema_extractor.schema_api,
-                    config=BigQueryQueriesExtractorConfig(
-                        window=self.config,
-                        user_email_pattern=self.config.usage.user_email_pattern,
-                        include_lineage=self.config.include_table_lineage,
-                        include_usage_statistics=self.config.include_usage_statistics,
-                        include_operations=self.config.usage.include_operational_stats,
-                        include_queries=self.config.include_queries,
-                        include_query_usage_statistics=self.config.include_query_usage_statistics,
-                        top_n_queries=self.config.usage.top_n_queries,
-                        region_qualifiers=self.config.region_qualifiers,
-                    ),
-                    structured_report=self.report,
-                    filters=self.filters,
-                    identifiers=self.identifiers,
-                    schema_resolver=self.sql_parser_schema_resolver,
-                    discovered_tables=self.bq_schema_extractor.table_refs,
-                ) as queries_extractor:
-                    self.report.queries_extractor = queries_extractor.report
-                    yield from queries_extractor.get_workunits_internal()
+            with self.report.new_stage(
+                f"*: {QUERIES_EXTRACTION}"
+            ), BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor:
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
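This hunk, like the Cassandra profiler and DataHub database reader changes below, folds a nested pair of with-blocks into a single multi-context with statement. The two forms are equivalent in Python; a generic illustration with a stand-in context manager:

from contextlib import contextmanager

@contextmanager
def ctx(name: str):
    print(f"enter {name}")
    try:
        yield name
    finally:
        print(f"exit {name}")

# Nested form, as in the removed code:
with ctx("outer") as a:
    with ctx("inner") as b:
        print(a, b)

# Single-statement form, as in the added code -- same enter/exit order:
with ctx("outer") as a, ctx("inner") as b:
    print(a, b)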
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -2,10 +2,8 @@ import logging
 import os
 import re
 from datetime import timedelta
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union

-from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
-from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
@@ -18,7 +16,9 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
-from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -105,64 +105,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )


-class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[GCPCredential] = Field(
-        default=None, description="BigQuery credential informations"
-    )
-
-    _credentials_path: Optional[str] = PrivateAttr(None)
-
-    extra_client_options: Dict[str, Any] = Field(
-        default={},
-        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
-    )
-
-    project_on_behalf: Optional[str] = Field(
-        default=None,
-        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
-    )
-
-    def __init__(self, **data: Any):
-        super().__init__(**data)
-
-        if self.credential:
-            self._credentials_path = self.credential.create_credential_temp_file()
-            logger.debug(
-                f"Creating temporary credential file at {self._credentials_path}"
-            )
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
-
-    def get_bigquery_client(self) -> bigquery.Client:
-        client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
-
-    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
-        return resourcemanager_v3.ProjectsClient()
-
-    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
-        return datacatalog_v1.PolicyTagManagerClient()
-
-    def make_gcp_logging_client(
-        self, project_id: Optional[str] = None
-    ) -> GCPLoggingClient:
-        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
-        # why we disable gRPC here.
-        client_options = self.extra_client_options.copy()
-        client_options["_use_grpc"] = False
-        if project_id is not None:
-            return GCPLoggingClient(**client_options, project=project_id)
-        else:
-            return GCPLoggingClient(**client_options)
-
-    def get_sql_alchemy_url(self) -> str:
-        if self.project_on_behalf:
-            return f"bigquery://{self.project_on_behalf}"
-        # When project_id is not set, we will attempt to detect the project ID
-        # based on the credentials or environment variables.
-        # See https://github.com/mxmzdlv/pybigquery#authentication.
-        return "bigquery://"
-
-
 class GcsLineageProviderConfig(ConfigModel):
     """
     Any source that produces gcs lineage from/to Datasets should inherit this class.
datahub/ingestion/source/bigquery_v2/bigquery_connection.py
@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Any, Dict, Optional
+
+from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
+from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from pydantic import Field, PrivateAttr
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
+
+logger = logging.getLogger(__name__)
+
+
+class BigQueryConnectionConfig(ConfigModel):
+    credential: Optional[GCPCredential] = Field(
+        default=None, description="BigQuery credential informations"
+    )
+
+    _credentials_path: Optional[str] = PrivateAttr(None)
+
+    extra_client_options: Dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to google.cloud.logging_v2.client.Client.",
+    )
+
+    project_on_behalf: Optional[str] = Field(
+        default=None,
+        description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        if self.credential:
+            self._credentials_path = self.credential.create_credential_temp_file()
+            logger.debug(
+                f"Creating temporary credential file at {self._credentials_path}"
+            )
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
+
+    def get_bigquery_client(self) -> bigquery.Client:
+        client_options = self.extra_client_options
+        return bigquery.Client(self.project_on_behalf, **client_options)
+
+    def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
+        return resourcemanager_v3.ProjectsClient()
+
+    def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
+        return datacatalog_v1.PolicyTagManagerClient()
+
+    def make_gcp_logging_client(
+        self, project_id: Optional[str] = None
+    ) -> GCPLoggingClient:
+        # See https://github.com/googleapis/google-cloud-python/issues/2674 for
+        # why we disable gRPC here.
+        client_options = self.extra_client_options.copy()
+        client_options["_use_grpc"] = False
+        if project_id is not None:
+            return GCPLoggingClient(**client_options, project=project_id)
+        else:
+            return GCPLoggingClient(**client_options)
+
+    def get_sql_alchemy_url(self) -> str:
+        if self.project_on_behalf:
+            return f"bigquery://{self.project_on_behalf}"
+        # When project_id is not set, we will attempt to detect the project ID
+        # based on the credentials or environment variables.
+        # See https://github.com/mxmzdlv/pybigquery#authentication.
+        return "bigquery://"
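A minimal sketch of using the relocated BigQueryConnectionConfig from its new module; the project ID is a placeholder and application-default GCP credentials are assumed to be available in the environment:

from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
    BigQueryConnectionConfig,
)

# No explicit credential: the client falls back to the ambient GCP credentials.
# project_on_behalf pins the project in which query jobs run (placeholder value).
connection = BigQueryConnectionConfig(project_on_behalf="my-gcp-project")
client = connection.get_bigquery_client()
print(connection.get_sql_alchemy_url())  # bigquery://my-gcp-project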
datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -10,10 +10,12 @@ from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
-    BigQueryConnectionConfig,
     BigQueryFilterConfig,
     BigQueryIdentifierConfig,
 )
+from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
+    BigQueryConnectionConfig,
+)
 from datahub.ingestion.source.bigquery_v2.bigquery_report import (
     BigQueryQueriesExtractorReport,
     BigQuerySchemaApiPerfReport,
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
-                with ThreadPoolExecutor(
-                    max_workers=self.config.profiling.max_workers
-                ) as executor:
-                    future_to_dataset = {
-                        executor.submit(
-                            self.generate_profile,
-                            keyspace_name,
-                            table_name,
-                            cassandra_data.columns.get(table_name, []),
-                        ): table_name
-                        for table_name in tables
-                    }
-                    for future in as_completed(future_to_dataset):
-                        table_name = future_to_dataset[future]
-                        try:
-                            yield from future.result()
-                        except Exception as exc:
-                            self.report.profiling_skipped_other[table_name] += 1
-                            self.report.failure(
-                                message="Failed to profile for table",
-                                context=f"{keyspace_name}.{table_name}",
-                                exc=exc,
-                            )
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )

     def generate_profile(
         self,
datahub/ingestion/source/common/subtypes.py
@@ -113,3 +113,6 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_ENDPOINT = "Endpoint"
     VERTEX_DATASET = "Dataset"
     VERTEX_PROJECT = "Project"
+    VERTEX_PIPELINE = "Pipeline Job"
+    VERTEX_PIPELINE_TASK = "Pipeline Task"
+    VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                logger.debug("Polling soft-deleted urns from database")
-                cursor.execute(self.soft_deleted_urns_query)
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))

     def _parse_row(
         self, row: Dict[str, Any]
datahub/ingestion/source/dbt/dbt_cloud.py
@@ -10,14 +10,12 @@ from pydantic import Field, root_validator

 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{

 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/dbt/dbt_common.py
@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
     sql_parser_parse_failures: int = 0
     sql_parser_detach_ctes_failures: int = 0
     sql_parser_table_errors: int = 0
@@ -829,11 +830,13 @@ def get_column_type(
     "Enabled by default, configure using `include_column_lineage`",
 )
 class DBTSourceBase(StatefulIngestionSourceBase):
-    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
+        self.platform: str = "dbt"
+
         self.config = config
-        self.platform: str = platform
         self.report: DBTSourceReport = DBTSourceReport()
+
         self.compiled_owner_extraction_pattern: Optional[Any] = None
         if self.config.owner_extraction_pattern:
             self.compiled_owner_extraction_pattern = re.compile(
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Not generating CLL for {node.dbt_name} because we don't need it."
             )
+        elif node.language != "sql":
+            logger.debug(
+                f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
+            )
+            self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
         elif node.compiled_code:
             # Add CTE stops based on the upstreams list.
             cte_mapping = {