acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (87)
  1. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2524 -2471
  2. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +87 -87
  3. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datajob/dataflow.py +3 -3
  7. datahub/api/entities/forms/forms.py +34 -34
  8. datahub/api/graphql/assertion.py +1 -1
  9. datahub/api/graphql/operation.py +4 -4
  10. datahub/cli/check_cli.py +3 -2
  11. datahub/cli/config_utils.py +2 -2
  12. datahub/cli/delete_cli.py +6 -5
  13. datahub/cli/docker_cli.py +2 -2
  14. datahub/cli/exists_cli.py +2 -1
  15. datahub/cli/get_cli.py +2 -1
  16. datahub/cli/iceberg_cli.py +6 -5
  17. datahub/cli/ingest_cli.py +9 -6
  18. datahub/cli/migrate.py +4 -3
  19. datahub/cli/migration_utils.py +4 -3
  20. datahub/cli/put_cli.py +3 -2
  21. datahub/cli/specific/assertions_cli.py +2 -1
  22. datahub/cli/specific/datacontract_cli.py +3 -2
  23. datahub/cli/specific/dataproduct_cli.py +10 -9
  24. datahub/cli/specific/dataset_cli.py +4 -3
  25. datahub/cli/specific/forms_cli.py +2 -1
  26. datahub/cli/specific/group_cli.py +2 -1
  27. datahub/cli/specific/structuredproperties_cli.py +4 -3
  28. datahub/cli/specific/user_cli.py +2 -1
  29. datahub/cli/state_cli.py +2 -1
  30. datahub/cli/timeline_cli.py +2 -1
  31. datahub/configuration/source_common.py +1 -1
  32. datahub/emitter/request_helper.py +116 -3
  33. datahub/emitter/rest_emitter.py +163 -93
  34. datahub/entrypoints.py +2 -1
  35. datahub/ingestion/api/source.py +2 -5
  36. datahub/ingestion/glossary/classification_mixin.py +4 -2
  37. datahub/ingestion/graph/client.py +16 -7
  38. datahub/ingestion/graph/config.py +14 -0
  39. datahub/ingestion/graph/filters.py +1 -1
  40. datahub/ingestion/run/pipeline.py +3 -2
  41. datahub/ingestion/run/pipeline_config.py +1 -1
  42. datahub/ingestion/sink/datahub_rest.py +5 -6
  43. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  44. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  45. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  46. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  47. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  48. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  49. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  50. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  51. datahub/ingestion/source/feast.py +4 -4
  52. datahub/ingestion/source/ge_data_profiler.py +2 -1
  53. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  54. datahub/ingestion/source/ldap.py +1 -1
  55. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  56. datahub/ingestion/source/looker/lookml_source.py +7 -1
  57. datahub/ingestion/source/metadata/lineage.py +2 -1
  58. datahub/ingestion/source/mode.py +74 -28
  59. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  60. datahub/ingestion/source/powerbi/config.py +1 -1
  61. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  62. datahub/ingestion/source/redshift/usage.py +10 -9
  63. datahub/ingestion/source/sql/clickhouse.py +5 -1
  64. datahub/ingestion/source/sql/druid.py +7 -2
  65. datahub/ingestion/source/sql/oracle.py +6 -2
  66. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  67. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  68. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  69. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  70. datahub/integrations/assertion/common.py +3 -2
  71. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
  72. datahub/metadata/_urns/urn_defs.py +1786 -1786
  73. datahub/metadata/schema.avsc +17364 -16988
  74. datahub/metadata/schema_classes.py +3 -3
  75. datahub/metadata/schemas/__init__.py +3 -3
  76. datahub/sdk/main_client.py +2 -2
  77. datahub/secret/datahub_secret_store.py +2 -1
  78. datahub/telemetry/telemetry.py +2 -2
  79. datahub/testing/check_imports.py +1 -1
  80. datahub/upgrade/upgrade.py +10 -12
  81. datahub/utilities/logging_manager.py +8 -1
  82. datahub/utilities/server_config_util.py +378 -10
  83. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  84. datahub/utilities/urn_encoder.py +1 -1
  85. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/apply/datahub_apply.py
@@ -18,6 +18,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, Sour
 from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.schema_classes import (
     DomainsClass,
     GlossaryTermAssociationClass,
@@ -48,7 +49,7 @@ def apply_association_to_container(
     """
     urns: List[str] = [container_urn]
     if not graph:
-        graph = get_default_graph()
+        graph = get_default_graph(ClientMode.INGESTION)
     logger.info(f"Using {graph}")
     urns.extend(
         graph.get_urns_by_filter(
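
This hunk, together with the later ge_data_profiler.py and lineage.py hunks, switches callers from a bare get_default_graph() to passing an explicit client mode. A minimal sketch of the new call pattern, assuming an already configured DataHub CLI environment; only the INGESTION member is visible in this diff, so other modes are not shown:

    from datahub.ingestion.graph.client import get_default_graph
    from datahub.ingestion.graph.config import ClientMode

    # Before this release: graph = get_default_graph()
    # The mode tags what the connection is used for (here: ingestion).
    graph = get_default_graph(ClientMode.INGESTION)
    print(graph)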

datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
             textwrap.dedent(
                 f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+                (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
             )
         )
 

datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
-                with BigQueryQueriesExtractor(
-                    connection=self.config.get_bigquery_client(),
-                    schema_api=self.bq_schema_extractor.schema_api,
-                    config=BigQueryQueriesExtractorConfig(
-                        window=self.config,
-                        user_email_pattern=self.config.usage.user_email_pattern,
-                        include_lineage=self.config.include_table_lineage,
-                        include_usage_statistics=self.config.include_usage_statistics,
-                        include_operations=self.config.usage.include_operational_stats,
-                        include_queries=self.config.include_queries,
-                        include_query_usage_statistics=self.config.include_query_usage_statistics,
-                        top_n_queries=self.config.usage.top_n_queries,
-                        region_qualifiers=self.config.region_qualifiers,
-                    ),
-                    structured_report=self.report,
-                    filters=self.filters,
-                    identifiers=self.identifiers,
-                    schema_resolver=self.sql_parser_schema_resolver,
-                    discovered_tables=self.bq_schema_extractor.table_refs,
-                ) as queries_extractor:
-                    self.report.queries_extractor = queries_extractor.report
-                    yield from queries_extractor.get_workunits_internal()
+            with self.report.new_stage(
+                f"*: {QUERIES_EXTRACTION}"
+            ), BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor:
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(

datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
-                with ThreadPoolExecutor(
-                    max_workers=self.config.profiling.max_workers
-                ) as executor:
-                    future_to_dataset = {
-                        executor.submit(
-                            self.generate_profile,
-                            keyspace_name,
-                            table_name,
-                            cassandra_data.columns.get(table_name, []),
-                        ): table_name
-                        for table_name in tables
-                    }
-                    for future in as_completed(future_to_dataset):
-                        table_name = future_to_dataset[future]
-                        try:
-                            yield from future.result()
-                        except Exception as exc:
-                            self.report.profiling_skipped_other[table_name] += 1
-                            self.report.failure(
-                                message="Failed to profile for table",
-                                context=f"{keyspace_name}.{table_name}",
-                                exc=exc,
-                            )
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )
 
     def generate_profile(
         self,

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                logger.debug("Polling soft-deleted urns from database")
-                cursor.execute(self.soft_deleted_urns_query)
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))
 
     def _parse_row(
         self, row: Dict[str, Any]
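
The BigQuery, Cassandra, and DataHub-database-reader hunks above are the same mechanical refactor: two nested with blocks collapsed into a single with statement carrying both context managers. A standalone sketch of the pattern; stage() below is a hypothetical placeholder, not a DataHub API:

    from contextlib import contextmanager

    @contextmanager
    def stage(name: str):
        # Toy context manager standing in for report.new_stage(), engine.connect(), etc.
        print(f"enter {name}")
        try:
            yield name
        finally:
            print(f"exit {name}")

    # Before: nested blocks, one indentation level per context manager.
    with stage("outer"):
        with stage("inner") as inner:
            print("work in", inner)

    # After: a single with statement. Both managers are still entered left to
    # right and exited in reverse order; only the nesting is flattened.
    with stage("outer"), stage("inner") as inner:
        print("work in", inner)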

datahub/ingestion/source/dbt/dbt_cloud.py
@@ -10,14 +10,12 @@ from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:

datahub/ingestion/source/dbt/dbt_common.py
@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
     sql_parser_parse_failures: int = 0
     sql_parser_detach_ctes_failures: int = 0
     sql_parser_table_errors: int = 0
@@ -829,11 +830,13 @@ def get_column_type(
     "Enabled by default, configure using `include_column_lineage`",
 )
 class DBTSourceBase(StatefulIngestionSourceBase):
-    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
+        self.platform: str = "dbt"
+
         self.config = config
-        self.platform: str = platform
         self.report: DBTSourceReport = DBTSourceReport()
+
         self.compiled_owner_extraction_pattern: Optional[Any] = None
         if self.config.owner_extraction_pattern:
             self.compiled_owner_extraction_pattern = re.compile(
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Not generating CLL for {node.dbt_name} because we don't need it."
             )
+        elif node.language != "sql":
+            logger.debug(
+                f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
+            )
+            self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
         elif node.compiled_code:
             # Add CTE stops based on the upstreams list.
             cte_mapping = {

datahub/ingestion/source/dbt/dbt_core.py
@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator
 
 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json Note "
-        "this can be a local file or a URI."
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json Note this "
-        "can be a local file or a URI."
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
    )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. If not "
-        "specified, last-modified fields will not be populated. Note this can be a local file or a URI.",
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
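
With catalog_path now optional, a dbt-core source config that omits it should still load, at the cost of incomplete schema metadata. A hedged sketch: manifest_path and catalog_path come from the hunk above, while target_platform and the file paths are assumptions about the wider DBTCommonConfig, and other required fields may exist:

    from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig

    config = DBTCoreConfig.parse_obj(
        {
            "manifest_path": "./target/manifest.json",
            # "catalog_path" omitted: per this release, the source warns that
            # schema information will be missing instead of refusing to start.
            "target_platform": "postgres",
        }
    )
    print(config.catalog_path)  # None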

datahub/ingestion/source/dbt/dbt_core.py (continued)
@@ -161,7 +170,7 @@ def get_columns(
 
 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@ def extract_dbt_entities(
         ):
             name = manifest_node["alias"]
 
-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@ def extract_dbt_entities(
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]
 
-        # It's a source
-        catalog_node = all_catalog_entities.get(key)
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None
 
@@ -214,16 +215,23 @@ def extract_dbt_entities(
                 # Test and ephemeral nodes will never show up in the catalog.
                 missing_from_catalog = False
             else:
-                if not only_include_if_in_catalog:
+                if all_catalog_entities is not None and not only_include_if_in_catalog:
+                    # If the catalog file is missing, we have already generated a general message.
                     report.warning(
                         title="Node missing from catalog",
                         message="Found a node in the manifest file but not in the catalog. "
                         "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                        "Some metadata, such as column types and descriptions, will be impacted.",
+                        "Some metadata, particularly schema information, will be impacted.",
                         context=key,
                     )
         else:
-            catalog_type = all_catalog_entities[key]["metadata"]["type"]
+            catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]
 
         query_tag_props = manifest_node.get("query_tag", {})
 
@@ -231,12 +239,15 @@ def extract_dbt_entities(
 
         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}
 
         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})
 
         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None

@@ -453,15 +464,18 @@ def load_run_results(
 
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(

@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )
 
+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")
 
-        catalog_schema = dbt_catalog_json.get("metadata", {}).get("dbt_schema_version")
-        catalog_version = dbt_catalog_json.get("metadata", {}).get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")
 
         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]
 
         all_manifest_entities = {**manifest_nodes, **manifest_sources}
 
-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]
 
-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}
 
         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="Dbt Catalog Version",
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )

datahub/ingestion/source/feast.py
@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
 
-    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
-    - Fields as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
-    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """
 

datahub/ingestion/source/ge_data_profiler.py
@@ -51,6 +51,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -1569,7 +1570,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )
 
-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)
 
     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)
 
-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)
 
 
 class IcebergProfilingConfig(ConfigModel):
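
The iceberg_common.py fix forwards positional arguments through send(), keeping the subclass signature-compatible with requests' HTTPAdapter.send, whose stream/timeout/verify/cert/proxies parameters may be passed positionally by callers. A simplified sketch of the adapter pattern, not the exact class from the source:

    import requests
    from requests.adapters import HTTPAdapter

    class DefaultTimeoutAdapter(HTTPAdapter):
        def __init__(self, *args, timeout=None, **kwargs):
            self.timeout = timeout
            super().__init__(*args, **kwargs)

        def send(self, request, *args, **kwargs):
            # Apply the default timeout only when none was passed as a keyword;
            # *args is forwarded untouched so positional callers keep working.
            if kwargs.get("timeout") is None and self.timeout is not None:
                kwargs["timeout"] = self.timeout
            return super().send(request, *args, **kwargs)

    session = requests.Session()
    session.mount("https://", DefaultTimeoutAdapter(timeout=30))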

datahub/ingestion/source/ldap.py
@@ -515,5 +515,5 @@ def parse_ldap_dn(input_clean: bytes) -> str:
 
 def get_attr_or_none(
     attrs: Dict[str, Any], key: str, default: Optional[str] = None
-) -> str:
+) -> Optional[str]:
     return attrs[key][0].decode() if attrs.get(key) else default

datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -113,7 +113,7 @@ class LookerAPI:
             )
         except SDKError as e:
             raise ConfigurationError(
-                f"Failed to connect/authenticate with looker - check your configuration: {e}"
+                "Failed to connect/authenticate with looker - check your configuration"
             ) from e
 
         self.client_stats = LookerAPIStats()

datahub/ingestion/source/looker/lookml_source.py
@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                     f"Failed to find a project name for model {model_name}"
                 )
             return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"

datahub/ingestion/source/metadata/lineage.py
@@ -36,6 +36,7 @@ from datahub.ingestion.api.source_helpers import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
@@ -210,7 +211,7 @@ def _get_lineage_mcp(
 
     # extract the old lineage and save it for the new mcp
     if preserve_upstream:
-        client = get_default_graph()
+        client = get_default_graph(ClientMode.INGESTION)
 
         old_upstream_lineage = get_aspects_for_entity(
             client._session,