acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (54)
  1. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/METADATA +2480 -2480
  2. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/RECORD +54 -54
  3. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datajob/dataflow.py +3 -3
  7. datahub/api/entities/forms/forms.py +34 -35
  8. datahub/api/graphql/assertion.py +1 -1
  9. datahub/api/graphql/operation.py +4 -4
  10. datahub/cli/delete_cli.py +1 -1
  11. datahub/cli/docker_cli.py +2 -2
  12. datahub/configuration/source_common.py +1 -1
  13. datahub/emitter/request_helper.py +116 -3
  14. datahub/emitter/rest_emitter.py +44 -52
  15. datahub/ingestion/api/source.py +2 -5
  16. datahub/ingestion/glossary/classification_mixin.py +4 -2
  17. datahub/ingestion/graph/client.py +3 -1
  18. datahub/ingestion/graph/config.py +1 -0
  19. datahub/ingestion/graph/filters.py +1 -1
  20. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  21. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  22. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  23. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  24. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  25. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  26. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  27. datahub/ingestion/source/feast.py +4 -4
  28. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  29. datahub/ingestion/source/ldap.py +1 -1
  30. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  31. datahub/ingestion/source/looker/lookml_source.py +7 -1
  32. datahub/ingestion/source/mode.py +74 -28
  33. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  34. datahub/ingestion/source/powerbi/config.py +1 -1
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  36. datahub/ingestion/source/redshift/usage.py +10 -9
  37. datahub/ingestion/source/sql/clickhouse.py +5 -1
  38. datahub/ingestion/source/sql/druid.py +7 -2
  39. datahub/ingestion/source/sql/oracle.py +6 -2
  40. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  41. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  42. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  43. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
  44. datahub/metadata/_urns/urn_defs.py +1786 -1786
  45. datahub/metadata/schema.avsc +17364 -16988
  46. datahub/metadata/schema_classes.py +3 -3
  47. datahub/metadata/schemas/__init__.py +3 -3
  48. datahub/testing/check_imports.py +1 -1
  49. datahub/utilities/logging_manager.py +8 -1
  50. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  51. datahub/utilities/urn_encoder.py +1 -1
  52. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/entry_points.txt +0 -0
  53. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/licenses/LICENSE +0 -0
  54. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/top_level.txt +0 -0
datahub/emitter/request_helper.py
@@ -1,14 +1,31 @@
+import json
 import shlex
-from typing import List, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 import requests
 from requests.auth import HTTPBasicAuth
 
+from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+from datahub.metadata.schema_classes import ChangeTypeClass
+
+
+def _decode_bytes(value: Union[str, bytes]) -> str:
+    """Decode bytes to string, if necessary."""
+    if isinstance(value, bytes):
+        return value.decode()
+    return value
+
 
 def _format_header(name: str, value: Union[str, bytes]) -> str:
     if name == "Authorization":
         return f"{name!s}: <redacted>"
-    return f"{name!s}: {value!s}"
+    return f"{name!s}: {_decode_bytes(value)}"
 
 
 def make_curl_command(
@@ -21,7 +38,9 @@ def make_curl_command(
 
     if session.auth:
         if isinstance(session.auth, HTTPBasicAuth):
-            fragments.extend(["-u", f"{session.auth.username}:<redacted>"])
+            fragments.extend(
+                ["-u", f"{_decode_bytes(session.auth.username)}:<redacted>"]
+            )
         else:
             # For other auth types, they should be handled via headers
             fragments.extend(["-H", "<unknown auth type>"])
@@ -31,3 +50,97 @@
 
     fragments.append(url)
     return shlex.join(fragments)
+
+
+@dataclass
+class OpenApiRequest:
+    """Represents an OpenAPI request for entity operations."""
+
+    method: str
+    url: str
+    payload: List[Dict[str, Any]]
+
+    @classmethod
+    def from_mcp(
+        cls,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        gms_server: str,
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional["OpenApiRequest"]:
+        """Factory method to create an OpenApiRequest from a MetadataChangeProposal."""
+        if not mcp.aspectName or (
+            mcp.changeType != ChangeTypeClass.DELETE and not mcp.aspect
+        ):
+            return None
+
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+
+        method = "post"
+        url = f"{gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+        payload = []
+
+        if mcp.changeType == ChangeTypeClass.DELETE:
+            method = "delete"
+            url = f"{gms_server}/openapi/v3/entity/{mcp.entityType}/{mcp.entityUrn}"
+        else:
+            if mcp.aspect:
+                if mcp.changeType == ChangeTypeClass.PATCH:
+                    method = "patch"
+                    obj = mcp.aspect.to_obj()
+                    content_type = obj.get("contentType")
+                    if obj.get("value") and content_type == JSON_PATCH_CONTENT_TYPE:
+                        # Undo double serialization.
+                        obj = json.loads(obj["value"])
+                        patch_value = obj
+                    else:
+                        raise NotImplementedError(
+                            f"ChangeType {mcp.changeType} only supports context type {JSON_PATCH_CONTENT_TYPE}, found {content_type}."
+                        )
+
+                    if isinstance(patch_value, list):
+                        patch_value = {"patch": patch_value}
+
+                    payload = [
+                        {
+                            "urn": mcp.entityUrn,
+                            mcp.aspectName: {
+                                "value": patch_value,
+                                "systemMetadata": mcp.systemMetadata.to_obj()
+                                if mcp.systemMetadata
+                                else None,
+                            },
+                        }
+                    ]
+                else:
+                    if isinstance(mcp, MetadataChangeProposalWrapper):
+                        aspect_value = pre_json_transform(
+                            mcp.to_obj(simplified_structure=True)
+                        )["aspect"]["json"]
+                    else:
+                        obj = mcp.aspect.to_obj()
+                        content_type = obj.get("contentType")
+                        if obj.get("value") and content_type == JSON_CONTENT_TYPE:
+                            # Undo double serialization.
+                            obj = json.loads(obj["value"])
+                        elif content_type == JSON_PATCH_CONTENT_TYPE:
+                            raise NotImplementedError(
+                                f"ChangeType {mcp.changeType} does not support patch."
+                            )
+                        aspect_value = pre_json_transform(obj)
+
+                    payload = [
+                        {
+                            "urn": mcp.entityUrn,
+                            mcp.aspectName: {
+                                "value": aspect_value,
+                                "systemMetadata": mcp.systemMetadata.to_obj()
+                                if mcp.systemMetadata
+                                else None,
+                            },
+                        }
+                    ]
+            else:
+                raise ValueError(f"ChangeType {mcp.changeType} requires a value.")
+
+        return cls(method=method, url=url, payload=payload)
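For orientation, a rough sketch of how the new factory might be exercised; the URN and server address are placeholders, and UPSERT is the wrapper's default change type:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.request_helper import OpenApiRequest
    from datahub.metadata.schema_classes import StatusClass

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
        aspect=StatusClass(removed=False),
    )

    # Placeholder GMS address; from_mcp only formats it into the URL.
    request = OpenApiRequest.from_mcp(mcp, gms_server="http://localhost:8080")
    if request:
        # UPSERTs map to POST against the v3 entity endpoint, async=false by default.
        assert request.method == "post"
        assert request.url.startswith("http://localhost:8080/openapi/v3/entity/dataset")
        assert request.payload[0]["urn"] == mcp.entityUrn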
datahub/emitter/rest_emitter.py
@@ -41,10 +41,9 @@ from datahub.configuration.common (
     TraceTimeoutError,
     TraceValidationError,
 )
-from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
 from datahub.emitter.response_helper import (
     TraceData,
     extract_trace_data,
@@ -348,43 +347,24 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
         async_default: bool = False,
-    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
-        if mcp.aspect and mcp.aspectName:
-            resolved_async_flag = (
-                async_flag if async_flag is not None else async_default
-            )
-            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+    ) -> Optional[OpenApiRequest]:
+        """
+        Convert a MetadataChangeProposal to an OpenAPI request format.
 
-            if isinstance(mcp, MetadataChangeProposalWrapper):
-                aspect_value = pre_json_transform(
-                    mcp.to_obj(simplified_structure=True)
-                )["aspect"]["json"]
-            else:
-                obj = mcp.aspect.to_obj()
-                content_type = obj.get("contentType")
-                if obj.get("value") and content_type == JSON_CONTENT_TYPE:
-                    # Undo double serialization.
-                    obj = json.loads(obj["value"])
-                elif content_type == JSON_PATCH_CONTENT_TYPE:
-                    raise NotImplementedError(
-                        "Patches are not supported for OpenAPI ingestion. Set the endpoint to RESTLI."
-                    )
-                aspect_value = pre_json_transform(obj)
-            return (
-                url,
-                [
-                    {
-                        "urn": mcp.entityUrn,
-                        mcp.aspectName: {
-                            "value": aspect_value,
-                            "systemMetadata": mcp.systemMetadata.to_obj()
-                            if mcp.systemMetadata
-                            else None,
-                        },
-                    }
-                ],
-            )
-        return None
+        Args:
+            mcp: The metadata change proposal
+            async_flag: Optional flag to override async behavior
+            async_default: Default async behavior if not specified
+
+        Returns:
+            An OpenApiRequest object or None if the MCP doesn't have required fields
+        """
+        return OpenApiRequest.from_mcp(
+            mcp=mcp,
+            gms_server=self._gms_server,
+            async_flag=async_flag,
+            async_default=async_default,
+        )
 
     def emit(
         self,
@@ -448,7 +428,9 @@ class DataHubRestEmitter(Closeable, Emitter):
         if self._openapi_ingestion:
             request = self._to_openapi_request(mcp, async_flag, async_default=False)
             if request:
-                response = self._emit_generic(request[0], payload=request[1])
+                response = self._emit_generic(
+                    request.url, payload=request.payload, method=request.method
+                )
 
                 if self._should_trace(async_flag, trace_flag):
                     trace_data = extract_trace_data(response) if response else None
@@ -503,31 +485,36 @@ class DataHubRestEmitter(Closeable, Emitter):
         trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
-        1. Grouping MCPs by their entity URL
+        1. Grouping MCPs by their HTTP method and entity URL
         2. Breaking down large batches into smaller chunks based on both:
            * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
            * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
 
         The Chunk class encapsulates both the items and their byte size tracking
-        Serializing the items only once with json.dumps(request[1]) and reusing that
+        Serializing the items only once with json.dumps(request.payload) and reusing that
         The chunking logic handles edge cases (always accepting at least one item per chunk)
         The joining logic is efficient with a simple string concatenation
 
         :param mcps: metadata change proposals to transmit
         :param async_flag: the mode
+        :param trace_flag: whether to trace the requests
+        :param trace_timeout: timeout for tracing
         :return: number of requests
         """
-        # group by entity url
-        batches: Dict[str, List[_Chunk]] = defaultdict(
+        # Group by entity URL and HTTP method
+        batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
             lambda: [_Chunk(items=[])]
         )  # Initialize with one empty Chunk
 
         for mcp in mcps:
             request = self._to_openapi_request(mcp, async_flag, async_default=True)
             if request:
-                current_chunk = batches[request[0]][-1]  # Get the last chunk
-                # Only serialize once
-                serialized_item = json.dumps(request[1][0])
+                # Create a composite key with both method and URL
+                key = (request.method, request.url)
+                current_chunk = batches[key][-1]  # Get the last chunk
+
+                # Only serialize once - we're serializing a single payload item
+                serialized_item = json.dumps(request.payload[0])
                 item_bytes = len(serialized_item.encode())
 
                 # If adding this item would exceed max_bytes, create a new chunk
@@ -537,15 +524,17 @@ class DataHubRestEmitter(Closeable, Emitter):
                     or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
                 ):
                     new_chunk = _Chunk(items=[])
-                    batches[request[0]].append(new_chunk)
+                    batches[key].append(new_chunk)
                     current_chunk = new_chunk
 
                 current_chunk.add_item(serialized_item)
 
         responses = []
-        for url, chunks in batches.items():
+        for (method, url), chunks in batches.items():
             for chunk in chunks:
-                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                response = self._emit_generic(
+                    url, payload=_Chunk.join(chunk), method=method
+                )
                 responses.append(response)
 
         if self._should_trace(async_flag, trace_flag, async_default=True):
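The grouping change is the standard composite-key pattern with defaultdict: requests only share a chunk if they share both the HTTP verb and the target URL. A stripped-down illustration with toy data (the URLs and payloads are placeholders, not real emitter output):

    from collections import defaultdict
    from typing import Dict, List, Tuple

    # Toy (method, url, payload) triples standing in for OpenApiRequest objects.
    requests_seen = [
        ("post", "/openapi/v3/entity/dataset?async=true", '{"urn": "a"}'),
        ("post", "/openapi/v3/entity/dataset?async=true", '{"urn": "b"}'),
        ("delete", "/openapi/v3/entity/dataset/urn%3Aa", ""),
    ]

    batches: Dict[Tuple[str, str], List[str]] = defaultdict(list)
    for method, url, payload in requests_seen:
        # DELETEs and POSTs to the same entity endpoint land in separate batches,
        # so each batch can be sent with a single HTTP verb.
        batches[(method, url)].append(payload)

    assert len(batches) == 2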
@@ -618,11 +607,13 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+    def _emit_generic(
+        self, url: str, payload: Union[str, Any], method: str = "POST"
+    ) -> requests.Response:
         if not isinstance(payload, str):
             payload = json.dumps(payload)
 
-        curl_command = make_curl_command(self._session, "POST", url, payload)
+        curl_command = make_curl_command(self._session, method, url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
             # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
@@ -635,7 +626,8 @@ class DataHubRestEmitter(Closeable, Emitter):
                 curl_command,
             )
         try:
-            response = self._session.post(url, data=payload)
+            method_func = getattr(self._session, method.lower())
+            response = method_func(url, data=payload) if payload else method_func(url)
             response.raise_for_status()
             return response
         except HTTPError as e:
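The generic send now picks the requests.Session verb at runtime; since Session exposes post, patch, and delete as bound methods, a getattr lookup is sufficient. A tiny standalone sketch of the lookup (no request is actually sent):

    import requests

    session = requests.Session()
    for verb in ("post", "patch", "delete"):
        # Session.post / Session.patch / Session.delete all exist, so
        # getattr(session, method.lower()) resolves to a bound method.
        method_func = getattr(session, verb)
        assert callable(method_func)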
datahub/ingestion/api/source.py
@@ -420,12 +420,9 @@ class Source(Closeable, metaclass=ABCMeta):
         Run in order, first in list is applied first. Be careful with order when overriding.
         """
         browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
-        if (
-            self.ctx.pipeline_config
-            and self.ctx.pipeline_config.flags.generate_browse_path_v2
-        ):
+        if self.ctx.flags.generate_browse_path_v2:
             browse_path_processor = self._get_browse_path_processor(
-                self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run
+                self.ctx.flags.generate_browse_path_v2_dry_run
             )
 
         auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
datahub/ingestion/glossary/classification_mixin.py
@@ -319,8 +319,10 @@ def classification_workunit_processor(
         partial(
             data_reader.get_sample_data_for_table,
             table_id,
-            classification_handler.config.classification.sample_size
-            * SAMPLE_SIZE_MULTIPLIER,
+            int(
+                classification_handler.config.classification.sample_size
+                * SAMPLE_SIZE_MULTIPLIER
+            ),
             **(data_reader_kwargs or {}),
         )
         if data_reader
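The int(...) wrapper matters because the oversampling multiplier can be fractional, in which case the product is a float that row-fetching APIs typically reject as a limit. A toy illustration (the 1.2 multiplier here is an assumed value for the sketch, not necessarily the package's constant):

    SAMPLE_SIZE_MULTIPLIER = 1.2  # assumed value, for illustration only
    sample_size = 1000

    raw = sample_size * SAMPLE_SIZE_MULTIPLIER
    assert isinstance(raw, float)       # 1200.0, not 1200
    assert isinstance(int(raw), int)    # safe to pass as a row limit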
datahub/ingestion/graph/client.py
@@ -158,7 +158,9 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             ca_certificate_path=self.config.ca_certificate_path,
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
-            openapi_ingestion=DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI,
+            openapi_ingestion=self.config.openapi_ingestion
+            if self.config.openapi_ingestion is not None
+            else (DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI),
             default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
         )
 
datahub/ingestion/graph/config.py
@@ -17,3 +17,4 @@ class DatahubClientConfig(ConfigModel):
     ca_certificate_path: Optional[str] = None
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
+    openapi_ingestion: Optional[bool] = None
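With the new field, a client can opt in to or out of OpenAPI-based ingestion explicitly instead of inheriting the emitter-level default. A hedged sketch of how the option might be set (the server address is a placeholder):

    from datahub.ingestion.graph.config import DatahubClientConfig

    # openapi_ingestion=None (the default) preserves the emitter-level default;
    # True or False overrides it, per the DataHubGraph change above.
    config = DatahubClientConfig(
        server="http://localhost:8080",  # placeholder GMS address
        openapi_ingestion=True,
    )
    # The config would then be handed to DataHubGraph(config) as usual.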
datahub/ingestion/graph/filters.py
@@ -18,7 +18,7 @@ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
 # This can be put directly into the orFilters parameter in GraphQL.
 RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
 
-# Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
+# Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
 FilterOperator: TypeAlias = Literal[
     "CONTAIN",
     "EQUAL",
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
                 textwrap.dedent(
                     f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                     To view full table metadata, run Glue ingestion
-                    (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+                    (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
                 )
             )
 
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
-                with BigQueryQueriesExtractor(
-                    connection=self.config.get_bigquery_client(),
-                    schema_api=self.bq_schema_extractor.schema_api,
-                    config=BigQueryQueriesExtractorConfig(
-                        window=self.config,
-                        user_email_pattern=self.config.usage.user_email_pattern,
-                        include_lineage=self.config.include_table_lineage,
-                        include_usage_statistics=self.config.include_usage_statistics,
-                        include_operations=self.config.usage.include_operational_stats,
-                        include_queries=self.config.include_queries,
-                        include_query_usage_statistics=self.config.include_query_usage_statistics,
-                        top_n_queries=self.config.usage.top_n_queries,
-                        region_qualifiers=self.config.region_qualifiers,
-                    ),
-                    structured_report=self.report,
-                    filters=self.filters,
-                    identifiers=self.identifiers,
-                    schema_resolver=self.sql_parser_schema_resolver,
-                    discovered_tables=self.bq_schema_extractor.table_refs,
-                ) as queries_extractor:
-                    self.report.queries_extractor = queries_extractor.report
-                    yield from queries_extractor.get_workunits_internal()
+            with self.report.new_stage(
+                f"*: {QUERIES_EXTRACTION}"
+            ), BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor:
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
            tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
-                with ThreadPoolExecutor(
-                    max_workers=self.config.profiling.max_workers
-                ) as executor:
-                    future_to_dataset = {
-                        executor.submit(
-                            self.generate_profile,
-                            keyspace_name,
-                            table_name,
-                            cassandra_data.columns.get(table_name, []),
-                        ): table_name
-                        for table_name in tables
-                    }
-                    for future in as_completed(future_to_dataset):
-                        table_name = future_to_dataset[future]
-                        try:
-                            yield from future.result()
-                        except Exception as exc:
-                            self.report.profiling_skipped_other[table_name] += 1
-                            self.report.failure(
-                                message="Failed to profile for table",
-                                context=f"{keyspace_name}.{table_name}",
-                                exc=exc,
-                            )
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )
 
     def generate_profile(
         self,
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                logger.debug("Polling soft-deleted urns from database")
-                cursor.execute(self.soft_deleted_urns_query)
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))
 
     def _parse_row(
         self, row: Dict[str, Any]
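This hunk, like the BigQuery and Cassandra hunks above, is a pure restructuring: two nested with blocks collapse into one statement with comma-separated context managers, which enter left to right and exit in reverse. A generic Python sketch of the equivalence, unrelated to DataHub itself:

    from contextlib import contextmanager


    @contextmanager
    def tag(name):
        print(f"enter {name}")
        try:
            yield name
        finally:
            print(f"exit {name}")


    # Nested form ...
    with tag("outer"):
        with tag("inner"):
            pass

    # ... and the combined form produce the same enter/exit sequence.
    with tag("outer"), tag("inner"):
        pass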
datahub/ingestion/source/dbt/dbt_cloud.py
@@ -10,14 +10,12 @@ from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/dbt/dbt_common.py
@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
     sql_parser_parse_failures: int = 0
     sql_parser_detach_ctes_failures: int = 0
     sql_parser_table_errors: int = 0
@@ -829,11 +830,13 @@ def get_column_type(
     "Enabled by default, configure using `include_column_lineage`",
 )
 class DBTSourceBase(StatefulIngestionSourceBase):
-    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
+        self.platform: str = "dbt"
+
         self.config = config
-        self.platform: str = platform
         self.report: DBTSourceReport = DBTSourceReport()
+
         self.compiled_owner_extraction_pattern: Optional[Any] = None
         if self.config.owner_extraction_pattern:
             self.compiled_owner_extraction_pattern = re.compile(
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Not generating CLL for {node.dbt_name} because we don't need it."
             )
+        elif node.language != "sql":
+            logger.debug(
+                f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
+            )
+            self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
         elif node.compiled_code:
             # Add CTE stops based on the upstreams list.
             cte_mapping = {
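The new branch means column-level lineage parsing is skipped for non-SQL models (for example dbt Python models) and the skip is surfaced in the source report. A toy sketch of the bookkeeping, using a stand-in report object rather than the real DBTSourceReport:

    from dataclasses import dataclass, field
    from typing import List


    @dataclass
    class _ToyReport:
        # Stand-in for DBTSourceReport.sql_parser_skipped_non_sql_model (a LossyList upstream).
        sql_parser_skipped_non_sql_model: List[str] = field(default_factory=list)


    report = _ToyReport()
    node_language = "python"  # e.g. a dbt Python model
    node_name = "model.my_project.my_python_model"

    if node_language != "sql":
        report.sql_parser_skipped_non_sql_model.append(node_name)

    assert report.sql_parser_skipped_non_sql_model == ["model.my_project.my_python_model"]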