acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (214)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
  3. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +141 -93
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  30. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  31. datahub/ingestion/api/report.py +1 -2
  32. datahub/ingestion/api/source.py +8 -2
  33. datahub/ingestion/api/source_helpers.py +1 -1
  34. datahub/ingestion/extractor/json_schema_util.py +3 -3
  35. datahub/ingestion/extractor/schema_util.py +3 -5
  36. datahub/ingestion/fs/s3_fs.py +3 -3
  37. datahub/ingestion/glossary/classifier.py +2 -3
  38. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  39. datahub/ingestion/graph/client.py +22 -19
  40. datahub/ingestion/graph/config.py +1 -1
  41. datahub/ingestion/run/pipeline.py +8 -7
  42. datahub/ingestion/run/pipeline_config.py +3 -3
  43. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  44. datahub/ingestion/source/abs/source.py +19 -8
  45. datahub/ingestion/source/aws/glue.py +77 -47
  46. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  47. datahub/ingestion/source/aws/s3_util.py +24 -1
  48. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  49. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  50. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  51. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  53. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  54. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  55. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  56. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  57. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  58. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  59. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  60. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  61. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  62. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  63. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  64. datahub/ingestion/source/csv_enricher.py +29 -29
  65. datahub/ingestion/source/datahub/config.py +20 -0
  66. datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
  67. datahub/ingestion/source/datahub/datahub_source.py +13 -3
  68. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  69. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  70. datahub/ingestion/source/delta_lake/source.py +0 -5
  71. datahub/ingestion/source/demo_data.py +1 -1
  72. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  73. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  74. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  75. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  76. datahub/ingestion/source/elastic_search.py +4 -4
  77. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  78. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  79. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  80. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  81. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  82. datahub/ingestion/source/ge_data_profiler.py +2 -5
  83. datahub/ingestion/source/ge_profiling_config.py +3 -3
  84. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  85. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  86. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +3 -3
  88. datahub/ingestion/source/identity/okta.py +3 -3
  89. datahub/ingestion/source/kafka/kafka.py +11 -9
  90. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  93. datahub/ingestion/source/looker/looker_common.py +19 -19
  94. datahub/ingestion/source/looker/looker_config.py +11 -6
  95. datahub/ingestion/source/looker/looker_source.py +25 -25
  96. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  97. datahub/ingestion/source/looker/looker_usage.py +5 -7
  98. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  99. datahub/ingestion/source/looker/lookml_source.py +13 -15
  100. datahub/ingestion/source/looker/view_upstream.py +5 -5
  101. datahub/ingestion/source/metabase.py +1 -6
  102. datahub/ingestion/source/mlflow.py +4 -9
  103. datahub/ingestion/source/mode.py +5 -5
  104. datahub/ingestion/source/mongodb.py +6 -4
  105. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  106. datahub/ingestion/source/nifi.py +24 -31
  107. datahub/ingestion/source/openapi.py +9 -9
  108. datahub/ingestion/source/powerbi/config.py +12 -12
  109. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  110. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  111. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  112. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  113. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  114. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  115. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  116. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  117. datahub/ingestion/source/redash.py +0 -5
  118. datahub/ingestion/source/redshift/config.py +3 -3
  119. datahub/ingestion/source/redshift/redshift.py +45 -46
  120. datahub/ingestion/source/redshift/usage.py +33 -33
  121. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  122. datahub/ingestion/source/s3/source.py +11 -15
  123. datahub/ingestion/source/salesforce.py +26 -25
  124. datahub/ingestion/source/schema/json_schema.py +1 -1
  125. datahub/ingestion/source/sigma/sigma.py +3 -3
  126. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  127. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  128. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  129. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  130. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  131. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  132. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  133. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  134. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  135. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  136. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  137. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  138. datahub/ingestion/source/sql/athena.py +1 -3
  139. datahub/ingestion/source/sql/clickhouse.py +8 -14
  140. datahub/ingestion/source/sql/oracle.py +1 -3
  141. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  142. datahub/ingestion/source/sql/sql_types.py +1 -2
  143. datahub/ingestion/source/sql/sql_utils.py +5 -0
  144. datahub/ingestion/source/sql/teradata.py +18 -5
  145. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  146. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  147. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  148. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  149. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/superset.py +1 -6
  151. datahub/ingestion/source/tableau/tableau.py +343 -117
  152. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  153. datahub/ingestion/source/unity/config.py +3 -1
  154. datahub/ingestion/source/unity/proxy.py +1 -1
  155. datahub/ingestion/source/unity/source.py +74 -78
  156. datahub/ingestion/source/unity/usage.py +3 -1
  157. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  159. datahub/ingestion/source/usage/usage_common.py +1 -1
  160. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  161. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  162. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  163. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  164. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  165. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  166. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  167. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  168. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  169. datahub/lite/duckdb_lite.py +12 -10
  170. datahub/metadata/_schema_classes.py +317 -44
  171. datahub/metadata/_urns/urn_defs.py +69 -15
  172. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  173. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  174. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  175. datahub/metadata/schema.avsc +302 -89
  176. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  177. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  179. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  180. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  181. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  182. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  183. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  184. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  185. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  186. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  187. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  188. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  189. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  190. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  191. datahub/secret/datahub_secrets_client.py +12 -21
  192. datahub/secret/secret_common.py +14 -8
  193. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  194. datahub/sql_parsing/schema_resolver.py +5 -10
  195. datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
  196. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  197. datahub/sql_parsing/sqlglot_utils.py +1 -1
  198. datahub/telemetry/stats.py +1 -2
  199. datahub/testing/mcp_diff.py +1 -1
  200. datahub/utilities/file_backed_collections.py +11 -11
  201. datahub/utilities/hive_schema_to_avro.py +2 -2
  202. datahub/utilities/logging_manager.py +2 -2
  203. datahub/utilities/lossy_collections.py +3 -3
  204. datahub/utilities/mapping.py +3 -3
  205. datahub/utilities/memory_footprint.py +3 -2
  206. datahub/utilities/perf_timer.py +11 -6
  207. datahub/utilities/serialized_lru_cache.py +3 -1
  208. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  209. datahub/utilities/sqllineage_patch.py +1 -1
  210. datahub/utilities/stats_collections.py +3 -1
  211. datahub/utilities/urns/_urn_base.py +28 -5
  212. datahub/utilities/urns/urn_iter.py +2 -2
  213. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  214. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -148,9 +148,9 @@ class DataHubClassifierConfig(ConfigModel):
  weight,
  ) in custom_infotype_config.Prediction_Factors_and_Weights.dict().items():
  if weight > 0:
- assert (
- getattr(custom_infotype_config, factor) is not None
- ), f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
+ assert getattr(custom_infotype_config, factor) is not None, (
+ f"Missing Configuration for Prediction Factor {factor} for Custom Info Type {custom_infotype}"
+ )

  # Custom infotype supports only regex based prediction for column values
  if custom_infotype_config.Prediction_Factors_and_Weights.Values > 0:
@@ -158,7 +158,9 @@ class DataHubClassifierConfig(ConfigModel):
  assert (
  custom_infotype_config.Values.prediction_type
  == ValuePredictionType.REGEX
- ), f"Invalid Prediction Type for Values for Custom Info Type {custom_infotype}. Only `regex` is supported."
+ ), (
+ f"Invalid Prediction Type for Values for Custom Info Type {custom_infotype}. Only `regex` is supported."
+ )

  return info_types_config

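Hunks like the two above are formatter-driven reflows: the assertion message moves inside parentheses while the condition stays outside, so behavior is unchanged. A minimal, standalone illustration (all names here are hypothetical) of why only the message, and never the whole assert, should be parenthesized:

```python
threshold = 10
value = 3

# New style: only the *message* is wrapped, so the assert still checks `value > threshold`.
try:
    assert value > threshold, (
        f"value {value} should exceed threshold {threshold}"
    )
except AssertionError as err:
    print(f"caught as expected: {err}")

# Pitfall the formatter avoids: parenthesizing condition *and* message creates a
# two-element tuple, which is always truthy, so this assert can never fail.
assert (value > threshold, "this never triggers")  # noqa: F631 (always-true tuple)
```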
@@ -179,21 +179,24 @@ class DataHubGraph(DatahubRestEmitter):

  @classmethod
  def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
+ session_config = emitter._session_config
+ if isinstance(session_config.timeout, tuple):
+ # TODO: This is slightly lossy. Eventually, we want to modify the emitter
+ # to accept a tuple for timeout_sec, and then we'll be able to remove this.
+ timeout_sec: Optional[float] = session_config.timeout[0]
+ else:
+ timeout_sec = session_config.timeout
  return cls(
  DatahubClientConfig(
  server=emitter._gms_server,
  token=emitter._token,
- timeout_sec=emitter._read_timeout_sec,
- retry_status_codes=emitter._retry_status_codes,
- retry_max_times=emitter._retry_max_times,
- extra_headers=emitter._session.headers,
- disable_ssl_verification=emitter._session.verify is False,
- ca_certificate_path=(
- emitter._session.verify
- if isinstance(emitter._session.verify, str)
- else None
- ),
- client_certificate_path=emitter._session.cert,
+ timeout_sec=timeout_sec,
+ retry_status_codes=session_config.retry_status_codes,
+ retry_max_times=session_config.retry_max_times,
+ extra_headers=session_config.extra_headers,
+ disable_ssl_verification=session_config.disable_ssl_verification,
+ ca_certificate_path=session_config.ca_certificate_path,
+ client_certificate_path=session_config.client_certificate_path,
  )
  )

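The `from_emitter` change above collapses a requests-style timeout (either a single number or a `(connect, read)` tuple) into the scalar `timeout_sec` that `DatahubClientConfig` accepts. A rough standalone sketch of that normalization; the helper name and type alias are illustrative, not part of the library:

```python
from typing import Optional, Tuple, Union

Timeout = Union[float, Tuple[float, float], None]

def to_timeout_sec(timeout: Timeout) -> Optional[float]:
    """Collapse a requests-style timeout into a single float.

    requests accepts either one number or a (connect, read) tuple; the emitter's
    session config keeps that flexibility, while DatahubClientConfig only has a
    scalar timeout_sec, so the tuple case is collapsed (lossily) to its first
    element, mirroring the hunk above.
    """
    if isinstance(timeout, tuple):
        return timeout[0]
    return timeout

assert to_timeout_sec((5.0, 30.0)) == 5.0
assert to_timeout_sec(10.0) == 10.0
assert to_timeout_sec(None) is None
```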
@@ -245,9 +248,11 @@ class DataHubGraph(DatahubRestEmitter):
  with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
  yield sink
  if sink.report.failures:
+ logger.error(
+ f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}"
+ )
  raise OperationalError(
- f"Failed to emit {len(sink.report.failures)} records",
- info=sink.report.as_obj(),
+ f"Failed to emit {len(sink.report.failures)} records"
  )

  def emit_all(
@@ -514,9 +519,9 @@ class DataHubGraph(DatahubRestEmitter):
  :return: Optionally, a map of aspect_name to aspect_value as a dictionary if present, aspect_value will be set to None if that aspect was not found. Returns None on HTTP status 404.
  :raises HttpError: if the HTTP response is not a 200
  """
- assert len(aspects) == len(
- aspect_types
- ), f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+ assert len(aspects) == len(aspect_types), (
+ f"number of aspects requested ({len(aspects)}) should be the same as number of aspect types provided ({len(aspect_types)})"
+ )

  # TODO: generate aspects list from type classes
  response_json = self.get_entity_raw(entity_urn, aspects)
@@ -1571,9 +1576,7 @@ class DataHubGraph(DatahubRestEmitter):
  ... assertionResult
  }
  }
- """ % (
- self._assertion_result_shared()
- )
+ """ % (self._assertion_result_shared())

  variables = {
  "assertionUrn": urn,
@@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel):
  # by callers / the CLI, but the actual client should not have any magic.
  server: str
  token: Optional[str] = None
- timeout_sec: Optional[int] = None
+ timeout_sec: Optional[float] = None
  retry_status_codes: Optional[List[int]] = None
  retry_max_times: Optional[int] = None
  extra_headers: Optional[Dict[str, str]] = None
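With `timeout_sec` widened from `int` to `float`, sub-second client timeouts become expressible. A small usage sketch; the import path and server URL are assumptions for illustration only:

```python
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

# Assumed local GMS endpoint; timeout_sec may now be fractional.
graph = DataHubGraph(
    DatahubClientConfig(
        server="http://localhost:8080",
        timeout_sec=2.5,
        retry_max_times=3,
    )
)
```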
@@ -76,8 +76,9 @@ class LoggingCallback(WriteCallback):
  failure_metadata: dict,
  ) -> None:
  logger.error(
- f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}"
- f" with {failure_exception} and info {failure_metadata}"
+ f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+ extra={"failure_metadata": failure_metadata},
+ exc_info=failure_exception,
  )


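The `LoggingCallback` change stops interpolating the exception and metadata into the message and hands them to the logging framework instead, which preserves the traceback and keeps the metadata machine-readable. A self-contained sketch of the same pattern with stand-in names:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def write_record(record_id: str) -> None:
    raise ValueError("downstream rejected the record")  # stand-in failure

try:
    write_record("wu-123")
except Exception as exc:
    # Let the logging framework render the traceback (exc_info=...) and attach
    # structured context (extra=...), instead of flattening both into the message.
    logger.error(
        "failed to write record wu-123",
        extra={"failure_metadata": {"workunit_id": "wu-123"}},
        exc_info=exc,
    )
```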
@@ -108,9 +109,9 @@ class DeadLetterQueueCallback(WriteCallback):
  mcp.systemMetadata.properties = {}
  if "workunit_id" not in mcp.systemMetadata.properties:
  # update the workunit id
- mcp.systemMetadata.properties[
- "workunit_id"
- ] = record_envelope.metadata["workunit_id"]
+ mcp.systemMetadata.properties["workunit_id"] = (
+ record_envelope.metadata["workunit_id"]
+ )
  record_envelope.record = mcp
  self.file_sink.write_record_async(record_envelope, self.logging_callback)

@@ -700,7 +701,7 @@ class Pipeline:
  num_failures_sink = len(self.sink.get_report().failures)
  click.secho(
  message_template.format(
- status=f"with at least {num_failures_source+num_failures_sink} failures"
+ status=f"with at least {num_failures_source + num_failures_sink} failures"
  ),
  fg=self._get_text_color(
  running=currently_running, failures=True, warnings=False
@@ -718,7 +719,7 @@ class Pipeline:
  num_warn_global = len(global_warnings)
  click.secho(
  message_template.format(
- status=f"with at least {num_warn_source+num_warn_sink+num_warn_global} warnings"
+ status=f"with at least {num_warn_source + num_warn_sink + num_warn_global} warnings"
  ),
  fg=self._get_text_color(
  running=currently_running, failures=False, warnings=True
@@ -92,9 +92,9 @@ class PipelineConfig(ConfigModel):
  pipeline_name: Optional[str] = None
  failure_log: FailureLoggingConfig = FailureLoggingConfig()

- _raw_dict: Optional[
- dict
- ] = None # the raw dict that was parsed to construct this config
+ _raw_dict: Optional[dict] = (
+ None # the raw dict that was parsed to construct this config
+ )

  @validator("run_id", pre=True, always=True)
  def run_id_should_be_semantic(
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
  if field_level_metric.startswith("include_field_"):
  values.setdefault(field_level_metric, False)

- assert (
- max_num_fields_to_profile is None
- ), f"{max_num_fields_to_profile_key} should be set to None"
+ assert max_num_fields_to_profile is None, (
+ f"{max_num_fields_to_profile_key} should be set to None"
+ )

  return values
@@ -508,7 +508,12 @@ class ABSSource(StatefulIngestionSourceBase):
  ):
  abs_path = self.create_abs_path(obj.name)
  logger.debug(f"Sampling file: {abs_path}")
- yield abs_path, obj.name, obj.last_modified, obj.size,
+ yield (
+ abs_path,
+ obj.name,
+ obj.last_modified,
+ obj.size,
+ )
  except Exception as e:
  # This odd check if being done because boto does not have a proper exception to catch
  # The exception that appears in stacktrace cannot actually be caught without a lot more work
@@ -552,9 +557,12 @@ class ABSSource(StatefulIngestionSourceBase):
  if os.path.isfile(prefix):
  logger.debug(f"Scanning single local file: {prefix}")
  file_name = prefix
- yield prefix, file_name, datetime.utcfromtimestamp(
- os.path.getmtime(prefix)
- ), os.path.getsize(prefix)
+ yield (
+ prefix,
+ file_name,
+ datetime.utcfromtimestamp(os.path.getmtime(prefix)),
+ os.path.getsize(prefix),
+ )
  else:
  logger.debug(f"Scanning files under local folder: {prefix}")
  for root, dirs, files in os.walk(prefix):
@@ -565,9 +573,12 @@ class ABSSource(StatefulIngestionSourceBase):
  full_path = PurePath(
  os.path.normpath(os.path.join(root, file))
  ).as_posix()
- yield full_path, file, datetime.utcfromtimestamp(
- os.path.getmtime(full_path)
- ), os.path.getsize(full_path)
+ yield (
+ full_path,
+ file,
+ datetime.utcfromtimestamp(os.path.getmtime(full_path)),
+ os.path.getsize(full_path),
+ )

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
  self.container_WU_creator = ContainerWUCreator(
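The three ABS hunks above only add explicit parentheses around multi-value yield statements; an unparenthesized `yield a, b, c,` already yields a single tuple, and the trailing comma does not add an element, so the rewrite is purely cosmetic. A tiny demonstration:

```python
from typing import Iterator, Tuple

def implicit() -> Iterator[Tuple[int, int, int]]:
    yield 1, 2, 3,  # bare form: still a single 3-tuple

def explicit() -> Iterator[Tuple[int, int, int]]:
    yield (
        1,
        2,
        3,
    )  # parenthesized form: identical value, easier to read

assert next(implicit()) == next(explicit()) == (1, 2, 3)
```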
@@ -613,7 +624,7 @@ class ABSSource(StatefulIngestionSourceBase):
  table_data.table_path
  ].timestamp = table_data.timestamp

- for guid, table_data in table_dict.items():
+ for _, table_data in table_dict.items():
  yield from self.ingest_table(table_data, path_spec)

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
@@ -248,6 +248,9 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
  "Enabled by default when stateful ingestion is turned on.",
  )
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+ @capability(
+ SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+ )
  class GlueSource(StatefulIngestionSourceBase):
  """
  Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.
@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
  "Action": [
  "glue:GetDataflowGraph",
  "glue:GetJobs",
+ "s3:GetObject",
  ],
  "Resource": "*"
  }
  ```

- plus `s3:GetObject` for the job script locations.
+ For profiling datasets, the following additional permissions are required:
+ ```json
+ {
+ "Effect": "Allow",
+ "Action": [
+ "glue:GetPartitions",
+ ],
+ "Resource": "*"
+ }
+ ```

  """

@@ -508,7 +521,7 @@ class GlueSource(StatefulIngestionSourceBase):
  # otherwise, a node represents a transformation
  else:
  node_urn = mce_builder.make_data_job_urn_with_flow(
- flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
+ flow_urn, job_id=f"{node['NodeType']}-{node['Id']}"
  )

  return {
@@ -666,7 +679,7 @@ class GlueSource(StatefulIngestionSourceBase):
  )
  )

- return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce)
+ return MetadataWorkUnit(id=f"{job_name}-{node['Id']}", mce=mce)

  def get_all_databases(self) -> Iterable[Mapping[str, Any]]:
  logger.debug("Getting all databases")
@@ -737,13 +750,13 @@ class GlueSource(StatefulIngestionSourceBase):
  ) -> Optional[MetadataWorkUnit]:
  if self.source_config.emit_s3_lineage:
  # extract dataset properties aspect
- dataset_properties: Optional[
- DatasetPropertiesClass
- ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+ dataset_properties: Optional[DatasetPropertiesClass] = (
+ mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
+ )
  # extract dataset schema aspect
- schema_metadata: Optional[
- SchemaMetadataClass
- ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+ schema_metadata: Optional[SchemaMetadataClass] = (
+ mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)
+ )

  if dataset_properties and "Location" in dataset_properties.customProperties:
  location = dataset_properties.customProperties["Location"]
@@ -752,9 +765,9 @@ class GlueSource(StatefulIngestionSourceBase):
  location, self.source_config.env
  )
  assert self.ctx.graph
- schema_metadata_for_s3: Optional[
- SchemaMetadataClass
- ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+ schema_metadata_for_s3: Optional[SchemaMetadataClass] = (
+ self.ctx.graph.get_schema_metadata(s3_dataset_urn)
+ )

  if self.source_config.glue_s3_lineage_direction == "upstream":
  fine_grained_lineages = None
@@ -1054,49 +1067,66 @@ class GlueSource(StatefulIngestionSourceBase):
  yield from self.gen_database_containers(database)

  for table in tables:
- database_name = table["DatabaseName"]
  table_name = table["Name"]
- full_table_name = f"{database_name}.{table_name}"
- self.report.report_table_scanned()
- if not self.source_config.database_pattern.allowed(
- database_name
- ) or not self.source_config.table_pattern.allowed(full_table_name):
- self.report.report_table_dropped(full_table_name)
- continue
+ try:
+ yield from self._gen_table_wu(table=table)
+ except KeyError as e:
+ self.report.report_failure(
+ message="Failed to extract workunit for table",
+ context=f"Table: {table_name}",
+ exc=e,
+ )
+ if self.extract_transforms:
+ yield from self._transform_extraction()

- dataset_urn = make_dataset_urn_with_platform_instance(
- platform=self.platform,
- name=full_table_name,
- env=self.env,
- platform_instance=self.source_config.platform_instance,
- )
+ def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+ database_name = table["DatabaseName"]
+ table_name = table["Name"]
+ full_table_name = f"{database_name}.{table_name}"
+ self.report.report_table_scanned()
+ if not self.source_config.database_pattern.allowed(
+ database_name
+ ) or not self.source_config.table_pattern.allowed(full_table_name):
+ self.report.report_table_dropped(full_table_name)
+ return
+
+ dataset_urn = make_dataset_urn_with_platform_instance(
+ platform=self.platform,
+ name=full_table_name,
+ env=self.env,
+ platform_instance=self.source_config.platform_instance,
+ )

- mce = self._extract_record(dataset_urn, table, full_table_name)
- yield MetadataWorkUnit(full_table_name, mce=mce)
+ mce = self._extract_record(dataset_urn, table, full_table_name)
+ yield MetadataWorkUnit(full_table_name, mce=mce)

- # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
- # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
- yield MetadataChangeProposalWrapper(
- entityUrn=dataset_urn,
- aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
- ).as_workunit()
+ # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+ # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+ yield MetadataChangeProposalWrapper(
+ entityUrn=dataset_urn,
+ aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+ ).as_workunit()

- yield from self._get_domain_wu(
- dataset_name=full_table_name,
- entity_urn=dataset_urn,
- )
- yield from self.add_table_to_database_container(
- dataset_urn=dataset_urn, db_name=database_name
- )
+ yield from self._get_domain_wu(
+ dataset_name=full_table_name,
+ entity_urn=dataset_urn,
+ )
+ yield from self.add_table_to_database_container(
+ dataset_urn=dataset_urn, db_name=database_name
+ )

- wu = self.get_lineage_if_enabled(mce)
- if wu:
- yield wu
+ wu = self.get_lineage_if_enabled(mce)
+ if wu:
+ yield wu

+ try:
  yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
- if self.extract_transforms:
- yield from self._transform_extraction()
+ except KeyError as e:
+ self.report.report_failure(
+ message="Failed to extract profile for table",
+ context=f"Table: {dataset_urn}",
+ exc=e,
+ )

  def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
  dags: Dict[str, Optional[Dict[str, Any]]] = {}
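The Glue refactor above moves the per-table work into `_gen_table_wu` and wraps each call in `try/except KeyError`, so one malformed table is reported as a structured failure instead of aborting the whole listing. A generic sketch of that isolation pattern; the reporter and helpers here are stand-ins, not DataHub's actual report object:

```python
from typing import Any, Dict, Iterable, List


def process_table(table: Dict[str, Any]) -> Iterable[str]:
    # Raises KeyError when required fields are missing, like a malformed Glue table.
    yield f"{table['DatabaseName']}.{table['Name']}"


def process_all(tables: List[Dict[str, Any]], failures: List[str]) -> Iterable[str]:
    for table in tables:
        try:
            yield from process_table(table)
        except KeyError as e:
            # Record the failure and keep going instead of aborting the run.
            failures.append(f"missing key {e} in table {table.get('Name', '<unknown>')}")


failures: List[str] = []
tables = [{"DatabaseName": "db", "Name": "good"}, {"Name": "broken"}]
print(list(process_all(tables, failures)))  # ['db.good']
print(failures)                             # one entry for the broken table
```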
@@ -40,7 +40,7 @@ def get_s3_tags(
  ]
  )
  except s3.meta.client.exceptions.ClientError:
- logger.warn(f"No tags found for bucket={bucket_name}")
+ logger.warning(f"No tags found for bucket={bucket_name}")

  if use_s3_object_tags and key_name is not None:
  s3_client = aws_config.get_s3_client()
@@ -53,7 +53,7 @@ def get_s3_tags(
  else:
  # Unlike bucket tags, if an object does not have tags, it will just return an empty array
  # as opposed to an exception.
- logger.warn(f"No tags found for bucket={bucket_name} key={key_name}")
+ logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
  if len(tags_to_add) == 0:
  return None
  if ctx.graph is not None:
@@ -65,7 +65,7 @@ def get_s3_tags(
  if current_tags:
  tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
  else:
- logger.warn("Could not connect to DatahubApi. No current tags to maintain")
+ logger.warning("Could not connect to DatahubApi. No current tags to maintain")
  # Remove duplicate tags
  tags_to_add = sorted(list(set(tags_to_add)))
  new_tags = GlobalTagsClass(
@@ -1,6 +1,11 @@
  import logging
  import os
- from typing import Optional
+ from collections import defaultdict
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+ if TYPE_CHECKING:
+ from mypy_boto3_s3.service_resource import ObjectSummary
+

  S3_PREFIXES = ["s3://", "s3n://", "s3a://"]

@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
  f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
  )
  return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+ def group_s3_objects_by_dirname(
+ s3_objects: Iterable["ObjectSummary"],
+ ) -> Dict[str, List["ObjectSummary"]]:
+ """
+ Groups S3 objects by their directory name.
+
+ If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
+ """
+ grouped_s3_objs = defaultdict(list)
+ for obj in s3_objects:
+ if "/" in obj.key:
+ dirname = obj.key.rsplit("/", 1)[0]
+ else:
+ dirname = "/"
+ grouped_s3_objs[dirname].append(obj)
+ return grouped_s3_objs
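The new `group_s3_objects_by_dirname` helper only relies on each object exposing a `.key` attribute, so its behavior can be sketched with a plain stand-in instead of a real boto3 `ObjectSummary`; the fake class below is purely illustrative:

```python
from dataclasses import dataclass

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname


@dataclass
class FakeObjectSummary:  # minimal stand-in for mypy_boto3_s3's ObjectSummary
    key: str


objs = [
    FakeObjectSummary("logs/2024/01/part-0.parquet"),
    FakeObjectSummary("logs/2024/01/part-1.parquet"),
    FakeObjectSummary("top-level.txt"),
]
grouped = group_s3_objects_by_dirname(objs)
assert set(grouped) == {"logs/2024/01", "/"}
assert len(grouped["logs/2024/01"]) == 2
```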
@@ -257,7 +257,7 @@ class FeatureGroupProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

  return MetadataWorkUnit(
- id=f'{feature_group_details["FeatureGroupName"]}-{feature["FeatureName"]}',
+ id=f"{feature_group_details['FeatureGroupName']}-{feature['FeatureName']}",
  mce=mce,
  )

@@ -212,7 +212,7 @@ class ModelProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=endpoint_snapshot)

  return MetadataWorkUnit(
- id=f'{endpoint_details["EndpointName"]}',
+ id=f"{endpoint_details['EndpointName']}",
  mce=mce,
  )

@@ -503,7 +503,7 @@ class ModelProcessor:
  mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)

  return MetadataWorkUnit(
- id=f'{model_details["ModelName"]}',
+ id=f"{model_details['ModelName']}",
  mce=mce,
  )

@@ -132,9 +132,9 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
  self.filters = BigQueryFilter(self.config, self.report)
  self.identifiers = BigQueryIdentifierBuilder(self.config, self.report)

- redundant_lineage_run_skip_handler: Optional[
- RedundantLineageRunSkipHandler
- ] = None
+ redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
+ None
+ )
  if self.config.enable_stateful_lineage_ingestion:
  redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
  source=self,
@@ -253,14 +253,14 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
  for project in projects:
  yield from self.bq_schema_extractor.get_project_workunits(project)

- self.report.set_ingestion_stage("*", "View and Snapshot Lineage")
- yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
- [p.id for p in projects],
- self.bq_schema_extractor.view_refs_by_project,
- self.bq_schema_extractor.view_definitions,
- self.bq_schema_extractor.snapshot_refs_by_project,
- self.bq_schema_extractor.snapshots_by_ref,
- )
+ with self.report.new_stage("*: View and Snapshot Lineage"):
+ yield from self.lineage_extractor.get_lineage_workunits_for_views_and_snapshots(
+ [p.id for p in projects],
+ self.bq_schema_extractor.view_refs_by_project,
+ self.bq_schema_extractor.view_definitions,
+ self.bq_schema_extractor.snapshot_refs_by_project,
+ self.bq_schema_extractor.snapshots_by_ref,
+ )

  if self.config.use_queries_v2:
  # if both usage and lineage are disabled then skip queries extractor piece
@@ -270,29 +270,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
  ):
  return

- self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
- with BigQueryQueriesExtractor(
- connection=self.config.get_bigquery_client(),
- schema_api=self.bq_schema_extractor.schema_api,
- config=BigQueryQueriesExtractorConfig(
- window=self.config,
- user_email_pattern=self.config.usage.user_email_pattern,
- include_lineage=self.config.include_table_lineage,
- include_usage_statistics=self.config.include_usage_statistics,
- include_operations=self.config.usage.include_operational_stats,
- top_n_queries=self.config.usage.top_n_queries,
- region_qualifiers=self.config.region_qualifiers,
- ),
- structured_report=self.report,
- filters=self.filters,
- identifiers=self.identifiers,
- schema_resolver=self.sql_parser_schema_resolver,
- discovered_tables=self.bq_schema_extractor.table_refs,
- ) as queries_extractor:
- self.report.queries_extractor = queries_extractor.report
- yield from queries_extractor.get_workunits_internal()
-
+ with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+ with BigQueryQueriesExtractor(
+ connection=self.config.get_bigquery_client(),
+ schema_api=self.bq_schema_extractor.schema_api,
+ config=BigQueryQueriesExtractorConfig(
+ window=self.config,
+ user_email_pattern=self.config.usage.user_email_pattern,
+ include_lineage=self.config.include_table_lineage,
+ include_usage_statistics=self.config.include_usage_statistics,
+ include_operations=self.config.usage.include_operational_stats,
+ include_queries=self.config.include_queries,
+ include_query_usage_statistics=self.config.include_query_usage_statistics,
+ top_n_queries=self.config.usage.top_n_queries,
+ region_qualifiers=self.config.region_qualifiers,
+ ),
+ structured_report=self.report,
+ filters=self.filters,
+ identifiers=self.identifiers,
+ schema_resolver=self.sql_parser_schema_resolver,
+ discovered_tables=self.bq_schema_extractor.table_refs,
+ ) as queries_extractor:
+ self.report.queries_extractor = queries_extractor.report
+ yield from queries_extractor.get_workunits_internal()
  else:
  if self.config.include_usage_statistics:
  yield from self.usage_extractor.get_usage_workunits(
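Both BigQuery hunks above replace `set_ingestion_stage(...)` calls with `with self.report.new_stage(...)` blocks, so a stage's start, end, and duration are tied to the block rather than to whenever the next stage happens to begin. A hedged sketch of what such a context-manager-based stage reporter could look like; this is an illustration, not the actual ingestion-stage report implementation:

```python
import time
from contextlib import contextmanager
from typing import Dict, Iterator


class StageReport:
    """Records how long each named ingestion stage took."""

    def __init__(self) -> None:
        self.stage_durations: Dict[str, float] = {}

    @contextmanager
    def new_stage(self, name: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # The stage ends when the with-block exits, even on error.
            self.stage_durations[name] = time.perf_counter() - start


report = StageReport()
with report.new_stage("*: View and Snapshot Lineage"):
    time.sleep(0.01)
print(report.stage_durations)
```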
@@ -37,9 +37,9 @@ class BigqueryTableIdentifier:

  # Note: this regex may get overwritten by the sharded_table_pattern config.
  # The class-level constant, however, will not be overwritten.
- _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[
- str
- ] = _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+ _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = (
+ _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX
+ )
  _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$"
  _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd"