acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """
 
  _DBT_FIELDS_BY_TYPE = {
  "models": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
- { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  dependsOn
  materializedType
  """,
  "seeds": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
- { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  """,
  "sources": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
  identifier
  sourceName
  sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
  loader
  """,
  "snapshots": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
- { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
- { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
+ {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+ {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
  parentsSources {{
  uniqueId
  }}
@@ -229,7 +229,7 @@ _DBT_FIELDS_BY_TYPE = {
  }}
  """,
  "tests": f"""
- { _DBT_GRAPHQL_COMMON_FIELDS }
+ {_DBT_GRAPHQL_COMMON_FIELDS}
  state
  columnName
  status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
  res = response.json()
  if "errors" in res:
  raise ValueError(
- f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
+ f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
  )
  data = res["data"]
  except JSONDecodeError as e:
@@ -506,16 +506,18 @@ class DBTNode:
  materialization: Optional[str] # table, view, ephemeral, incremental, snapshot
  # see https://docs.getdbt.com/reference/artifacts/manifest-json
  catalog_type: Optional[str]
- missing_from_catalog: bool # indicates if the node was missing from the catalog.json
+ missing_from_catalog: (
+ bool # indicates if the node was missing from the catalog.json
+ )
 
  owner: Optional[str]
 
  columns: List[DBTColumn] = field(default_factory=list)
  upstream_nodes: List[str] = field(default_factory=list) # list of upstream dbt_name
  upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
- raw_sql_parsing_result: Optional[
- SqlParsingResult
- ] = None # only set for nodes that don't depend on ephemeral models
+ raw_sql_parsing_result: Optional[SqlParsingResult] = (
+ None # only set for nodes that don't depend on ephemeral models
+ )
  cll_debug_info: Optional[SqlParsingDebugInfo] = None
 
  meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
  "platform": DBT_PLATFORM,
  "name": node.dbt_name,
  "instance": self.config.platform_instance,
+ # Ideally we'd include the env unconditionally. However, we started out
+ # not including env in the guid, so we need to maintain backwards compatibility
+ # with existing PROD assertions.
  **(
- # Ideally we'd include the env unconditionally. However, we started out
- # not including env in the guid, so we need to maintain backwards compatibility
- # with existing PROD assertions.
  {"env": self.config.env}
  if self.config.env != mce_builder.DEFAULT_ENV
  and self.config.include_env_in_assertion_guid
@@ -122,11 +122,6 @@ class DeltaLakeSource(Source):
  config_report,
  )
 
- @classmethod
- def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
- config = DeltaLakeSourceConfig.parse_obj(config_dict)
- return cls(config, ctx)
-
  def _parse_datatype(self, raw_field_json_str: str) -> List[SchemaFieldClass]:
  raw_field_json = json.loads(raw_field_json_str)
 
@@ -29,7 +29,7 @@ class DemoDataSource(Source):
 
  def __init__(self, ctx: PipelineContext, config: DemoDataConfig):
  file_config = FileSourceConfig(path=str(download_sample_data()))
- self.file_source = GenericFileSource(ctx, file_config)
+ self.file_source: GenericFileSource = GenericFileSource(ctx, file_config)
 
  def get_workunits(self) -> Iterable[MetadataWorkUnit]:
  yield from self.file_source.get_workunits()
@@ -181,7 +181,7 @@ class DremioAPIOperations:
  return
 
  # On-prem Dremio authentication (PAT or Basic Auth)
- for retry in range(1, self._retry_count + 1):
+ for _ in range(1, self._retry_count + 1):
  try:
  if connection_args.authentication_method == "PAT":
  self.session.headers.update(
@@ -191,9 +191,9 @@ class DremioAPIOperations:
  )
  return
  else:
- assert (
- connection_args.username and connection_args.password
- ), "Username and password are required for authentication"
+ assert connection_args.username and connection_args.password, (
+ "Username and password are required for authentication"
+ )
  host = connection_args.hostname
  port = connection_args.port
  protocol = "https" if connection_args.tls else "http"
@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
  Add a new source type if not in the map (e.g., Dremio ARP).
  """
  dremio_source_type = dremio_source_type.upper()
- DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
- dremio_source_type
- ] = datahub_source_type
+ DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+ datahub_source_type
+ )
 
  if category:
  if category.lower() == "file_object_storage":
@@ -45,6 +45,3 @@ class DremioSourceReport(
  self.views_scanned += 1
  else:
  raise KeyError(f"Unknown entity {ent_type}.")
-
- def set_ingestion_stage(self, dataset: str, stage: str) -> None:
- self.report_ingestion_stage_start(f"{dataset}: {stage}")
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
  env=self.config.env,
  platform_instance=self.config.platform_instance,
  )
- self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING)
- yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+ with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+ yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
  def generate_view_lineage(
  self, dataset_urn: str, parents: List[str]
@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:
 
  @staticmethod
  def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
- type_class: Optional[
- Type
- ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
- elastic_column_type
+ type_class: Optional[Type] = (
+ ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+ elastic_column_type
+ )
  )
  if type_class is None:
  logger.warning(
@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
- from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.fivetran.config import (
  KNOWN_DATA_PLATFORM_MAPPING,
@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
  dpi = self._generate_dpi_from_job(job, datajob)
  yield from self._get_dpi_workunits(job, dpi)
 
- @classmethod
- def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
- config = FivetranSourceConfig.parse_obj(config_dict)
- return cls(config, ctx)
-
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
  return [
  *super().get_workunit_processors(),
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
  ) -> Iterable[MetadataWorkUnit]:
  if self.config.cleanup_expired_tokens:
  try:
- self.report.report_ingestion_stage_start("Expired Token Cleanup")
- self.revoke_expired_tokens()
+ with self.report.new_stage("Expired Token Cleanup"):
+ self.revoke_expired_tokens()
  except Exception as e:
  self.report.failure("While trying to cleanup expired token ", exc=e)
  if self.config.truncate_indices:
  try:
- self.report.report_ingestion_stage_start("Truncate Indices")
- self.truncate_indices()
+ with self.report.new_stage("Truncate Indices"):
+ self.truncate_indices()
  except Exception as e:
  self.report.failure("While trying to truncate indices ", exc=e)
  if self.config.soft_deleted_entities_cleanup.enabled:
  try:
- self.report.report_ingestion_stage_start(
- "Soft Deleted Entities Cleanup"
- )
- self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+ with self.report.new_stage("Soft Deleted Entities Cleanup"):
+ self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
  except Exception as e:
  self.report.failure(
  "While trying to cleanup soft deleted entities ", exc=e
  )
  if self.config.dataprocess_cleanup.enabled:
  try:
- self.report.report_ingestion_stage_start("Data Process Cleanup")
- yield from self.dataprocess_cleanup.get_workunits_internal()
+ with self.report.new_stage("Data Process Cleanup"):
+ yield from self.dataprocess_cleanup.get_workunits_internal()
  except Exception as e:
  self.report.failure("While trying to cleanup data process ", exc=e)
  if self.config.execution_request_cleanup.enabled:
  try:
- self.report.report_ingestion_stage_start("Execution request Cleanup")
- self.execution_request_cleanup.run()
+ with self.report.new_stage("Execution request Cleanup"):
+ self.execution_request_cleanup.run()
  except Exception as e:
  self.report.failure("While trying to cleanup execution request ", exc=e)
- # Otherwise last stage's duration does not get calculated.
- self.report.report_ingestion_stage_start("End")
  yield from []
 
  def truncate_indices(self) -> None:
@@ -296,6 +292,7 @@ class DataHubGcSource(Source):
  tokens = list_access_tokens.get("tokens", [])
  total = list_access_tokens.get("total", 0)
  if tokens == []:
+ # Due to a server bug we cannot rely on just total
  break
  for token in tokens:
  self.report.expired_tokens_revoked += 1
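The hunks above replace paired report_ingestion_stage_start(...) calls (including the trailing "End" sentinel stage) with a single with report.new_stage(...) block. The changed ingestion_stage.py is not excerpted here, but the general shape of a context-manager stage timer is roughly the following; this is an illustrative sketch only, and the StageTimerSketch class is hypothetical, not DataHub's implementation:

```python
import time
from contextlib import contextmanager
from typing import Dict, Iterator


class StageTimerSketch:
    """Hypothetical sketch: records each stage's duration on context exit."""

    def __init__(self) -> None:
        self.stage_durations: Dict[str, float] = {}

    @contextmanager
    def new_stage(self, name: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # The duration is captured when the block exits, even on error,
            # which is why no explicit "End" stage marker is needed.
            self.stage_durations[name] = time.perf_counter() - start
```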
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
  )
 
  keep_history_max_days: int = Field(
- 30,
+ 90,
  description="Maximum number of days to keep execution requests for, per ingestion source",
  )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
  description="Maximum runtime in seconds for the cleanup task",
  )
 
+ limit_entities_delete: Optional[int] = Field(
+ 10000, description="Max number of execution requests to hard delete."
+ )
+
  max_read_errors: int = Field(
  default=10,
  description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
  ergc_delete_errors: int = 0
  ergc_start_time: Optional[datetime.datetime] = None
  ergc_end_time: Optional[datetime.datetime] = None
+ ergc_delete_limit_reached: bool = False
+ ergc_runtime_limit_reached: bool = False
 
 
  class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
  self.graph = graph
  self.report = report
  self.instance_id = int(time.time())
+ self.last_print_time = 0.0
 
  if config is not None:
  self.config = config
  else:
  self.config = DatahubExecutionRequestCleanupConfig()
 
+ def _print_report(self) -> None:
+ time_taken = round(time.time() - self.last_print_time, 1)
+ # Print report every 2 minutes
+ if time_taken > 120:
+ self.last_print_time = time.time()
+ logger.info(f"\n{self.report.as_string()}")
+
  def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
  input_aspect = (
  entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
  running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
  for entry in self._scroll_execution_requests():
+ self._print_report()
  self.report.ergc_records_read += 1
  key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
  f"record timestamp: {entry.requested_at}."
  )
  )
- self.report.ergc_records_deleted += 1
  yield entry
 
  def _delete_entry(self, entry: CleanupRecord) -> None:
  try:
- logger.info(
- f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
- )
  self.graph.delete_entity(entry.urn, True)
+ self.report.ergc_records_deleted += 1
  except Exception as e:
  self.report.ergc_delete_errors += 1
  self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
  >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
  )
  ):
+ self.report.ergc_runtime_limit_reached = True
  logger.info(f"ergc({self.instance_id}): max runtime reached.")
  return True
  return False
 
+ def _reached_delete_limit(self) -> bool:
+ if (
+ self.config.limit_entities_delete
+ and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+ ):
+ logger.info(
+ f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+ )
+ self.report.ergc_delete_limit_reached = True
+ return True
+ return False
+
  def run(self) -> None:
  if not self.config.enabled:
  logger.info(
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
  )
 
  for entry in self._scroll_garbage_records():
- if self._reached_runtime_limit():
+ if self._reached_runtime_limit() or self._reached_delete_limit():
  break
  self._delete_entry(entry)
 
@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn
 
  logger = logging.getLogger(__name__)
 
- QUERY_QUERY_ENTITY = """
- query listQueries($input: ScrollAcrossEntitiesInput!) {
+ QUERY_ENTITIES = """
+ query listEntities($input: ScrollAcrossEntitiesInput!) {
  scrollAcrossEntities(input: $input) {
  nextScrollId
  count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
  ... on QueryEntity {
  urn
  }
+ ... on DataProcessInstance {
+ urn
+ }
  }
  }
  }
@@ -96,7 +99,8 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
 
  @dataclass
  class SoftDeletedEntitiesReport(SourceReport):
- num_queries_found: int = 0
+ num_calls_made: Dict[str, int] = field(default_factory=dict)
+ num_entities_found: Dict[str, int] = field(default_factory=dict)
  num_soft_deleted_entity_processed: int = 0
  num_soft_deleted_retained_due_to_age: int = 0
  num_soft_deleted_entity_removal_started: int = 0
@@ -151,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
  current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
  self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
  if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
- self.report.sample_hard_deleted_aspects_by_type[
- entity_type
- ] = LossyList()
+ self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+ LossyList()
+ )
  self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
  def delete_entity(self, urn: str) -> None:
@@ -225,19 +229,33 @@ class SoftDeletedEntitiesCleanup:
  time.sleep(self.config.delay)
  return futures
 
- def _get_soft_deleted_queries(self) -> Iterable[str]:
+ def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
  assert self.ctx.graph
  scroll_id: Optional[str] = None
+
+ batch_size = self.config.batch_size
+ if entity_type == "DATA_PROCESS_INSTANCE":
+ # Due to a bug in Data process instance querying this is a temp workaround
+ # to avoid a giant stacktrace by having a smaller batch size in first call
+ # This will be remove in future version after server with fix has been
+ # around for a while
+ batch_size = 10
+
  while True:
  try:
+ if entity_type not in self.report.num_calls_made:
+ self.report.num_calls_made[entity_type] = 1
+ else:
+ self.report.num_calls_made[entity_type] += 1
+ self._print_report()
  result = self.ctx.graph.execute_graphql(
- QUERY_QUERY_ENTITY,
+ graphql_query,
  {
  "input": {
- "types": ["QUERY"],
+ "types": [entity_type],
  "query": "*",
  "scrollId": scroll_id if scroll_id else None,
- "count": self.config.batch_size,
+ "count": batch_size,
  "orFilters": [
  {
  "and": [
@@ -254,15 +272,29 @@ class SoftDeletedEntitiesCleanup:
  )
  except Exception as e:
  self.report.failure(
- f"While trying to get queries with {scroll_id}", exc=e
+ f"While trying to get {entity_type} with {scroll_id}", exc=e
  )
  break
  scroll_across_entities = result.get("scrollAcrossEntities")
- if not scroll_across_entities or not scroll_across_entities.get("count"):
+ if not scroll_across_entities:
  break
+ search_results = scroll_across_entities.get("searchResults")
+ count = scroll_across_entities.get("count")
+ if not count or not search_results:
+ # Due to a server bug we cannot rely on just count as it was returning response like this
+ # {'count': 1, 'nextScrollId': None, 'searchResults': []}
+ break
+ if entity_type == "DATA_PROCESS_INSTANCE":
+ # Temp workaround. See note in beginning of the function
+ # We make the batch size = config after call has succeeded once
+ batch_size = self.config.batch_size
  scroll_id = scroll_across_entities.get("nextScrollId")
- self.report.num_queries_found += scroll_across_entities.get("count")
- for query in scroll_across_entities.get("searchResults"):
+ if entity_type not in self.report.num_entities_found:
+ self.report.num_entities_found[entity_type] = 0
+ self.report.num_entities_found[entity_type] += scroll_across_entities.get(
+ "count"
+ )
+ for query in search_results:
  yield query["entity"]["urn"]
 
  def _get_urns(self) -> Iterable[str]:
@@ -275,7 +307,8 @@ class SoftDeletedEntitiesCleanup:
  status=RemovedStatusFilter.ONLY_SOFT_DELETED,
  batch_size=self.config.batch_size,
  )
- yield from self._get_soft_deleted_queries()
+ yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+ yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
  def _times_up(self) -> bool:
  if (
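Taken together, the soft-delete hunks above turn the old query-only scroll into a reusable scrollAcrossEntities pagination loop that is now invoked for both QUERY and DATA_PROCESS_INSTANCE. A condensed standalone sketch of that loop is shown below; the stop conditions mirror the diff, while the execute_graphql callable, the function name, and the omission of the soft-deleted orFilters are illustrative assumptions, not DataHub's API:

```python
from typing import Callable, Dict, Iterable, Optional


def scroll_soft_deleted_urns(
    execute_graphql: Callable[[str, Dict], Dict],  # assumed GraphQL executor
    graphql_query: str,
    entity_type: str,
    batch_size: int = 1000,
) -> Iterable[str]:
    """Illustrative sketch of the scrollAcrossEntities pagination pattern."""
    scroll_id: Optional[str] = None
    while True:
        result = execute_graphql(
            graphql_query,
            {
                "input": {
                    "types": [entity_type],
                    "query": "*",
                    "scrollId": scroll_id,
                    "count": batch_size,
                    # The real code also passes soft-deleted orFilters here.
                }
            },
        )
        page = result.get("scrollAcrossEntities")
        if not page:
            break
        search_results = page.get("searchResults")
        # Both checks are needed: per the comment in the diff, the server can
        # report a non-zero count with an empty searchResults list.
        if not page.get("count") or not search_results:
            break
        scroll_id = page.get("nextScrollId")
        for hit in search_results:
            yield hit["entity"]["urn"]
```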
@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
  source.source_config.platform = PLATFORM_GCS
 
  source.is_s3_platform = lambda: True # type: ignore
- source.create_s3_path = lambda bucket_name, key: unquote(f"s3://{bucket_name}/{key}") # type: ignore
-
+ source.create_s3_path = lambda bucket_name, key: unquote( # type: ignore
+ f"s3://{bucket_name}/{key}"
+ )
  return source
 
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
  "get_column_max",
  "get_column_mean",
  "get_column_stdev",
- "get_column_stdev",
  "get_column_nonnull_count",
  "get_column_unique_count",
  }
@@ -328,7 +327,7 @@ def _is_single_row_query_method(query: Any) -> bool:
 
 
  def _run_with_query_combiner(
- method: Callable[Concatenate["_SingleDatasetProfiler", P], None]
+ method: Callable[Concatenate["_SingleDatasetProfiler", P], None],
  ) -> Callable[Concatenate["_SingleDatasetProfiler", P], None]:
  @functools.wraps(method)
  def inner(
@@ -1538,9 +1537,7 @@ def create_bigquery_temp_table(
  query_job: Optional["google.cloud.bigquery.job.query.QueryJob"] = (
  # In google-cloud-bigquery 3.15.0, the _query_job attribute was
  # made public and renamed to query_job.
- cursor.query_job
- if hasattr(cursor, "query_job")
- else cursor._query_job # type: ignore[attr-defined]
+ cursor.query_job if hasattr(cursor, "query_job") else cursor._query_job # type: ignore[attr-defined]
  )
  assert query_job
  temp_destination_table = query_job.destination
@@ -220,9 +220,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
  )
  values[field_level_metric] = False
 
- assert (
- max_num_fields_to_profile is None
- ), f"{max_num_fields_to_profile_key} should be set to None"
+ assert max_num_fields_to_profile is None, (
+ f"{max_num_fields_to_profile_key} should be set to None"
+ )
 
  # Disable expensive queries.
  if values.get("turn_off_expensive_profiling_metrics"):
@@ -203,7 +203,9 @@ class IcebergSource(StatefulIngestionSourceBase):
  with PerfTimer() as timer:
  table = thread_local.local_catalog.load_table(dataset_path)
  time_taken = timer.elapsed_seconds()
- self.report.report_table_load_time(time_taken)
+ self.report.report_table_load_time(
+ time_taken, dataset_name, table.metadata_location
+ )
  LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
  yield from self._create_iceberg_workunit(dataset_name, table)
  except NoSuchPropertyException as e:
@@ -247,7 +249,10 @@ class IcebergSource(StatefulIngestionSourceBase):
  f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
  )
  except Exception as e:
- self.report.report_failure("general", f"Failed to create workunit: {e}")
+ self.report.report_failure(
+ "general",
+ f"Failed to create workunit for dataset {dataset_name}: {e}",
+ )
  LOGGER.exception(
  f"Exception while processing table {dataset_path}, skipping it.",
  )
@@ -291,9 +296,9 @@ class IcebergSource(StatefulIngestionSourceBase):
  custom_properties["snapshot-id"] = str(
  table.current_snapshot().snapshot_id
  )
- custom_properties[
- "manifest-list"
- ] = table.current_snapshot().manifest_list
+ custom_properties["manifest-list"] = (
+ table.current_snapshot().manifest_list
+ )
  dataset_properties = DatasetPropertiesClass(
  name=table.name()[-1],
  description=table.metadata.properties.get("comment", None),
@@ -312,7 +317,9 @@ class IcebergSource(StatefulIngestionSourceBase):
  dataset_snapshot.aspects.append(schema_metadata)
 
  mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
- self.report.report_table_processing_time(timer.elapsed_seconds())
+ self.report.report_table_processing_time(
+ timer.elapsed_seconds(), dataset_name, table.metadata_location
+ )
  yield MetadataWorkUnit(id=dataset_name, mce=mce)
 
  dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)