acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff shows the changes between these two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (214)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
  3. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +141 -93
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  30. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  31. datahub/ingestion/api/report.py +1 -2
  32. datahub/ingestion/api/source.py +8 -2
  33. datahub/ingestion/api/source_helpers.py +1 -1
  34. datahub/ingestion/extractor/json_schema_util.py +3 -3
  35. datahub/ingestion/extractor/schema_util.py +3 -5
  36. datahub/ingestion/fs/s3_fs.py +3 -3
  37. datahub/ingestion/glossary/classifier.py +2 -3
  38. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  39. datahub/ingestion/graph/client.py +22 -19
  40. datahub/ingestion/graph/config.py +1 -1
  41. datahub/ingestion/run/pipeline.py +8 -7
  42. datahub/ingestion/run/pipeline_config.py +3 -3
  43. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  44. datahub/ingestion/source/abs/source.py +19 -8
  45. datahub/ingestion/source/aws/glue.py +77 -47
  46. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  47. datahub/ingestion/source/aws/s3_util.py +24 -1
  48. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  49. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  50. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  51. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  52. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  53. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  54. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  55. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  56. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  57. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  58. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  59. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  60. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  61. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  62. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  63. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  64. datahub/ingestion/source/csv_enricher.py +29 -29
  65. datahub/ingestion/source/datahub/config.py +20 -0
  66. datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
  67. datahub/ingestion/source/datahub/datahub_source.py +13 -3
  68. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  69. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  70. datahub/ingestion/source/delta_lake/source.py +0 -5
  71. datahub/ingestion/source/demo_data.py +1 -1
  72. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  73. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  74. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  75. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  76. datahub/ingestion/source/elastic_search.py +4 -4
  77. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  78. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  79. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  80. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  81. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  82. datahub/ingestion/source/ge_data_profiler.py +2 -5
  83. datahub/ingestion/source/ge_profiling_config.py +3 -3
  84. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  85. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  86. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  87. datahub/ingestion/source/identity/azure_ad.py +3 -3
  88. datahub/ingestion/source/identity/okta.py +3 -3
  89. datahub/ingestion/source/kafka/kafka.py +11 -9
  90. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  93. datahub/ingestion/source/looker/looker_common.py +19 -19
  94. datahub/ingestion/source/looker/looker_config.py +11 -6
  95. datahub/ingestion/source/looker/looker_source.py +25 -25
  96. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  97. datahub/ingestion/source/looker/looker_usage.py +5 -7
  98. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  99. datahub/ingestion/source/looker/lookml_source.py +13 -15
  100. datahub/ingestion/source/looker/view_upstream.py +5 -5
  101. datahub/ingestion/source/metabase.py +1 -6
  102. datahub/ingestion/source/mlflow.py +4 -9
  103. datahub/ingestion/source/mode.py +5 -5
  104. datahub/ingestion/source/mongodb.py +6 -4
  105. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  106. datahub/ingestion/source/nifi.py +24 -31
  107. datahub/ingestion/source/openapi.py +9 -9
  108. datahub/ingestion/source/powerbi/config.py +12 -12
  109. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  110. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  111. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  112. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  113. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  114. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  115. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  116. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  117. datahub/ingestion/source/redash.py +0 -5
  118. datahub/ingestion/source/redshift/config.py +3 -3
  119. datahub/ingestion/source/redshift/redshift.py +45 -46
  120. datahub/ingestion/source/redshift/usage.py +33 -33
  121. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  122. datahub/ingestion/source/s3/source.py +11 -15
  123. datahub/ingestion/source/salesforce.py +26 -25
  124. datahub/ingestion/source/schema/json_schema.py +1 -1
  125. datahub/ingestion/source/sigma/sigma.py +3 -3
  126. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  127. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  128. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  129. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  130. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  131. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  132. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  133. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  134. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  135. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  136. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  137. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  138. datahub/ingestion/source/sql/athena.py +1 -3
  139. datahub/ingestion/source/sql/clickhouse.py +8 -14
  140. datahub/ingestion/source/sql/oracle.py +1 -3
  141. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  142. datahub/ingestion/source/sql/sql_types.py +1 -2
  143. datahub/ingestion/source/sql/sql_utils.py +5 -0
  144. datahub/ingestion/source/sql/teradata.py +18 -5
  145. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  146. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  147. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  148. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  149. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  150. datahub/ingestion/source/superset.py +1 -6
  151. datahub/ingestion/source/tableau/tableau.py +343 -117
  152. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  153. datahub/ingestion/source/unity/config.py +3 -1
  154. datahub/ingestion/source/unity/proxy.py +1 -1
  155. datahub/ingestion/source/unity/source.py +74 -78
  156. datahub/ingestion/source/unity/usage.py +3 -1
  157. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  158. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  159. datahub/ingestion/source/usage/usage_common.py +1 -1
  160. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  161. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  162. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  163. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  164. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  165. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  166. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  167. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  168. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  169. datahub/lite/duckdb_lite.py +12 -10
  170. datahub/metadata/_schema_classes.py +317 -44
  171. datahub/metadata/_urns/urn_defs.py +69 -15
  172. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  173. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  174. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  175. datahub/metadata/schema.avsc +302 -89
  176. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  177. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  179. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  180. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  181. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  182. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  183. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  184. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  185. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  186. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  187. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  188. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  189. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  190. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  191. datahub/secret/datahub_secrets_client.py +12 -21
  192. datahub/secret/secret_common.py +14 -8
  193. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  194. datahub/sql_parsing/schema_resolver.py +5 -10
  195. datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
  196. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  197. datahub/sql_parsing/sqlglot_utils.py +1 -1
  198. datahub/telemetry/stats.py +1 -2
  199. datahub/testing/mcp_diff.py +1 -1
  200. datahub/utilities/file_backed_collections.py +11 -11
  201. datahub/utilities/hive_schema_to_avro.py +2 -2
  202. datahub/utilities/logging_manager.py +2 -2
  203. datahub/utilities/lossy_collections.py +3 -3
  204. datahub/utilities/mapping.py +3 -3
  205. datahub/utilities/memory_footprint.py +3 -2
  206. datahub/utilities/perf_timer.py +11 -6
  207. datahub/utilities/serialized_lru_cache.py +3 -1
  208. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  209. datahub/utilities/sqllineage_patch.py +1 -1
  210. datahub/utilities/stats_collections.py +3 -1
  211. datahub/utilities/urns/_urn_base.py +28 -5
  212. datahub/utilities/urns/urn_iter.py +2 -2
  213. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  214. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/datahub/config.py

@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -24,6 +25,10 @@ DEFAULT_EXCLUDE_ASPECTS = {
     "globalSettingsKey",
     "globalSettingsInfo",
     "testResults",
+    "dataHubExecutionRequestKey",
+    "dataHubExecutionRequestInput",
+    "dataHubExecutionRequestSignal",
+    "dataHubExecutionRequestResult",
 }
 
 
@@ -107,6 +112,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
@@ -119,3 +130,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
             " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
         )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
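
For orientation, the new config surface above can be exercised from a Python ingestion recipe. A hedged sketch, assuming the standard Pipeline.create entry point; hosts, credentials, and the sink block are illustrative placeholders, while drop_duplicate_schema_fields and the mysql+pymysql scheme requirement come from this diff:

from datahub.ingestion.run.pipeline import Pipeline

# Sketch of a DataHub-to-DataHub recipe; connection details are placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "datahub",
            "config": {
                "database_connection": {
                    # The new validator rejects MySQL schemes other than mysql+pymysql.
                    "scheme": "mysql+pymysql",
                    "host_port": "localhost:3306",
                    "username": "datahub",
                    "password": "datahub",
                    "database": "datahub",
                },
                # New option: drop duplicate field paths in schemaMetadata aspects.
                "drop_duplicate_schema_fields": True,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()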
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -151,8 +151,12 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
-                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
+                with (
+                    conn.begin()
+                ):  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +164,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
@@ -236,7 +224,7 @@ class DataHubDatabaseReader:
            )
        except Exception as e:
            logger.warning(
-                f'Failed to parse metadata for {row["urn"]}: {e}', exc_info=True
+                f"Failed to parse metadata for {row['urn']}: {e}", exc_info=True
            )
            self.report.num_database_parse_errors += 1
            self.report.database_parse_errors.setdefault(
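
The streaming branch above leans on SQLAlchemy's stream_results/yield_per execution options (server-side cursors) instead of the removed hand-rolled MySQLdb SSCursor path. A rough standalone sketch of that pattern, assuming SQLAlchemy 1.4+; the URL and query are placeholders:

from sqlalchemy import create_engine, text

# Placeholder URL; for MySQL the DataHub source now insists on mysql+pymysql.
engine = create_engine("mysql+pymysql://user:pass@localhost/datahub")

with engine.connect() as conn:
    with conn.begin():  # transaction keeps the server-side cursor open
        conn = conn.execution_options(stream_results=True, yield_per=2000)
        result = conn.execute(text("SELECT 1 AS value"))  # placeholder query
        for row in result:
            # Rows arrive in batches from the server instead of all at once.
            print(dict(row._mapping))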
datahub/ingestion/source/datahub/datahub_source.py

@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
@@ -130,7 +140,7 @@ class DataHubSource(StatefulIngestionSourceBase):
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
datahub/ingestion/source/dbt/dbt_cloud.py

@@ -194,20 +194,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """
 
 _DBT_FIELDS_BY_TYPE = {
     "models": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         dependsOn
         materializedType
     """,
     "seeds": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
     """,
     "sources": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
         identifier
         sourceName
         sourceDescription
@@ -218,9 +218,9 @@ _DBT_FIELDS_BY_TYPE = {
         loader
     """,
     "snapshots": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
-        { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
-        { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
+        {_DBT_GRAPHQL_NODE_COMMON_FIELDS}
+        {_DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS}
         parentsSources {{
           uniqueId
         }}
@@ -229,7 +229,7 @@
         }}
     """,
     "tests": f"""
-        { _DBT_GRAPHQL_COMMON_FIELDS }
+        {_DBT_GRAPHQL_COMMON_FIELDS}
         state
         columnName
         status
@@ -315,7 +315,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             res = response.json()
             if "errors" in res:
                 raise ValueError(
-                    f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
+                    f"Unable to fetch metadata from dbt Cloud: {res['errors']}"
                 )
             data = res["data"]
         except JSONDecodeError as e:
datahub/ingestion/source/dbt/dbt_common.py

@@ -506,16 +506,18 @@ class DBTNode:
     materialization: Optional[str]  # table, view, ephemeral, incremental, snapshot
     # see https://docs.getdbt.com/reference/artifacts/manifest-json
     catalog_type: Optional[str]
-    missing_from_catalog: bool  # indicates if the node was missing from the catalog.json
+    missing_from_catalog: (
+        bool  # indicates if the node was missing from the catalog.json
+    )
 
     owner: Optional[str]
 
     columns: List[DBTColumn] = field(default_factory=list)
     upstream_nodes: List[str] = field(default_factory=list)  # list of upstream dbt_name
     upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list)
-    raw_sql_parsing_result: Optional[
-        SqlParsingResult
-    ] = None  # only set for nodes that don't depend on ephemeral models
+    raw_sql_parsing_result: Optional[SqlParsingResult] = (
+        None  # only set for nodes that don't depend on ephemeral models
+    )
     cll_debug_info: Optional[SqlParsingDebugInfo] = None
 
     meta: Dict[str, Any] = field(default_factory=dict)
@@ -869,10 +871,10 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             "platform": DBT_PLATFORM,
             "name": node.dbt_name,
             "instance": self.config.platform_instance,
+            # Ideally we'd include the env unconditionally. However, we started out
+            # not including env in the guid, so we need to maintain backwards compatibility
+            # with existing PROD assertions.
             **(
-                # Ideally we'd include the env unconditionally. However, we started out
-                # not including env in the guid, so we need to maintain backwards compatibility
-                # with existing PROD assertions.
                 {"env": self.config.env}
                 if self.config.env != mce_builder.DEFAULT_ENV
                 and self.config.include_env_in_assertion_guid
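
The relocated comment documents why env is only mixed into the assertion guid when it differs from the default. A minimal illustration of that guid construction, assuming mce_builder.datahub_guid and DEFAULT_ENV; the node name and env value are hypothetical:

from datahub.emitter import mce_builder

env = "DEV"  # hypothetical; PROD (the default) would be left out of the dict
guid_dict = {
    "platform": "dbt",
    "name": "model.jaffle_shop.orders",  # hypothetical dbt node name
    "instance": None,
    # Only non-default envs are added, so guids of existing PROD assertions stay stable.
    **({"env": env} if env != mce_builder.DEFAULT_ENV else {}),
}
assertion_guid = mce_builder.datahub_guid(guid_dict)
print(assertion_guid)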
datahub/ingestion/source/delta_lake/source.py

@@ -122,11 +122,6 @@ class DeltaLakeSource(Source):
             config_report,
         )
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = DeltaLakeSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def _parse_datatype(self, raw_field_json_str: str) -> List[SchemaFieldClass]:
         raw_field_json = json.loads(raw_field_json_str)
 
datahub/ingestion/source/demo_data.py

@@ -29,7 +29,7 @@ class DemoDataSource(Source):
 
     def __init__(self, ctx: PipelineContext, config: DemoDataConfig):
         file_config = FileSourceConfig(path=str(download_sample_data()))
-        self.file_source = GenericFileSource(ctx, file_config)
+        self.file_source: GenericFileSource = GenericFileSource(ctx, file_config)
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         yield from self.file_source.get_workunits()
datahub/ingestion/source/dremio/dremio_api.py

@@ -181,7 +181,7 @@ class DremioAPIOperations:
            return
 
        # On-prem Dremio authentication (PAT or Basic Auth)
-        for retry in range(1, self._retry_count + 1):
+        for _ in range(1, self._retry_count + 1):
            try:
                if connection_args.authentication_method == "PAT":
                    self.session.headers.update(
@@ -191,9 +191,9 @@
                    )
                    return
                else:
-                    assert (
-                        connection_args.username and connection_args.password
-                    ), "Username and password are required for authentication"
+                    assert connection_args.username and connection_args.password, (
+                        "Username and password are required for authentication"
+                    )
                    host = connection_args.hostname
                    port = connection_args.port
                    protocol = "https" if connection_args.tls else "http"
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -101,9 +101,9 @@ class DremioToDataHubSourceTypeMapping:
         Add a new source type if not in the map (e.g., Dremio ARP).
         """
         dremio_source_type = dremio_source_type.upper()
-        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[
-            dremio_source_type
-        ] = datahub_source_type
+        DremioToDataHubSourceTypeMapping.SOURCE_TYPE_MAPPING[dremio_source_type] = (
+            datahub_source_type
+        )
 
         if category:
             if category.lower() == "file_object_storage":
datahub/ingestion/source/dremio/dremio_reporting.py

@@ -45,6 +45,3 @@ class DremioSourceReport(
             self.views_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
-
-    def set_ingestion_stage(self, dataset: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{dataset}: {stage}")
datahub/ingestion/source/dremio/dremio_source.py

@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.set_ingestion_stage(dataset_info.resource_name, PROFILING)
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/elastic_search.py

@@ -111,10 +111,10 @@ class ElasticToSchemaFieldConverter:
 
     @staticmethod
     def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
-        type_class: Optional[
-            Type
-        ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
-            elastic_column_type
+        type_class: Optional[Type] = (
+            ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
+                elastic_column_type
+            )
         )
         if type_class is None:
             logger.warning(
datahub/ingestion/source/fivetran/fivetran.py

@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
             dpi = self._generate_dpi_from_job(job, datajob)
             yield from self._get_dpi_workunits(job, dpi)
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = FivetranSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
datahub/ingestion/source/gc/datahub_gc.py

@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.report_ingestion_stage_start("Expired Token Cleanup")
-                self.revoke_expired_tokens()
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.report_ingestion_stage_start("Truncate Indices")
-                self.truncate_indices()
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start(
-                    "Soft Deleted Entities Cleanup"
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Data Process Cleanup")
-                yield from self.dataprocess_cleanup.get_workunits_internal()
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.report_ingestion_stage_start("Execution request Cleanup")
-                self.execution_request_cleanup.run()
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -296,6 +292,7 @@ class DataHubGcSource(Source):
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
             if tokens == []:
+                # Due to a server bug we cannot rely on just total
                 break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
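
The report_ingestion_stage_start calls above were replaced with "with report.new_stage(...)" blocks, so a stage's duration is finalized when the block exits and the trailing synthetic "End" stage is no longer needed. A toy sketch of the idea, not the actual implementation (see datahub/ingestion/source_report/ingestion_stage.py in the file list):

import time
from contextlib import contextmanager


class StageTimer:
    # Toy stand-in for a report object exposing a new_stage() context manager.
    def __init__(self) -> None:
        self.durations: dict = {}

    @contextmanager
    def new_stage(self, name: str):
        start = time.perf_counter()
        try:
            yield
        finally:
            # Recorded even if the stage body raises, which is why no explicit
            # "End" marker stage is required anymore.
            self.durations[name] = time.perf_counter() - start


report = StageTimer()
with report.new_stage("Expired Token Cleanup"):
    time.sleep(0.01)  # stand-in for revoke_expired_tokens()
print(report.durations)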
datahub/ingestion/source/gc/execution_request_cleanup.py

@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-        30,
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@
                        f"record timestamp: {entry.requested_at}."
                    )
                )
-                self.report.ergc_records_deleted += 1
                yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
             logger.info(f"ergc({self.instance_id}): max runtime reached.")
             return True
         return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@
         )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
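
The cleanup limits above are plain pydantic fields, so they can be set directly when constructing the config (or through the corresponding keys of a datahub-gc recipe). A short sketch using the class and module path shown in the file list; the values are illustrative:

from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=90,      # default raised from 30 to 90 in this release
    limit_entities_delete=10_000,  # new cap on hard deletes per run
    runtime_limit_seconds=3600,    # illustrative runtime budget
)
print(config.dict())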
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
-QUERY_QUERY_ENTITY = """
-query listQueries($input: ScrollAcrossEntitiesInput!) {
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
   scrollAcrossEntities(input: $input) {
     nextScrollId
     count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
        ... on QueryEntity {
          urn
        }
+        ... on DataProcessInstance {
+          urn
+        }
      }
    }
  }
@@ -96,7 +99,8 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-    num_queries_found: int = 0
+    num_calls_made: Dict[str, int] = field(default_factory=dict)
+    num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
     num_soft_deleted_entity_removal_started: int = 0
@@ -151,9 +155,9 @@ class SoftDeletedEntitiesCleanup:
         current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
         self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
         if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
-            self.report.sample_hard_deleted_aspects_by_type[
-                entity_type
-            ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type] = (
+                LossyList()
+            )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
@@ -225,19 +229,33 @@
                 time.sleep(self.config.delay)
         return futures
 
-    def _get_soft_deleted_queries(self) -> Iterable[str]:
+    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
+                if entity_type not in self.report.num_calls_made:
+                    self.report.num_calls_made[entity_type] = 1
+                else:
+                    self.report.num_calls_made[entity_type] += 1
+                self._print_report()
                 result = self.ctx.graph.execute_graphql(
-                    QUERY_QUERY_ENTITY,
+                    graphql_query,
                     {
                         "input": {
-                            "types": ["QUERY"],
+                            "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
-                            "count": self.config.batch_size,
+                            "count": batch_size,
                             "orFilters": [
                                 {
                                     "and": [
@@ -254,15 +272,29 @@
                 )
             except Exception as e:
                 self.report.failure(
-                    f"While trying to get queries with {scroll_id}", exc=e
+                    f"While trying to get {entity_type} with {scroll_id}", exc=e
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities or not scroll_across_entities.get("count"):
+            if not scroll_across_entities:
                 break
+            search_results = scroll_across_entities.get("searchResults")
+            count = scroll_across_entities.get("count")
+            if not count or not search_results:
+                # Due to a server bug we cannot rely on just count as it was returning response like this
+                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
+                break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
-            self.report.num_queries_found += scroll_across_entities.get("count")
-            for query in scroll_across_entities.get("searchResults"):
+            if entity_type not in self.report.num_entities_found:
+                self.report.num_entities_found[entity_type] = 0
+            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
+                "count"
+            )
+            for query in search_results:
                 yield query["entity"]["urn"]
 
     def _get_urns(self) -> Iterable[str]:
@@ -275,7 +307,8 @@
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted_queries()
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
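
For reference, the generalized _get_soft_deleted helper is just a scroll loop over the GraphQL scrollAcrossEntities API. A rough standalone sketch of the same paging pattern, assuming a connected DataHubGraph client; the soft-delete filter is omitted and the query text would be the QUERY_ENTITIES document above:

from typing import Iterable, Optional

from datahub.ingestion.graph.client import DataHubGraph


def scroll_urns(
    graph: DataHubGraph, graphql_query: str, entity_type: str, batch_size: int = 500
) -> Iterable[str]:
    # Page through scrollAcrossEntities until no further results come back.
    scroll_id: Optional[str] = None
    while True:
        result = graph.execute_graphql(
            graphql_query,
            {
                "input": {
                    "types": [entity_type],
                    "query": "*",
                    "scrollId": scroll_id,
                    "count": batch_size,
                    # "orFilters": [...]  # soft-deleted filter omitted in this sketch
                }
            },
        )
        scroll = result.get("scrollAcrossEntities") or {}
        results = scroll.get("searchResults") or []
        if not results:
            break
        for hit in results:
            yield hit["entity"]["urn"]
        scroll_id = scroll.get("nextScrollId")
        if not scroll_id:
            break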
datahub/ingestion/source/gcs/gcs_source.py

@@ -141,8 +141,9 @@ class GCSSource(StatefulIngestionSourceBase):
         source.source_config.platform = PLATFORM_GCS
 
         source.is_s3_platform = lambda: True  # type: ignore
-        source.create_s3_path = lambda bucket_name, key: unquote(f"s3://{bucket_name}/{key}")  # type: ignore
-
+        source.create_s3_path = lambda bucket_name, key: unquote(  # type: ignore
+            f"s3://{bucket_name}/{key}"
+        )
         return source
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: