acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/iceberg/iceberg_common.py

@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional
 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from sortedcontainers import SortedList
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -146,19 +147,40 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         return load_catalog(name=catalog_name, **catalog_config)
 
 
+class TopTableTimings:
+    _VALUE_FIELD: str = "timing"
+    top_entites: SortedList
+    _size: int
+
+    def __init__(self, size: int = 10):
+        self._size = size
+        self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+
+    def add(self, entity: Dict[str, Any]) -> None:
+        if self._VALUE_FIELD not in entity:
+            return
+        self.top_entites.add(entity)
+        if len(self.top_entites) > self._size:
+            self.top_entites.pop()
+
+    def __str__(self) -> str:
+        if len(self.top_entites) == 0:
+            return "no timings reported"
+        return str(list(self.top_entites))
+
+
 class TimingClass:
-    times: List[int]
+    times: SortedList
 
     def __init__(self):
-        self.times = []
+        self.times = SortedList()
 
-    def add_timing(self, t):
-        self.times.append(t)
+    def add_timing(self, t: float) -> None:
+        self.times.add(t)
 
-    def __str__(self):
+    def __str__(self) -> str:
         if len(self.times) == 0:
             return "no timings reported"
-        self.times.sort()
         total = sum(self.times)
         avg = total / len(self.times)
         return str(
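
The TopTableTimings container added above keeps only the N slowest entries: the SortedList is keyed on the negated timing, so the largest timings sort first and, once the list grows past its size, pop() discards the entry with the smallest timing. A minimal sketch of that behaviour, assuming only sortedcontainers (the table names below are made up for illustration):

from sortedcontainers import SortedList

# Keep the three slowest entries; the negated key makes larger timings sort first.
top = SortedList(key=lambda x: -x.get("timing", 0))
for entry in [
    {"table": "db.orders", "timing": 4.2},
    {"table": "db.events", "timing": 12.7},
    {"table": "db.users", "timing": 0.9},
    {"table": "db.payments", "timing": 7.1},
]:
    top.add(entry)
    if len(top) > 3:
        top.pop()  # pop() drops the last element, i.e. the smallest timing retained

print([e["table"] for e in top])  # ['db.events', 'db.payments', 'db.orders']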
@@ -180,6 +202,9 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
+    tables_load_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_profile_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_process_timings: TopTableTimings = field(default_factory=TopTableTimings)
     listed_namespaces: int = 0
     total_listed_tables: int = 0
     tables_listed_per_namespace: TopKDict[str, int] = field(

@@ -201,11 +226,26 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
 
-    def report_table_load_time(self, t: float) -> None:
+    def report_table_load_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.load_table_timings.add_timing(t)
+        self.tables_load_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_processing_time(self, t: float) -> None:
+    def report_table_processing_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.processing_table_timings.add_timing(t)
+        self.tables_process_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_profiling_time(self, t: float) -> None:
+    def report_table_profiling_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.profiling_table_timings.add_timing(t)
+        self.tables_profile_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )

datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -204,7 +204,9 @@ class IcebergProfiler:
             )
             dataset_profile.fieldProfiles.append(column_profile)
         time_taken = timer.elapsed_seconds()
-        self.report.report_table_profiling_time(time_taken)
+        self.report.report_table_profiling_time(
+            time_taken, dataset_name, table.metadata_location
+        )
         LOGGER.debug(
             f"Finished profiling of dataset: {dataset_name} in {time_taken}"
         )

datahub/ingestion/source/identity/azure_ad.py

@@ -354,9 +354,9 @@ class AzureADSource(StatefulIngestionSourceBase):
                 yield MetadataWorkUnit(id=group_status_wu_id, mcp=group_status_mcp)
 
         # Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if (
             self.config.ingest_group_membership
             and len(self.selected_azure_ad_groups) > 0

datahub/ingestion/source/identity/okta.py

@@ -344,9 +344,9 @@ class OktaSource(StatefulIngestionSourceBase):
             ).as_workunit()
 
         # Step 2: Populate GroupMembership Aspects for CorpUsers
-        datahub_corp_user_urn_to_group_membership: Dict[
-            str, GroupMembershipClass
-        ] = defaultdict(lambda: GroupMembershipClass(groups=[]))
+        datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
+            defaultdict(lambda: GroupMembershipClass(groups=[]))
+        )
         if self.config.ingest_group_membership and okta_groups is not None:
             # Fetch membership for each group.
             for okta_group in okta_groups:

datahub/ingestion/source/kafka/kafka.py

@@ -419,10 +419,10 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
         custom_props = self.build_custom_properties(
             topic, topic_detail, extra_topic_config
         )
-        schema_name: Optional[
-            str
-        ] = self.schema_registry_client._get_subject_for_topic(
-            topic, is_key_schema=False
+        schema_name: Optional[str] = (
+            self.schema_registry_client._get_subject_for_topic(
+                topic, is_key_schema=False
+            )
         )
         if schema_name is not None:
             custom_props["Schema Name"] = schema_name

@@ -610,11 +610,13 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
 
     def fetch_topic_configurations(self, topics: List[str]) -> Dict[str, dict]:
         logger.info("Fetching config details for all topics")
-        configs: Dict[
-            ConfigResource, concurrent.futures.Future
-        ] = self.admin_client.describe_configs(
-            resources=[ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics],
-            request_timeout=self.source_config.connection.client_timeout_seconds,
+        configs: Dict[ConfigResource, concurrent.futures.Future] = (
+            self.admin_client.describe_configs(
+                resources=[
+                    ConfigResource(ConfigResource.Type.TOPIC, t) for t in topics
+                ],
+                request_timeout=self.source_config.connection.client_timeout_seconds,
+            )
         )
         logger.debug("Waiting for config details futures to complete")
         concurrent.futures.wait(configs.values())

datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -17,7 +17,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.kafka_connect.common import (
     CONNECTOR_CLASS,

@@ -94,11 +94,6 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         if not jpype.isJVMStarted():
             jpype.startJVM()
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = KafkaConnectSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
         """Get Kafka Connect connectors manifest using REST API.
         Enrich with lineages metadata.

@@ -115,9 +110,8 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest = self._get_connector_manifest(
                 connector_name, connector_url
             )
-            if (
-                connector_manifest is None
-                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            if connector_manifest is None or not self.config.connector_patterns.allowed(
+                connector_manifest.name
             ):
                 self.report.report_dropped(connector_name)
                 continue

datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -199,9 +199,9 @@ class BigQuerySinkConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )
 
         if "defaultDataset" in connector_manifest.config:
             defaultDataset = connector_manifest.config["defaultDataset"]

datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -123,9 +123,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             transforms.append(transform)
             for key in self.connector_manifest.config.keys():
                 if key.startswith(f"transforms.{name}."):
-                    transform[
-                        key.replace(f"transforms.{name}.", "")
-                    ] = self.connector_manifest.config[key]
+                    transform[key.replace(f"transforms.{name}.", "")] = (
+                        self.connector_manifest.config[key]
+                    )
 
         return self.JdbcParser(
             db_connection_url,

datahub/ingestion/source/looker/looker_common.py

@@ -596,9 +596,9 @@ class LookerUtil:
 
     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-            field.count(".") == 1
-        ), f"Error: A field must be prefixed by a view name, field is: {field}"
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]
 
     @staticmethod

@@ -815,9 +815,9 @@ class LookerExplore:
     project_name: Optional[str] = None
     label: Optional[str] = None
    description: Optional[str] = None
-    upstream_views: Optional[
-        List[ProjectInclude]
-    ] = None  # captures the view name(s) this explore is derived from
+    upstream_views: Optional[List[ProjectInclude]] = (
+        None  # captures the view name(s) this explore is derived from
+    )
     upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field(
         default_factory=dict
     )  # view_name is key and file_path is value. A single file may contains multiple views

@@ -889,7 +889,7 @@ class LookerExplore:
                     upstream_views.extend(parsed_explore.upstream_views or [])
                 else:
                     logger.warning(
-                        f'Could not find extended explore {extended_explore} for explore {dict["name"]} in model {model_name}'
+                        f"Could not find extended explore {extended_explore} for explore {dict['name']} in model {model_name}"
                     )
         else:
             # we only fallback to the view_names list if this is not an extended explore

@@ -903,7 +903,7 @@
                 )
                 if not info:
                     logger.warning(
-                        f'Could not resolve view {view_name} for explore {dict["name"]} in model {model_name}'
+                        f"Could not resolve view {view_name} for explore {dict['name']} in model {model_name}"
                     )
                 else:
                     upstream_views.append(

@@ -935,9 +935,9 @@ class LookerExplore:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
-            lkml_fields: List[
-                LookmlModelExploreField
-            ] = explore_field_set_to_lkml_fields(explore)
+            lkml_fields: List[LookmlModelExploreField] = (
+                explore_field_set_to_lkml_fields(explore)
+            )
 
             if explore.view_name is not None and explore.view_name != explore.name:
                 # explore is not named after a view and is instead using a from field, which is modeled as view_name.

@@ -1034,9 +1034,9 @@ class LookerExplore:
                     if measure_field.name is None:
                         continue
                    else:
-                        field_name_vs_raw_explore_field[
-                            measure_field.name
-                        ] = measure_field
+                        field_name_vs_raw_explore_field[measure_field.name] = (
+                            measure_field
+                        )
 
             view_fields.append(
                 ViewField(

@@ -1072,11 +1072,11 @@
             if view_project_map:
                 logger.debug(f"views and their projects: {view_project_map}")
 
-            upstream_views_file_path: Dict[
-                str, Optional[str]
-            ] = create_upstream_views_file_path_map(
-                lkml_fields=lkml_fields,
-                view_names=views,
+            upstream_views_file_path: Dict[str, Optional[str]] = (
+                create_upstream_views_file_path_map(
+                    lkml_fields=lkml_fields,
+                    view_names=views,
+                )
             )
             if upstream_views_file_path:
                 logger.debug(f"views and their file-paths: {upstream_views_file_path}")

datahub/ingestion/source/looker/looker_config.py

@@ -166,9 +166,9 @@ def _get_generic_definition(
     # e.g. spark1 or hive2 or druid_18
     platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0])
 
-    assert (
-        platform is not None
-    ), f"Failed to extract a valid platform from connection {looker_connection}"
+    assert platform is not None, (
+        f"Failed to extract a valid platform from connection {looker_connection}"
+    )
     db = looker_connection.database
     schema = looker_connection.schema  # ok for this to be None
     return platform, db, schema
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
        "For example: \n"
        "deny: \n"
-        " - sales/deprecated \n"
-        "This pattern will deny the ingestion of all dashboards and looks within the sales/deprecated folder. \n"
+        " - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        " - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
        "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
     )
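
The updated description above says folder_path_pattern is matched against the fully qualified folder path. A minimal sketch of those semantics using DataHub's AllowDenyPattern directly; the explicit ".*" suffixes and the nested sub-folder names are illustrative assumptions, not values taken from the diff:

from datahub.configuration.common import AllowDenyPattern

# Sketch: patterns are applied to the fully qualified folder path,
# e.g. "Shared/Customer Reports/Sales" joined with slashes.
folder_path_pattern = AllowDenyPattern(
    allow=["Shared/sales.*"],
    deny=["Shared/deprecated.*"],
)

print(folder_path_pattern.allowed("Shared/sales/EMEA"))            # True
print(folder_path_pattern.allowed("Shared/deprecated/old report"))  # False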
 

datahub/ingestion/source/looker/looker_source.py

@@ -250,9 +250,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
 
     @staticmethod
     def _extract_view_from_field(field: str) -> str:
-        assert (
-            field.count(".") == 1
-        ), f"Error: A field must be prefixed by a view name, field is: {field}"
+        assert field.count(".") == 1, (
+            f"Error: A field must be prefixed by a view name, field is: {field}"
+        )
         return field.split(".")[0]
 
     def _get_views_from_fields(self, fields: List[str]) -> List[str]:

@@ -610,12 +610,12 @@
     def _create_platform_instance_aspect(
         self,
     ) -> DataPlatformInstance:
-        assert (
-            self.source_config.platform_name
-        ), "Platform name is not set in the configuration."
-        assert (
-            self.source_config.platform_instance
-        ), "Platform instance is not set in the configuration."
+        assert self.source_config.platform_name, (
+            "Platform name is not set in the configuration."
+        )
+        assert self.source_config.platform_instance, (
+            "Platform instance is not set in the configuration."
+        )
 
         return DataPlatformInstance(
             platform=builder.make_data_platform_urn(self.source_config.platform_name),

@@ -1016,9 +1016,9 @@
             yield from chart_events
 
             # Step 2: Emit metadata events for the Dashboard itself.
-            chart_urns: Set[
-                str
-            ] = set()  # Collect the unique child chart urns for dashboard input lineage.
+            chart_urns: Set[str] = (
+                set()
+            )  # Collect the unique child chart urns for dashboard input lineage.
             for chart_event in chart_events:
                 chart_event_urn = self._extract_event_urn(chart_event)
                 if chart_event_urn:

@@ -1538,20 +1538,20 @@
                 }
             )
 
-            dashboard_element: Optional[
-                LookerDashboardElement
-            ] = self._get_looker_dashboard_element(
-                DashboardElement(
-                    id=f"looks_{look.id}",  # to avoid conflict with non-standalone looks (element.id prefixes),
-                    # we add the "looks_" prefix to look.id.
-                    title=look.title,
-                    subtitle_text=look.description,
-                    look_id=look.id,
-                    dashboard_id=None,  # As this is an independent look
-                    look=LookWithQuery(
-                        query=query, folder=look.folder, user_id=look.user_id
+            dashboard_element: Optional[LookerDashboardElement] = (
+                self._get_looker_dashboard_element(
+                    DashboardElement(
+                        id=f"looks_{look.id}",  # to avoid conflict with non-standalone looks (element.id prefixes),
+                        # we add the "looks_" prefix to look.id.
+                        title=look.title,
+                        subtitle_text=look.description,
+                        look_id=look.id,
+                        dashboard_id=None,  # As this is an independent look
+                        look=LookWithQuery(
+                            query=query, folder=look.folder, user_id=look.user_id
+                        ),
                     ),
-                ),
+                )
             )
 
             if dashboard_element is not None:

datahub/ingestion/source/looker/looker_template_language.py

@@ -33,9 +33,9 @@ logger = logging.getLogger(__name__)
 
 
 class SpecialVariable:
-    SPECIAL_VARIABLE_PATTERN: ClassVar[
-        str
-    ] = r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+    SPECIAL_VARIABLE_PATTERN: ClassVar[str] = (
+        r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b"
+    )
     liquid_variable: dict
 
     def __init__(self, liquid_variable):

datahub/ingestion/source/looker/looker_usage.py

@@ -257,9 +257,9 @@ class BaseStatGenerator(ABC):
 
         for row in rows:
             logger.debug(row)
-            entity_stat_aspect[
-                self.get_entity_stat_key(row)
-            ] = self.to_entity_timeseries_stat_aspect(row)
+            entity_stat_aspect[self.get_entity_stat_key(row)] = (
+                self.to_entity_timeseries_stat_aspect(row)
+            )
 
         return entity_stat_aspect
 

@@ -385,10 +385,8 @@
         entity_rows: List[Dict] = self._execute_query(
             entity_query_with_filters, "entity_query"
         )
-        entity_usage_stat: Dict[
-            Tuple[str, str], Any
-        ] = self._process_entity_timeseries_rows(
-            entity_rows
+        entity_usage_stat: Dict[Tuple[str, str], Any] = (
+            self._process_entity_timeseries_rows(entity_rows)
         )  # Any type to pass mypy unbound Aspect type error
 
         user_wise_query_with_filters: LookerQuery = self._append_filters(

datahub/ingestion/source/looker/lookml_concept_context.py

@@ -38,16 +38,16 @@ def merge_parent_and_child_fields(
     # Create a map field-name vs field
     child_field_map: dict = {}
     for field in child_fields:
-        assert (
-            NAME in field
-        ), "A lookml view must have a name field"  # name is required field of lookml field array
+        assert NAME in field, (
+            "A lookml view must have a name field"
+        )  # name is required field of lookml field array
 
         child_field_map[field[NAME]] = field
 
     for field in parent_fields:
-        assert (
-            NAME in field
-        ), "A lookml view must have a name field"  # name is required field of lookml field array
+        assert NAME in field, (
+            "A lookml view must have a name field"
+        )  # name is required field of lookml field array
 
         if field[NAME] in child_field_map:
             # Fields defined in the child view take higher precedence.

datahub/ingestion/source/looker/lookml_source.py

@@ -482,14 +482,14 @@ class LookMLSource(StatefulIngestionSourceBase):
         if self.source_config.project_name is not None:
             return self.source_config.project_name
 
-        assert (
-            self.looker_client is not None
-        ), "Failed to find a configured Looker API client"
+        assert self.looker_client is not None, (
+            "Failed to find a configured Looker API client"
+        )
         try:
             model = self.looker_client.lookml_model(model_name, fields="project_name")
-            assert (
-                model.project_name is not None
-            ), f"Failed to find a project name for model {model_name}"
+            assert model.project_name is not None, (
+                f"Failed to find a project name for model {model_name}"
+            )
             return model.project_name
         except SDKError:
             raise ValueError(

@@ -541,9 +541,9 @@
             self.reporter.git_clone_latency = datetime.now() - start_time
             self.source_config.base_folder = checkout_dir.resolve()
 
-            self.base_projects_folder[
-                BASE_PROJECT_NAME
-            ] = self.source_config.base_folder
+            self.base_projects_folder[BASE_PROJECT_NAME] = (
+                self.source_config.base_folder
+            )
 
         visited_projects: Set[str] = set()
 

@@ -641,9 +641,9 @@
                         repo_url=remote_project.url,
                     )
 
-                    self.base_projects_folder[
-                        remote_project.name
-                    ] = p_checkout_dir.resolve()
+                    self.base_projects_folder[remote_project.name] = (
+                        p_checkout_dir.resolve()
+                    )
                     repo = p_cloner.get_last_repo_cloned()
                     assert repo
                     remote_git_info = GitInfo(

@@ -930,9 +930,7 @@
                         logger.warning(
                             f"view {maybe_looker_view.id.view_name} from model {model_name}, connection {model.connection} was previously processed via model {prev_model_name}, connection {prev_model_connection} and will likely lead to incorrect lineage to the underlying tables"
                        )
-                        if (
-                            not self.source_config.emit_reachable_views_only
-                        ):
+                        if not self.source_config.emit_reachable_views_only:
                             logger.warning(
                                 "Consider enabling the `emit_reachable_views_only` flag to handle this case."
                             )

datahub/ingestion/source/looker/view_upstream.py

@@ -484,11 +484,11 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
         )
 
     def __get_upstream_dataset_urn(self) -> List[str]:
-        current_view_id: Optional[
-            LookerViewId
-        ] = self.looker_view_id_cache.get_looker_view_id(
-            view_name=self.view_context.name(),
-            base_folder_path=self.view_context.base_folder_path,
+        current_view_id: Optional[LookerViewId] = (
+            self.looker_view_id_cache.get_looker_view_id(
+                view_name=self.view_context.name(),
+                base_folder_path=self.view_context.base_folder_path,
+            )
         )
 
         # Current view will always be present in cache. assert will silence the lint

datahub/ingestion/source/metabase.py

@@ -23,7 +23,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,

@@ -789,11 +789,6 @@ class MetabaseSource(StatefulIngestionSourceBase):
 
         return platform, dbname, schema, platform_instance
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MetabaseConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),

datahub/ingestion/source/mlflow.py

@@ -172,10 +172,10 @@ class MLflowSource(Source):
         """
         Get all Registered Models in MLflow Model Registry.
         """
-        registered_models: Iterable[
-            RegisteredModel
-        ] = self._traverse_mlflow_search_func(
-            search_func=self.client.search_registered_models,
+        registered_models: Iterable[RegisteredModel] = (
+            self._traverse_mlflow_search_func(
+                search_func=self.client.search_registered_models,
+            )
         )
         return registered_models
 

@@ -333,8 +333,3 @@ class MLflowSource(Source):
             aspect=global_tags,
         )
         return wu
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MLflowConfig.parse_obj(config_dict)
-        return cls(ctx, config)

datahub/ingestion/source/mode.py

@@ -893,11 +893,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 jinja_params[key] = parameters[key].get("default", "")
 
             normalized_query = re.sub(
-                r"{% form %}(.*){% endform %}",
-                "",
-                query,
-                0,
-                re.MULTILINE | re.DOTALL,
+                pattern=r"{% form %}(.*){% endform %}",
+                repl="",
+                string=query,
+                count=0,
+                flags=re.MULTILINE | re.DOTALL,
             )
 
             # Wherever we don't resolve the jinja params, we replace it with NULL
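
The Mode change above only switches the re.sub call to keyword arguments. It is worth calling out because re.sub's fourth positional parameter is count and the fifth is flags, so the keyword form makes each argument's role explicit. A small, self-contained illustration with made-up sample text:

import re

text = "{% form %}ignore me{% endform %} SELECT 1"

# Positionally, the fourth argument is count and the fifth is flags.
positional = re.sub(
    r"{% form %}(.*){% endform %}", "", text, 0, re.MULTILINE | re.DOTALL
)

# The keyword form used in the diff spells out what each argument does.
keyword = re.sub(
    pattern=r"{% form %}(.*){% endform %}",
    repl="",
    string=text,
    count=0,
    flags=re.MULTILINE | re.DOTALL,
)

assert positional == keyword == " SELECT 1"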