acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_source.py
@@ -51,13 +51,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import PROFILING
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.schema_classes import ChangeTypeClass, SchemaMetadataClass
+from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()

         # Initialize API operations
@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report

         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,88 @@ class DremioSource(StatefulIngestionSourceBase):

         self.source_map = self._build_source_map()

-        # Process Containers
-        containers = self.dremio_catalog.get_containers()
-        for container in containers:
-            try:
-                yield from self.process_container(container)
-                logger.info(
-                    f"Dremio container {container.container_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_containers_failed += 1  # Increment failed containers
-                self.report.report_failure(
-                    message="Failed to process Dremio container",
-                    context=f"{'.'.join(container.path)}.{container.container_name}",
-                    exc=exc,
-                )
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )

-        # Process Datasets
-        datasets = self.dremio_catalog.get_datasets()
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()

-        for dataset_info in datasets:
-            try:
-                yield from self.process_dataset(dataset_info)
-                logger.info(
-                    f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_datasets_failed += 1  # Increment failed datasets
-                self.report.report_failure(
-                    message="Failed to process Dremio dataset",
-                    context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                    exc=exc,
-                )
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )

-        # Optionally Process Query Lineage
-        if self.config.include_query_lineage:
-            self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()

-        # Generate workunit for aggregated SQL parsing results
-        for mcp in self.sql_parsing_aggregator.gen_metadata():
-            self.report.report_workunit(mcp.as_workunit())
-            yield mcp.as_workunit()
-
-        # Profiling
-        if self.config.is_profiling_enabled():
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(self.generate_profiles, dataset): dataset
-                    for dataset in datasets
-                }
-
-                for future in as_completed(future_to_dataset):
-                    dataset_info = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[
-                            dataset_info.resource_name
-                        ] += 1
-                        self.report.report_failure(
-                            message="Failed to profile dataset",
-                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                            exc=exc,
-                        )
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )

     def process_container(
         self, container_info: DremioContainer
@@ -388,8 +405,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-        with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)

     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -417,11 +433,8 @@ class DremioSource(StatefulIngestionSourceBase):
             ]
         )
         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=dataset_urn,
-            aspectName=lineage.ASPECT_NAME,
             aspect=lineage,
-            changeType=ChangeTypeClass.UPSERT,
         )

        for upstream_urn in upstream_urns:
datahub/ingestion/source/dremio/dremio_sql_queries.py
@@ -1,3 +1,7 @@
+from datetime import datetime, timedelta
+from typing import Optional
+
+
 class DremioSQLQueries:
     QUERY_DATASETS_CE = """
     SELECT* FROM
@@ -235,28 +239,83 @@ class DremioSQLQueries:
             TABLE_NAME ASC
    """

-    # Dremio Documentation: https://docs.dremio.com/current/reference/sql/system-tables/jobs_recent/
-    # queried_datasets incorrectly documented as [varchar]. Observed as varchar.
-    # LENGTH used as opposed to ARRAY_SIZE
-    QUERY_ALL_JOBS = """
-        SELECT
-            job_id,
-            user_name,
-            submitted_ts,
-            query,
-            queried_datasets
-        FROM
-            SYS.JOBS_RECENT
-        WHERE
-            STATUS = 'COMPLETED'
-            AND LENGTH(queried_datasets)>0
-            AND user_name != '$dremio$'
-            AND query_type not like '%INTERNAL%'
-    """
+    @staticmethod
+    def _get_default_start_timestamp_millis() -> str:
+        """Get default start timestamp (1 day ago) in milliseconds precision format"""
+        one_day_ago = datetime.now() - timedelta(days=1)
+        return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
+            :-3
+        ]  # Truncate to milliseconds
+
+    @staticmethod
+    def _get_default_end_timestamp_millis() -> str:
+        """Get default end timestamp (now) in milliseconds precision format"""
+        now = datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]  # Truncate to milliseconds
+
+    @staticmethod
+    def get_query_all_jobs(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
+
+        return f"""
+        SELECT
+            job_id,
+            user_name,
+            submitted_ts,
+            query,
+            queried_datasets
+        FROM
+            SYS.JOBS_RECENT
+        WHERE
+            STATUS = 'COMPLETED'
+            AND LENGTH(queried_datasets)>0
+            AND user_name != '$dremio$'
+            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
+        """
+
+    @staticmethod
+    def get_query_all_jobs_cloud(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs in Dremio Cloud with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()

-    # Dremio Documentation: https://docs.dremio.com/cloud/reference/sql/system-tables/jobs-historical
-    # queried_datasets correctly documented as [varchar]
-    QUERY_ALL_JOBS_CLOUD = """
+        return f"""
         SELECT
             job_id,
             user_name,
@@ -270,6 +329,8 @@ class DremioSQLQueries:
             AND ARRAY_SIZE(queried_datasets)>0
             AND user_name != '$dremio$'
             AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
         """

     QUERY_TYPES = [
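With QUERY_ALL_JOBS and QUERY_ALL_JOBS_CLOUD replaced by the static builders above, callers pass the time window explicitly or fall back to the computed defaults. A minimal usage sketch with illustrative timestamps in the 'YYYY-MM-DD HH:MM:SS.mmm' format described in the docstrings:

    from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries

    # Explicit window; omitting both arguments falls back to the "1 day ago .. now" defaults.
    sql = DremioSQLQueries.get_query_all_jobs(
        start_timestamp_millis="2024-01-01 00:00:00.000",
        end_timestamp_millis="2024-01-02 00:00:00.000",
    )
    print(sql)  # time-filtered SYS.JOBS_RECENT query, ready to run against Dremio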
datahub/ingestion/source/file.py
@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.
datahub/ingestion/source/fivetran/fivetran.py
@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union

 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import DataFlow, DataJob
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.utilities.urns.data_flow_urn import DataFlowUrn
-from datahub.utilities.urns.dataset_urn import DatasetUrn
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity

 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)

     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []

         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
                 )
             )

-        datajob.inlets.extend(input_dataset_urn_list)
-        datajob.outlets.extend(output_dataset_urn_list)
-        datajob.fine_grained_lineages.extend(fine_grained_lineage)
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)

         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-            orchestrator=Constant.ORCHESTRATOR,
-            id=connector.connector_id,
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-            name=connector.connector_name,
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )

@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-            id=connector.connector_id,
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-            name=connector.connector_name,
-            owners={owner_email} if owner_email else set(),
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )

         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-        datajob.properties = {
-            **connector_properties,
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})

         return datajob

     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=datajob,
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
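Taken together, the Fivetran hunks swap the legacy datajob helper classes for the v2 SDK entities. A condensed sketch of the new construction pattern using the constructor arguments and setters visible above; the identifiers are illustrative, and linking the job via flow.urn is an assumption rather than a line from this diff:

    from datahub.metadata.urns import CorpUserUrn, DatasetUrn
    from datahub.sdk.dataflow import DataFlow
    from datahub.sdk.datajob import DataJob

    flow = DataFlow(
        platform="fivetran",                  # Constant.ORCHESTRATOR in the source
        name="connector_id_123",              # illustrative connector id
        env="PROD",
        display_name="My Postgres Connector",
    )
    job = DataJob(
        name="connector_id_123",
        flow_urn=flow.urn,                    # assumed: SDK entities expose their URN via .urn
        display_name="My Postgres Connector",
        owners=[CorpUserUrn("jdoe@example.com")],
    )
    job.set_inlets([DatasetUrn(platform="postgres", name="db.public.src_table", env="PROD")])
    job.set_custom_properties({"destination_id": "dest_456"})  # illustrative properties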
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-        for mcp in dataflow.generate_mcp():
-            yield mcp.as_workunit()
+        yield dataflow

         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-        for mcp in datajob.generate_mcp(materialize_iolets=False):
-            yield mcp.as_workunit()
+        yield datajob

         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
datahub/ingestion/source/gcs/gcs_source.py
@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (
@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):
@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config

@@ -138,7 +147,9 @@ class GCSSource(StatefulIngestionSourceBase):

     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-        s3_source = S3Source(config, PipelineContext(ctx.run_id))
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)

     def s3_source_overrides(self, source: S3Source) -> S3Source: