acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; see the package registry's advisory page for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -4,11 +4,12 @@ from typing import List, Literal, Optional
4
4
  import certifi
5
5
  from pydantic import Field, validator
6
6
 
7
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
7
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
8
8
  from datahub.configuration.source_common import (
9
9
  EnvConfigMixin,
10
10
  PlatformInstanceConfigMixin,
11
11
  )
12
+ from datahub.configuration.time_window_config import BaseTimeWindowConfig
12
13
  from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
13
14
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
14
15
  StatefulStaleMetadataRemovalConfig,
@@ -99,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
99
100
  query_timeout: int = Field(
100
101
  default=300, description="Time before cancelling Dremio profiling query"
101
102
  )
102
- include_field_median_value: bool = Field(
103
+ include_field_median_value: HiddenFromDocs[bool] = Field(
104
+ # Hidden because median causes a number of issues in Dremio.
103
105
  default=False,
104
- hidden_from_docs=True,
105
- description="Median causes a number of issues in Dremio.",
106
106
  )
107
107
 
108
108
 
@@ -118,6 +118,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
118
118
  class DremioSourceConfig(
119
119
  DremioConnectionConfig,
120
120
  StatefulIngestionConfigBase,
121
+ BaseTimeWindowConfig,
121
122
  EnvConfigMixin,
122
123
  PlatformInstanceConfigMixin,
123
124
  ):
@@ -1,22 +1,41 @@
1
- from dataclasses import dataclass
1
+ from dataclasses import dataclass, field
2
2
  from datetime import datetime
3
+ from typing import Optional
3
4
 
4
5
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
5
6
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
6
7
  StaleEntityRemovalSourceReport,
7
8
  )
8
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
9
+ from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
10
+ from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
11
+ from datahub.utilities.stats_collections import (
12
+ TopKDict,
13
+ float_top_k_dict,
14
+ int_top_k_dict,
15
+ )
9
16
 
10
17
 
11
18
  @dataclass
12
19
  class DremioSourceReport(
13
- SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport
20
+ SQLSourceReport,
21
+ StaleEntityRemovalSourceReport,
22
+ BaseTimeWindowReport,
14
23
  ):
15
24
  num_containers_failed: int = 0
16
25
  num_datasets_failed: int = 0
17
26
  containers_scanned: int = 0
18
27
  containers_filtered: int = 0
19
28
 
29
+ api_calls_total: int = 0
30
+ api_calls_by_method_and_path: TopKDict[str, int] = field(
31
+ default_factory=int_top_k_dict
32
+ )
33
+ api_call_secs_by_method_and_path: TopKDict[str, float] = field(
34
+ default_factory=float_top_k_dict
35
+ )
36
+
37
+ sql_aggregator: Optional[SqlAggregatorReport] = None
38
+
20
39
  def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
21
40
  # recording total combined latency is not very useful, keeping this method as a placeholder
22
41
  # for future implementation of min / max / percentiles etc.
@@ -22,6 +22,7 @@ from datahub.ingestion.api.source import (
22
22
  SourceReport,
23
23
  )
24
24
  from datahub.ingestion.api.workunit import MetadataWorkUnit
25
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
25
26
  from datahub.ingestion.source.dremio.dremio_api import (
26
27
  DremioAPIOperations,
27
28
  DremioEdition,
@@ -51,13 +52,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
51
52
  from datahub.ingestion.source.state.stateful_ingestion_base import (
52
53
  StatefulIngestionSourceBase,
53
54
  )
54
- from datahub.ingestion.source_report.ingestion_stage import PROFILING
55
+ from datahub.ingestion.source_report.ingestion_stage import (
56
+ LINEAGE_EXTRACTION,
57
+ METADATA_EXTRACTION,
58
+ IngestionHighStage,
59
+ )
55
60
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
56
61
  DatasetLineageTypeClass,
57
62
  UpstreamClass,
58
63
  UpstreamLineage,
59
64
  )
60
- from datahub.metadata.schema_classes import ChangeTypeClass, SchemaMetadataClass
65
+ from datahub.metadata.schema_classes import SchemaMetadataClass
61
66
  from datahub.metadata.urns import CorpUserUrn
62
67
  from datahub.sql_parsing.sql_parsing_aggregator import (
63
68
  KnownQueryLineageInfo,
@@ -82,13 +87,34 @@ class DremioSourceMapEntry:
82
87
  @platform_name("Dremio")
83
88
  @config_class(DremioSourceConfig)
84
89
  @support_status(SupportStatus.CERTIFIED)
85
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
90
+ @capability(
91
+ SourceCapability.CONTAINERS,
92
+ "Enabled by default",
93
+ subtype_modifier=[
94
+ SourceCapabilityModifier.DREMIO_SPACE,
95
+ SourceCapabilityModifier.DREMIO_SOURCE,
96
+ ],
97
+ )
86
98
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
87
99
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
88
100
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
89
- @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
101
+ @capability(
102
+ SourceCapability.LINEAGE_COARSE,
103
+ "Enabled by default",
104
+ subtype_modifier=[
105
+ SourceCapabilityModifier.TABLE,
106
+ ],
107
+ )
108
+ @capability(
109
+ SourceCapability.LINEAGE_FINE,
110
+ "Extract column-level lineage",
111
+ subtype_modifier=[
112
+ SourceCapabilityModifier.TABLE,
113
+ ],
114
+ )
90
115
  @capability(SourceCapability.OWNERSHIP, "Enabled by default")
91
116
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
117
+ @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
92
118
  class DremioSource(StatefulIngestionSourceBase):
93
119
  """
94
120
  This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +152,13 @@ class DremioSource(StatefulIngestionSourceBase):
126
152
  self.default_db = "dremio"
127
153
  self.config = config
128
154
  self.report = DremioSourceReport()
155
+
156
+ # Set time window for query lineage extraction
157
+ self.report.window_start_time, self.report.window_end_time = (
158
+ self.config.start_time,
159
+ self.config.end_time,
160
+ )
161
+
129
162
  self.source_map: Dict[str, DremioSourceMapEntry] = dict()
130
163
 
131
164
  # Initialize API operations
@@ -154,6 +187,7 @@ class DremioSource(StatefulIngestionSourceBase):
154
187
  generate_operations=True,
155
188
  usage_config=self.config.usage,
156
189
  )
190
+ self.report.sql_aggregator = self.sql_parsing_aggregator.report
157
191
 
158
192
  # For profiling
159
193
  self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +224,88 @@ class DremioSource(StatefulIngestionSourceBase):
190
224
 
191
225
  self.source_map = self._build_source_map()
192
226
 
193
- # Process Containers
194
- containers = self.dremio_catalog.get_containers()
195
- for container in containers:
196
- try:
197
- yield from self.process_container(container)
198
- logger.info(
199
- f"Dremio container {container.container_name} emitted successfully"
200
- )
201
- except Exception as exc:
202
- self.report.num_containers_failed += 1 # Increment failed containers
203
- self.report.report_failure(
204
- message="Failed to process Dremio container",
205
- context=f"{'.'.join(container.path)}.{container.container_name}",
206
- exc=exc,
207
- )
227
+ with self.report.new_stage(METADATA_EXTRACTION):
228
+ # Process Containers
229
+ containers = self.dremio_catalog.get_containers()
230
+ for container in containers:
231
+ try:
232
+ yield from self.process_container(container)
233
+ logger.info(
234
+ f"Dremio container {container.container_name} emitted successfully"
235
+ )
236
+ except Exception as exc:
237
+ self.report.num_containers_failed += 1
238
+ self.report.report_failure(
239
+ message="Failed to process Dremio container",
240
+ context=f"{'.'.join(container.path)}.{container.container_name}",
241
+ exc=exc,
242
+ )
208
243
 
209
- # Process Datasets
210
- datasets = self.dremio_catalog.get_datasets()
244
+ # Process Datasets
245
+ datasets = self.dremio_catalog.get_datasets()
211
246
 
212
- for dataset_info in datasets:
213
- try:
214
- yield from self.process_dataset(dataset_info)
215
- logger.info(
216
- f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
217
- )
218
- except Exception as exc:
219
- self.report.num_datasets_failed += 1 # Increment failed datasets
220
- self.report.report_failure(
221
- message="Failed to process Dremio dataset",
222
- context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
223
- exc=exc,
224
- )
247
+ for dataset_info in datasets:
248
+ try:
249
+ yield from self.process_dataset(dataset_info)
250
+ logger.info(
251
+ f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
252
+ )
253
+ except Exception as exc:
254
+ self.report.num_datasets_failed += 1 # Increment failed datasets
255
+ self.report.report_failure(
256
+ message="Failed to process Dremio dataset",
257
+ context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
258
+ exc=exc,
259
+ )
225
260
 
226
- # Optionally Process Query Lineage
227
- if self.config.include_query_lineage:
228
- self.get_query_lineage_workunits()
229
-
230
- # Process Glossary Terms
231
- glossary_terms = self.dremio_catalog.get_glossary_terms()
232
-
233
- for glossary_term in glossary_terms:
234
- try:
235
- yield from self.process_glossary_term(glossary_term)
236
- except Exception as exc:
237
- self.report.report_failure(
238
- message="Failed to process Glossary terms",
239
- context=f"{glossary_term.glossary_term}",
240
- exc=exc,
241
- )
261
+ # Process Glossary Terms
262
+ glossary_terms = self.dremio_catalog.get_glossary_terms()
242
263
 
243
- # Generate workunit for aggregated SQL parsing results
244
- for mcp in self.sql_parsing_aggregator.gen_metadata():
245
- self.report.report_workunit(mcp.as_workunit())
246
- yield mcp.as_workunit()
247
-
248
- # Profiling
249
- if self.config.is_profiling_enabled():
250
- with ThreadPoolExecutor(
251
- max_workers=self.config.profiling.max_workers
252
- ) as executor:
253
- future_to_dataset = {
254
- executor.submit(self.generate_profiles, dataset): dataset
255
- for dataset in datasets
256
- }
257
-
258
- for future in as_completed(future_to_dataset):
259
- dataset_info = future_to_dataset[future]
260
- try:
261
- yield from future.result()
262
- except Exception as exc:
263
- self.report.profiling_skipped_other[
264
- dataset_info.resource_name
265
- ] += 1
266
- self.report.report_failure(
267
- message="Failed to profile dataset",
268
- context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
269
- exc=exc,
270
- )
264
+ for glossary_term in glossary_terms:
265
+ try:
266
+ yield from self.process_glossary_term(glossary_term)
267
+ except Exception as exc:
268
+ self.report.report_failure(
269
+ message="Failed to process Glossary terms",
270
+ context=f"{glossary_term.glossary_term}",
271
+ exc=exc,
272
+ )
273
+
274
+ # Optionally Process Query Lineage
275
+ if self.config.include_query_lineage:
276
+ with self.report.new_stage(LINEAGE_EXTRACTION):
277
+ self.get_query_lineage_workunits()
278
+
279
+ # Generate workunit for aggregated SQL parsing results
280
+ for mcp in self.sql_parsing_aggregator.gen_metadata():
281
+ yield mcp.as_workunit()
282
+
283
+ # Profiling
284
+ if self.config.is_profiling_enabled():
285
+ with (
286
+ self.report.new_high_stage(IngestionHighStage.PROFILING),
287
+ ThreadPoolExecutor(
288
+ max_workers=self.config.profiling.max_workers
289
+ ) as executor,
290
+ ):
291
+ future_to_dataset = {
292
+ executor.submit(self.generate_profiles, dataset): dataset
293
+ for dataset in datasets
294
+ }
295
+
296
+ for future in as_completed(future_to_dataset):
297
+ dataset_info = future_to_dataset[future]
298
+ try:
299
+ yield from future.result()
300
+ except Exception as exc:
301
+ self.report.profiling_skipped_other[
302
+ dataset_info.resource_name
303
+ ] += 1
304
+ self.report.report_failure(
305
+ message="Failed to profile dataset",
306
+ context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
307
+ exc=exc,
308
+ )
271
309
 
272
310
  def process_container(
273
311
  self, container_info: DremioContainer
@@ -300,10 +338,10 @@ class DremioSource(StatefulIngestionSourceBase):
300
338
  return
301
339
 
302
340
  dataset_urn = make_dataset_urn_with_platform_instance(
303
- platform=make_data_platform_urn(self.get_platform()),
304
- name=f"dremio.{dataset_name}",
305
- env=self.config.env,
341
+ platform=self.get_platform(),
342
+ name=dataset_name,
306
343
  platform_instance=self.config.platform_instance,
344
+ env=self.config.env,
307
345
  )
308
346
 
309
347
  for dremio_mcp in self.dremio_aspects.populate_dataset_mcp(
@@ -383,13 +421,12 @@ class DremioSource(StatefulIngestionSourceBase):
383
421
  schema_str = ".".join(dataset_info.path)
384
422
  dataset_name = f"{schema_str}.{dataset_info.resource_name}".lower()
385
423
  dataset_urn = make_dataset_urn_with_platform_instance(
386
- platform=make_data_platform_urn(self.get_platform()),
387
- name=f"dremio.{dataset_name}",
388
- env=self.config.env,
424
+ platform=self.get_platform(),
425
+ name=dataset_name,
389
426
  platform_instance=self.config.platform_instance,
427
+ env=self.config.env,
390
428
  )
391
- with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
392
- yield from self.profiler.get_workunits(dataset_info, dataset_urn)
429
+ yield from self.profiler.get_workunits(dataset_info, dataset_urn)
393
430
 
394
431
  def generate_view_lineage(
395
432
  self, dataset_urn: str, parents: List[str]
@@ -399,10 +436,10 @@ class DremioSource(StatefulIngestionSourceBase):
399
436
  """
400
437
  upstream_urns = [
401
438
  make_dataset_urn_with_platform_instance(
402
- platform=make_data_platform_urn(self.get_platform()),
403
- name=f"dremio.{upstream_table.lower()}",
404
- env=self.config.env,
439
+ platform=self.get_platform(),
440
+ name=upstream_table.lower(),
405
441
  platform_instance=self.config.platform_instance,
442
+ env=self.config.env,
406
443
  )
407
444
  for upstream_table in parents
408
445
  ]
@@ -417,11 +454,8 @@ class DremioSource(StatefulIngestionSourceBase):
417
454
  ]
418
455
  )
419
456
  mcp = MetadataChangeProposalWrapper(
420
- entityType="dataset",
421
457
  entityUrn=dataset_urn,
422
- aspectName=lineage.ASPECT_NAME,
423
458
  aspect=lineage,
424
- changeType=ChangeTypeClass.UPSERT,
425
459
  )
426
460
 
427
461
  for upstream_urn in upstream_urns:
@@ -464,19 +498,19 @@ class DremioSource(StatefulIngestionSourceBase):
464
498
  if query.query and query.affected_dataset:
465
499
  upstream_urns = [
466
500
  make_dataset_urn_with_platform_instance(
467
- platform=make_data_platform_urn(self.get_platform()),
468
- name=f"dremio.{ds.lower()}",
469
- env=self.config.env,
501
+ platform=self.get_platform(),
502
+ name=ds.lower(),
470
503
  platform_instance=self.config.platform_instance,
504
+ env=self.config.env,
471
505
  )
472
506
  for ds in query.queried_datasets
473
507
  ]
474
508
 
475
509
  downstream_urn = make_dataset_urn_with_platform_instance(
476
- platform=make_data_platform_urn(self.get_platform()),
477
- name=f"dremio.{query.affected_dataset.lower()}",
478
- env=self.config.env,
510
+ platform=self.get_platform(),
511
+ name=query.affected_dataset.lower(),
479
512
  platform_instance=self.config.platform_instance,
513
+ env=self.config.env,
480
514
  )
481
515
 
482
516
  # Add query to SqlParsingAggregator
@@ -1,3 +1,7 @@
1
+ from datetime import datetime, timedelta
2
+ from typing import Optional
3
+
4
+
1
5
  class DremioSQLQueries:
2
6
  QUERY_DATASETS_CE = """
3
7
  SELECT* FROM
@@ -235,28 +239,83 @@ class DremioSQLQueries:
235
239
  TABLE_NAME ASC
236
240
  """
237
241
 
238
- # Dremio Documentation: https://docs.dremio.com/current/reference/sql/system-tables/jobs_recent/
239
- # queried_datasets incorrectly documented as [varchar]. Observed as varchar.
240
- # LENGTH used as opposed to ARRAY_SIZE
241
- QUERY_ALL_JOBS = """
242
- SELECT
243
- job_id,
244
- user_name,
245
- submitted_ts,
246
- query,
247
- queried_datasets
248
- FROM
249
- SYS.JOBS_RECENT
250
- WHERE
251
- STATUS = 'COMPLETED'
252
- AND LENGTH(queried_datasets)>0
253
- AND user_name != '$dremio$'
254
- AND query_type not like '%INTERNAL%'
255
- """
242
@staticmethod
def _get_default_start_timestamp_millis() -> str:
    """Default lineage-window start: 24 hours before now, rendered as
    'YYYY-MM-DD HH:MM:SS.mmm' (millisecond precision).

    NOTE(review): uses naive local time via datetime.now(); confirm that
    Dremio's submitted_ts values are in the same timezone.
    """
    window_start = datetime.now() - timedelta(days=1)
    # %f yields microseconds (6 digits); trimming the last three characters
    # leaves exactly millisecond precision for the SQL TIMESTAMP literal.
    return window_start.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
249
+
250
@staticmethod
def _get_default_end_timestamp_millis() -> str:
    """Default lineage-window end: the current moment, rendered as
    'YYYY-MM-DD HH:MM:SS.mmm' (millisecond precision).

    NOTE(review): naive local time, mirroring the start-of-window helper;
    confirm against Dremio's submitted_ts timezone.
    """
    # Format to microseconds, then drop three digits to keep milliseconds.
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
255
+
256
+ @staticmethod
257
+ def get_query_all_jobs(
258
+ start_timestamp_millis: Optional[str] = None,
259
+ end_timestamp_millis: Optional[str] = None,
260
+ ) -> str:
261
+ """
262
+ Get query for all jobs with optional time filtering.
263
+
264
+ Args:
265
+ start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
266
+ end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
267
+
268
+ Returns:
269
+ SQL query string with time filtering applied
270
+ """
271
+ if start_timestamp_millis is None:
272
+ start_timestamp_millis = (
273
+ DremioSQLQueries._get_default_start_timestamp_millis()
274
+ )
275
+ if end_timestamp_millis is None:
276
+ end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
277
+
278
+ return f"""
279
+ SELECT
280
+ job_id,
281
+ user_name,
282
+ submitted_ts,
283
+ query,
284
+ queried_datasets
285
+ FROM
286
+ SYS.JOBS_RECENT
287
+ WHERE
288
+ STATUS = 'COMPLETED'
289
+ AND LENGTH(queried_datasets)>0
290
+ AND user_name != '$dremio$'
291
+ AND query_type not like '%INTERNAL%'
292
+ AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
293
+ AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
294
+ """
295
+
296
+ @staticmethod
297
+ def get_query_all_jobs_cloud(
298
+ start_timestamp_millis: Optional[str] = None,
299
+ end_timestamp_millis: Optional[str] = None,
300
+ ) -> str:
301
+ """
302
+ Get query for all jobs in Dremio Cloud with optional time filtering.
303
+
304
+ Args:
305
+ start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
306
+ end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
307
+
308
+ Returns:
309
+ SQL query string with time filtering applied
310
+ """
311
+ if start_timestamp_millis is None:
312
+ start_timestamp_millis = (
313
+ DremioSQLQueries._get_default_start_timestamp_millis()
314
+ )
315
+ if end_timestamp_millis is None:
316
+ end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
256
317
 
257
- # Dremio Documentation: https://docs.dremio.com/cloud/reference/sql/system-tables/jobs-historical
258
- # queried_datasets correctly documented as [varchar]
259
- QUERY_ALL_JOBS_CLOUD = """
318
+ return f"""
260
319
  SELECT
261
320
  job_id,
262
321
  user_name,
@@ -270,6 +329,8 @@ class DremioSQLQueries:
270
329
  AND ARRAY_SIZE(queried_datasets)>0
271
330
  AND user_name != '$dremio$'
272
331
  AND query_type not like '%INTERNAL%'
332
+ AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
333
+ AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
273
334
  """
274
335
 
275
336
  QUERY_TYPES = [
@@ -12,7 +12,7 @@ from typing import (
12
12
  Union,
13
13
  )
14
14
 
15
- from pydantic.fields import Field
15
+ from pydantic import Field, PositiveInt
16
16
 
17
17
  from datahub.configuration.common import AllowDenyPattern
18
18
  from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -73,7 +73,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
73
73
 
74
74
  MAX_ITEMS_TO_RETRIEVE = 100
75
75
  PAGE_SIZE = 100
76
- MAX_SCHEMA_SIZE = 300
77
76
  MAX_PRIMARY_KEYS_SIZE = 100
78
77
  FIELD_DELIMITER = "."
79
78
 
@@ -107,6 +106,10 @@ class DynamoDBConfig(
107
106
  'Refer "Advanced Configurations" section for more details',
108
107
  )
109
108
 
109
+ max_schema_size: PositiveInt = Field(
110
+ default=300, description="Maximum number of fields to include in the schema."
111
+ )
112
+
110
113
  table_pattern: AllowDenyPattern = Field(
111
114
  default=AllowDenyPattern.allow_all(),
112
115
  description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'",
@@ -160,7 +163,7 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
160
163
 
161
164
  @platform_name("DynamoDB", id="dynamodb")
162
165
  @config_class(DynamoDBConfig)
163
- @support_status(SupportStatus.TESTING)
166
+ @support_status(SupportStatus.INCUBATING)
164
167
  @capability(
165
168
  SourceCapability.PLATFORM_INSTANCE,
166
169
  "By default, platform_instance will use the AWS account id",
@@ -455,25 +458,25 @@ class DynamoDBSource(StatefulIngestionSourceBase):
455
458
  ) -> SchemaMetadataClass:
456
459
  """ "
457
460
  To construct the schema metadata, it will first sort the schema by the occurrence of attribute names
458
- in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the
461
+ in descending order and truncate the schema by max_schema_size, and then start to construct the
459
462
  schema metadata sorted by attribute name
460
463
  """
461
464
 
462
465
  canonical_schema: List[SchemaField] = []
463
466
  schema_size = len(schema.values())
464
467
  table_fields = list(schema.values())
465
- if schema_size > MAX_SCHEMA_SIZE:
468
+ if schema_size > self.config.max_schema_size:
466
469
  # downsample the schema, using frequency as the sort key
467
470
  self.report.report_warning(
468
471
  title="Schema Size Too Large",
469
- message=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}",
472
+ message=f"Downsampling the table schema because `max_schema_size` threshold is {self.config.max_schema_size}",
470
473
  context=f"Collection: {dataset_urn}",
471
474
  )
472
475
 
473
476
  # Add this information to the custom properties so user can know they are looking at down sampled schema
474
477
  dataset_properties.customProperties["schema.downsampled"] = "True"
475
478
  dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
476
- # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
479
+ # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include max_schema_size items
477
480
  primary_keys = []
478
481
  for schema_field in sorted(
479
482
  table_fields,
@@ -481,7 +484,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
481
484
  -x["count"],
482
485
  x["delimited_name"],
483
486
  ), # Negate `count` for descending order, `delimited_name` stays the same for ascending
484
- )[0:MAX_SCHEMA_SIZE]:
487
+ )[: self.config.max_schema_size]:
485
488
  field_path = schema_field["delimited_name"]
486
489
  native_data_type = self.get_native_type(schema_field["type"], table_name)
487
490
  type = self.get_field_type(schema_field["type"], table_name)
File without changes