acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,25 @@
1
1
  import dataclasses
2
2
  import logging
3
+ import os
3
4
  import re
4
5
  import time
5
6
  from dataclasses import dataclass
6
7
  from datetime import datetime, timezone
7
8
  from functools import lru_cache
8
9
  from json import JSONDecodeError
9
- from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
10
+ from typing import (
11
+ Dict,
12
+ Iterable,
13
+ Iterator,
14
+ List,
15
+ Optional,
16
+ Set,
17
+ Tuple,
18
+ Union,
19
+ )
10
20
 
11
21
  import dateutil.parser as dp
22
+ import psutil
12
23
  import pydantic
13
24
  import requests
14
25
  import sqlglot
@@ -22,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
22
33
  from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
23
34
 
24
35
  import datahub.emitter.mce_builder as builder
25
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
36
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
26
37
  from datahub.configuration.source_common import (
27
38
  DatasetLineageProviderConfigBase,
28
39
  )
@@ -33,6 +44,7 @@ from datahub.emitter.mcp_builder import (
33
44
  add_dataset_to_container,
34
45
  gen_containers,
35
46
  )
47
+ from datahub.emitter.request_helper import make_curl_command
36
48
  from datahub.ingestion.api.common import PipelineContext
37
49
  from datahub.ingestion.api.decorators import (
38
50
  SourceCapability,
@@ -113,8 +125,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
113
125
  )
114
126
  from datahub.utilities import config_clean
115
127
  from datahub.utilities.lossy_collections import LossyList
128
+ from datahub.utilities.perf_timer import PerfTimer
116
129
 
117
130
  logger: logging.Logger = logging.getLogger(__name__)
131
+ # Default API limit for items returned per API call
132
+ # Used for the default per_page value for paginated API requests
133
+ DEFAULT_API_ITEMS_PER_PAGE = 30
118
134
 
119
135
 
120
136
  class SpaceKey(ContainerKey):
@@ -193,15 +209,37 @@ class ModeConfig(
193
209
  default=True, description="Tag measures and dimensions in the schema"
194
210
  )
195
211
 
212
+ items_per_page: HiddenFromDocs[int] = Field(
213
+ DEFAULT_API_ITEMS_PER_PAGE,
214
+ description="Number of items per page for paginated API requests.",
215
+ )
216
+
217
+ exclude_archived: bool = Field(
218
+ default=False, description="Exclude archived reports"
219
+ )
220
+
196
221
  @validator("connect_uri")
197
222
  def remove_trailing_slash(cls, v):
198
223
  return config_clean.remove_trailing_slashes(v)
199
224
 
225
+ @validator("items_per_page")
226
+ def validate_items_per_page(cls, v):
227
+ if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
228
+ return v
229
+ else:
230
+ raise ValueError(
231
+ f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
232
+ )
233
+
200
234
 
201
235
  class HTTPError429(HTTPError):
202
236
  pass
203
237
 
204
238
 
239
+ class HTTPError504(HTTPError):
240
+ pass
241
+
242
+
205
243
  ModeRequestError = (HTTPError, JSONDecodeError)
206
244
 
207
245
 
@@ -216,6 +254,23 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
216
254
  num_query_template_render: int = 0
217
255
  num_query_template_render_failures: int = 0
218
256
  num_query_template_render_success: int = 0
257
+ num_requests_exceeding_rate_limit: int = 0
258
+ num_requests_retried_on_timeout: int = 0
259
+ num_spaces_retrieved: int = 0
260
+ space_get_api_called: int = 0
261
+ report_get_api_called: int = 0
262
+ dataset_get_api_called: int = 0
263
+ query_get_api_called: int = 0
264
+ chart_get_api_called: int = 0
265
+ get_cache_hits: int = 0
266
+ get_cache_misses: int = 0
267
+ get_cache_size: int = 0
268
+ process_memory_used_mb: float = 0
269
+ space_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
270
+ report_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
271
+ dataset_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
272
+ query_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
273
+ chart_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
219
274
 
220
275
  def report_dropped_space(self, ent_name: str) -> None:
221
276
  self.filtered_spaces.append(ent_name)
@@ -339,7 +394,8 @@ class ModeSource(StatefulIngestionSourceBase):
339
394
 
340
395
  # Test the connection
341
396
  try:
342
- self._get_request_json(f"{self.config.connect_uri}/api/verify")
397
+ key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
398
+ logger.debug(f"Auth info: {key_info}")
343
399
  except ModeRequestError as e:
344
400
  self.report.report_failure(
345
401
  title="Failed to Connect",
@@ -454,9 +510,23 @@ class ModeSource(StatefulIngestionSourceBase):
454
510
  # Datasets
455
511
  datasets = []
456
512
  for imported_dataset_name in report_info.get("imported_datasets", {}):
457
- mode_dataset = self._get_request_json(
458
- f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
459
- )
513
+ try:
514
+ mode_dataset = self._get_request_json(
515
+ f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
516
+ )
517
+ except HTTPError as http_error:
518
+ status_code = http_error.response.status_code
519
+ if status_code == 404:
520
+ self.report.report_warning(
521
+ title="Report Not Found",
522
+ message="Referenced report for reusable dataset was not found.",
523
+ context=f"Report: {report_info.get('id')}, "
524
+ f"Imported Dataset Report: {imported_dataset_name.get('token')}",
525
+ )
526
+ continue
527
+ else:
528
+ raise http_error
529
+
460
530
  dataset_urn = builder.make_dataset_urn_with_platform_instance(
461
531
  self.platform,
462
532
  str(mode_dataset.get("id")),
@@ -560,29 +630,38 @@ class ModeSource(StatefulIngestionSourceBase):
560
630
  space_info = {}
561
631
  try:
562
632
  logger.debug(f"Retrieving spaces for {self.workspace_uri}")
563
- payload = self._get_request_json(f"{self.workspace_uri}/spaces?filter=all")
564
- spaces = payload.get("_embedded", {}).get("spaces", {})
565
- logger.debug(
566
- f"Got {len(spaces)} spaces from workspace {self.workspace_uri}"
567
- )
568
- for s in spaces:
569
- logger.debug(f"Space: {s.get('name')}")
570
- space_name = s.get("name", "")
571
- # Using both restricted and default_access_level because
572
- # there is a current bug with restricted returning False everytime
573
- # which has been reported to Mode team
574
- if self.config.exclude_restricted and (
575
- s.get("restricted") or s.get("default_access_level") == "restricted"
633
+ with self.report.space_get_timer:
634
+ for spaces_page in self._get_paged_request_json(
635
+ f"{self.workspace_uri}/spaces?filter=all",
636
+ "spaces",
637
+ self.config.items_per_page,
576
638
  ):
577
- logging.debug(
578
- f"Skipping space {space_name} due to exclude restricted"
639
+ self.report.space_get_api_called += 1
640
+ logger.debug(
641
+ f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
579
642
  )
580
- continue
581
- if not self.config.space_pattern.allowed(space_name):
582
- self.report.report_dropped_space(space_name)
583
- logging.debug(f"Skipping space {space_name} due to space pattern")
584
- continue
585
- space_info[s.get("token", "")] = s.get("name", "")
643
+ self.report.num_spaces_retrieved += len(spaces_page)
644
+ for s in spaces_page:
645
+ logger.debug(f"Space: {s.get('name')}")
646
+ space_name = s.get("name", "")
647
+ # Using both restricted and default_access_level because
648
+ # there is a current bug with restricted returning False everytime
649
+ # which has been reported to Mode team
650
+ if self.config.exclude_restricted and (
651
+ s.get("restricted")
652
+ or s.get("default_access_level") == "restricted"
653
+ ):
654
+ logging.debug(
655
+ f"Skipping space {space_name} due to exclude restricted"
656
+ )
657
+ continue
658
+ if not self.config.space_pattern.allowed(space_name):
659
+ self.report.report_dropped_space(space_name)
660
+ logging.debug(
661
+ f"Skipping space {space_name} due to space pattern"
662
+ )
663
+ continue
664
+ space_info[s.get("token", "")] = s.get("name", "")
586
665
  except ModeRequestError as e:
587
666
  self.report.report_failure(
588
667
  title="Failed to Retrieve Spaces",
@@ -897,7 +976,7 @@ class ModeSource(StatefulIngestionSourceBase):
897
976
  for match in matches:
898
977
  definition = Template(source=match).render()
899
978
  parameters = yaml.safe_load(definition)
900
- for key in parameters.keys():
979
+ for key in parameters:
901
980
  jinja_params[key] = parameters[key].get("default", "")
902
981
 
903
982
  normalized_query = re.sub(
@@ -1386,125 +1465,217 @@ class ModeSource(StatefulIngestionSourceBase):
1386
1465
  mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
1387
1466
  yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
1388
1467
 
1389
- @lru_cache(maxsize=None)
1390
- def _get_reports(self, space_token: str) -> List[dict]:
1391
- reports = []
1468
+ def _get_reports(self, space_token: str) -> Iterator[List[dict]]:
1392
1469
  try:
1393
- reports_json = self._get_request_json(
1394
- f"{self.workspace_uri}/spaces/{space_token}/reports"
1395
- )
1396
- reports = reports_json.get("_embedded", {}).get("reports", {})
1470
+ with self.report.report_get_timer:
1471
+ for reports_page in self._get_paged_request_json(
1472
+ f"{self.workspace_uri}/spaces/{space_token}/reports?filter=all",
1473
+ "reports",
1474
+ self.config.items_per_page,
1475
+ ):
1476
+ self.report.report_get_api_called += 1
1477
+ logger.debug(
1478
+ f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
1479
+ )
1480
+ if self.config.exclude_archived:
1481
+ logger.debug(
1482
+ f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
1483
+ )
1484
+ reports_page = [
1485
+ report
1486
+ for report in reports_page
1487
+ if not report.get("archived", False)
1488
+ ]
1489
+ yield reports_page
1397
1490
  except ModeRequestError as e:
1398
- self.report.report_failure(
1399
- title="Failed to Retrieve Reports for Space",
1400
- message="Unable to retrieve reports for space token.",
1401
- context=f"Space Token: {space_token}, Error: {str(e)}",
1402
- )
1403
- return reports
1491
+ if isinstance(e, HTTPError) and e.response.status_code == 404:
1492
+ self.report.report_warning(
1493
+ title="No Reports Found in Space",
1494
+ message="No reports were found in the space. It may have been recently deleted.",
1495
+ context=f"Space Token: {space_token}, Error: {str(e)}",
1496
+ )
1497
+ else:
1498
+ self.report.report_failure(
1499
+ title="Failed to Retrieve Reports for Space",
1500
+ message="Unable to retrieve reports for space token.",
1501
+ context=f"Space Token: {space_token}, Error: {str(e)}",
1502
+ )
1404
1503
 
1405
- @lru_cache(maxsize=None)
1406
- def _get_datasets(self, space_token: str) -> List[dict]:
1504
+ def _get_datasets(self, space_token: str) -> Iterator[List[dict]]:
1407
1505
  """
1408
1506
  Retrieves datasets for a given space token.
1409
1507
  """
1410
- datasets = []
1411
1508
  try:
1412
- url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
1413
- datasets_json = self._get_request_json(url)
1414
- datasets = datasets_json.get("_embedded", {}).get("reports", [])
1509
+ with self.report.dataset_get_timer:
1510
+ for dataset_page in self._get_paged_request_json(
1511
+ f"{self.workspace_uri}/spaces/{space_token}/datasets?filter=all",
1512
+ "reports",
1513
+ self.config.items_per_page,
1514
+ ):
1515
+ self.report.dataset_get_api_called += 1
1516
+ logger.debug(
1517
+ f"Read {len(dataset_page)} datasets records from workspace {self.workspace_uri} space {space_token}"
1518
+ )
1519
+ yield dataset_page
1415
1520
  except ModeRequestError as e:
1416
- self.report.report_failure(
1417
- title="Failed to Retrieve Datasets for Space",
1418
- message=f"Unable to retrieve datasets for space token {space_token}.",
1419
- context=f"Error: {str(e)}",
1420
- )
1421
- return datasets
1521
+ if isinstance(e, HTTPError) and e.response.status_code == 404:
1522
+ self.report.report_warning(
1523
+ title="No Datasets Found in Space",
1524
+ message="No datasets were found in the space. It may have been recently deleted.",
1525
+ context=f"Space Token: {space_token}, Error: {str(e)}",
1526
+ )
1527
+ else:
1528
+ self.report.report_failure(
1529
+ title="Failed to Retrieve Datasets for Space",
1530
+ message=f"Unable to retrieve datasets for space token {space_token}.",
1531
+ context=f"Space Token: {space_token}, Error: {str(e)}",
1532
+ )
1422
1533
 
1423
- @lru_cache(maxsize=None)
1424
- def _get_queries(self, report_token: str) -> list:
1425
- queries = []
1534
+ def _get_queries(self, report_token: str) -> List[dict]:
1426
1535
  try:
1427
- queries_json = self._get_request_json(
1428
- f"{self.workspace_uri}/reports/{report_token}/queries"
1429
- )
1430
- queries = queries_json.get("_embedded", {}).get("queries", {})
1536
+ with self.report.query_get_timer:
1537
+ # This endpoint does not handle pagination properly
1538
+ queries = self._get_request_json(
1539
+ f"{self.workspace_uri}/reports/{report_token}/queries"
1540
+ )
1541
+ self.report.query_get_api_called += 1
1542
+ logger.debug(
1543
+ f"Read {len(queries)} queries records from workspace {self.workspace_uri} report {report_token}"
1544
+ )
1545
+ return queries.get("_embedded", {}).get("queries", [])
1431
1546
  except ModeRequestError as e:
1432
- self.report.report_failure(
1433
- title="Failed to Retrieve Queries",
1434
- message="Unable to retrieve queries for report token.",
1435
- context=f"Report Token: {report_token}, Error: {str(e)}",
1436
- )
1437
- return queries
1547
+ if isinstance(e, HTTPError) and e.response.status_code == 404:
1548
+ self.report.report_warning(
1549
+ title="No Queries Found",
1550
+ message="No queries found for the report token. Maybe the report is deleted...",
1551
+ context=f"Report Token: {report_token}, Error: {str(e)}",
1552
+ )
1553
+ else:
1554
+ self.report.report_failure(
1555
+ title="Failed to Retrieve Queries",
1556
+ message="Unable to retrieve queries for report token.",
1557
+ context=f"Report Token: {report_token}, Error: {str(e)}",
1558
+ )
1559
+ return []
1438
1560
 
1439
1561
  @lru_cache(maxsize=None)
1440
- def _get_last_query_run(
1441
- self, report_token: str, report_run_id: str, query_run_id: str
1442
- ) -> Dict:
1562
+ def _get_last_query_run(self, report_token: str, report_run_id: str) -> list:
1563
+ # This function is unused and may be subject to removal in a future revision of this source
1564
+ query_runs = []
1443
1565
  try:
1444
- queries_json = self._get_request_json(
1445
- f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
1446
- )
1447
- queries = queries_json.get("_embedded", {}).get("queries", {})
1566
+ for query_run_page in self._get_paged_request_json(
1567
+ f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs?filter=all",
1568
+ "query_runs",
1569
+ self.config.items_per_page,
1570
+ ):
1571
+ query_runs.extend(query_run_page)
1448
1572
  except ModeRequestError as e:
1449
1573
  self.report.report_failure(
1450
1574
  title="Failed to Retrieve Queries for Report",
1451
1575
  message="Unable to retrieve queries for report token.",
1452
1576
  context=f"Report Token:{report_token}, Error: {str(e)}",
1453
1577
  )
1454
- return {}
1455
- return queries
1578
+ return query_runs
1456
1579
 
1457
- @lru_cache(maxsize=None)
1458
- def _get_charts(self, report_token: str, query_token: str) -> list:
1459
- charts = []
1580
+ def _get_charts(self, report_token: str, query_token: str) -> List[dict]:
1460
1581
  try:
1461
- charts_json = self._get_request_json(
1462
- f"{self.workspace_uri}/reports/{report_token}"
1463
- f"/queries/{query_token}/charts"
1464
- )
1465
- charts = charts_json.get("_embedded", {}).get("charts", {})
1582
+ with self.report.chart_get_timer:
1583
+ # This endpoint does not handle pagination properly
1584
+ charts = self._get_request_json(
1585
+ f"{self.workspace_uri}/reports/{report_token}/queries/{query_token}/charts"
1586
+ )
1587
+ self.report.chart_get_api_called += 1
1588
+ logger.debug(
1589
+ f"Read {len(charts)} charts records from workspace {self.workspace_uri} report {report_token} query {query_token}"
1590
+ )
1591
+ return charts.get("_embedded", {}).get("charts", [])
1466
1592
  except ModeRequestError as e:
1467
- self.report.report_failure(
1468
- title="Failed to Retrieve Charts",
1469
- message="Unable to retrieve charts from Mode.",
1470
- context=f"Report Token: {report_token}, "
1471
- f"Query token: {query_token}, "
1472
- f"Error: {str(e)}",
1473
- )
1474
- return charts
1593
+ if isinstance(e, HTTPError) and e.response.status_code == 404:
1594
+ self.report.report_warning(
1595
+ title="No Charts Found for Query",
1596
+ message="No charts were found for the query. The query may have been recently deleted.",
1597
+ context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
1598
+ )
1599
+ else:
1600
+ self.report.report_failure(
1601
+ title="Failed to Retrieve Charts",
1602
+ message="Unable to retrieve charts from Mode.",
1603
+ context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
1604
+ )
1605
+ return []
1475
1606
 
1607
+ def _get_paged_request_json(
1608
+ self, url: str, key: str, per_page: int
1609
+ ) -> Iterator[List[Dict]]:
1610
+ page: int = 1
1611
+ while True:
1612
+ page_url = f"{url}&per_page={per_page}&page={page}"
1613
+ response = self._get_request_json(page_url)
1614
+ data: List[Dict] = response.get("_embedded", {}).get(key, [])
1615
+ if not data:
1616
+ break
1617
+ yield data
1618
+ page += 1
1619
+
1620
+ @lru_cache(maxsize=None)
1476
1621
  def _get_request_json(self, url: str) -> Dict:
1477
1622
  r = tenacity.Retrying(
1478
1623
  wait=wait_exponential(
1479
1624
  multiplier=self.config.api_options.retry_backoff_multiplier,
1480
1625
  max=self.config.api_options.max_retry_interval,
1481
1626
  ),
1482
- retry=retry_if_exception_type((HTTPError429, ConnectionError)),
1627
+ retry=retry_if_exception_type(
1628
+ (HTTPError429, HTTPError504, ConnectionError)
1629
+ ),
1483
1630
  stop=stop_after_attempt(self.config.api_options.max_attempts),
1484
1631
  )
1485
1632
 
1486
1633
  @r.wraps
1487
1634
  def get_request():
1635
+ curl_command = make_curl_command(self.session, "GET", url, "")
1636
+ logger.debug(f"Issuing request; curl equivalent: {curl_command}")
1637
+
1488
1638
  try:
1489
1639
  response = self.session.get(
1490
1640
  url, timeout=self.config.api_options.timeout
1491
1641
  )
1492
1642
  if response.status_code == 204: # No content, don't parse json
1493
1643
  return {}
1644
+
1645
+ response.raise_for_status()
1494
1646
  return response.json()
1495
1647
  except HTTPError as http_error:
1496
1648
  error_response = http_error.response
1497
1649
  if error_response.status_code == 429:
1650
+ self.report.num_requests_exceeding_rate_limit += 1
1498
1651
  # respect Retry-After
1499
1652
  sleep_time = error_response.headers.get("retry-after")
1500
1653
  if sleep_time is not None:
1501
1654
  time.sleep(float(sleep_time))
1502
1655
  raise HTTPError429 from None
1656
+ elif error_response.status_code == 504:
1657
+ self.report.num_requests_retried_on_timeout += 1
1658
+ time.sleep(0.1)
1659
+ raise HTTPError504 from None
1503
1660
 
1661
+ logger.debug(
1662
+ f"Error response ({error_response.status_code}): {error_response.text}"
1663
+ )
1504
1664
  raise http_error
1505
1665
 
1506
1666
  return get_request()
1507
1667
 
1668
+ @staticmethod
1669
+ def _get_process_memory():
1670
+ process = psutil.Process(os.getpid())
1671
+ mem_info = process.memory_info()
1672
+ return {
1673
+ "rss": mem_info.rss / (1024 * 1024),
1674
+ "vms": mem_info.vms / (1024 * 1024),
1675
+ "shared": getattr(mem_info, "shared", 0) / (1024 * 1024),
1676
+ "data": getattr(mem_info, "data", 0) / (1024 * 1024),
1677
+ }
1678
+
1508
1679
  @staticmethod
1509
1680
  def create_embed_aspect_mcp(
1510
1681
  entity_urn: str, embed_url: str
@@ -1540,115 +1711,116 @@ class ModeSource(StatefulIngestionSourceBase):
1540
1711
  yield from self.construct_space_container(space_token, space_name)
1541
1712
  space_container_key = self.gen_space_key(space_token)
1542
1713
 
1543
- reports = self._get_reports(space_token)
1544
- for report in reports:
1545
- logger.debug(
1546
- f"Report: name: {report.get('name')} token: {report.get('token')}"
1547
- )
1548
- dashboard_tuple_from_report = self.construct_dashboard(
1549
- space_token=space_token, report_info=report
1550
- )
1551
-
1552
- if dashboard_tuple_from_report is None:
1553
- continue
1554
- (
1555
- dashboard_snapshot_from_report,
1556
- browse_mcpw,
1557
- ) = dashboard_tuple_from_report
1714
+ for report_page in self._get_reports(space_token):
1715
+ for report in report_page:
1716
+ logger.debug(
1717
+ f"Report: name: {report.get('name')} token: {report.get('token')}"
1718
+ )
1719
+ dashboard_tuple_from_report = self.construct_dashboard(
1720
+ space_token=space_token, report_info=report
1721
+ )
1558
1722
 
1559
- mce = MetadataChangeEvent(
1560
- proposedSnapshot=dashboard_snapshot_from_report
1561
- )
1723
+ if dashboard_tuple_from_report is None:
1724
+ continue
1725
+ (
1726
+ dashboard_snapshot_from_report,
1727
+ browse_mcpw,
1728
+ ) = dashboard_tuple_from_report
1562
1729
 
1563
- mcpw = MetadataChangeProposalWrapper(
1564
- entityUrn=dashboard_snapshot_from_report.urn,
1565
- aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
1566
- )
1567
- yield mcpw.as_workunit()
1568
- yield from add_dataset_to_container(
1569
- container_key=space_container_key,
1570
- dataset_urn=dashboard_snapshot_from_report.urn,
1571
- )
1572
- yield browse_mcpw.as_workunit()
1730
+ mce = MetadataChangeEvent(
1731
+ proposedSnapshot=dashboard_snapshot_from_report
1732
+ )
1573
1733
 
1574
- usage_statistics = DashboardUsageStatisticsClass(
1575
- timestampMillis=round(datetime.now().timestamp() * 1000),
1576
- viewsCount=report.get("view_count", 0),
1577
- )
1734
+ mcpw = MetadataChangeProposalWrapper(
1735
+ entityUrn=dashboard_snapshot_from_report.urn,
1736
+ aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
1737
+ )
1738
+ yield mcpw.as_workunit()
1739
+ yield from add_dataset_to_container(
1740
+ container_key=space_container_key,
1741
+ dataset_urn=dashboard_snapshot_from_report.urn,
1742
+ )
1743
+ yield browse_mcpw.as_workunit()
1578
1744
 
1579
- yield MetadataChangeProposalWrapper(
1580
- entityUrn=dashboard_snapshot_from_report.urn,
1581
- aspect=usage_statistics,
1582
- ).as_workunit()
1745
+ usage_statistics = DashboardUsageStatisticsClass(
1746
+ timestampMillis=round(datetime.now().timestamp() * 1000),
1747
+ viewsCount=report.get("view_count", 0),
1748
+ )
1583
1749
 
1584
- if self.config.ingest_embed_url is True:
1585
- yield self.create_embed_aspect_mcp(
1586
- entity_urn=dashboard_snapshot_from_report.urn,
1587
- embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
1750
+ yield MetadataChangeProposalWrapper(
1751
+ entityUrn=dashboard_snapshot_from_report.urn,
1752
+ aspect=usage_statistics,
1588
1753
  ).as_workunit()
1589
1754
 
1590
- yield MetadataWorkUnit(id=dashboard_snapshot_from_report.urn, mce=mce)
1755
+ if self.config.ingest_embed_url is True:
1756
+ yield self.create_embed_aspect_mcp(
1757
+ entity_urn=dashboard_snapshot_from_report.urn,
1758
+ embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
1759
+ ).as_workunit()
1760
+
1761
+ yield MetadataWorkUnit(
1762
+ id=dashboard_snapshot_from_report.urn, mce=mce
1763
+ )
1591
1764
 
1592
1765
  def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
1593
1766
  # Space/collection -> report -> query -> Chart
1594
- for space_token in self.space_tokens.keys():
1595
- reports = self._get_reports(space_token)
1596
- for report in reports:
1597
- report_token = report.get("token", "")
1598
-
1599
- queries = self._get_queries(report_token)
1600
- for query in queries:
1601
- query_mcps = self.construct_query_or_dataset(
1602
- report_token,
1603
- query,
1604
- space_token=space_token,
1605
- report_info=report,
1606
- is_mode_dataset=False,
1607
- )
1608
- chart_fields: Dict[str, SchemaFieldClass] = {}
1609
- for wu in query_mcps:
1610
- if isinstance(
1611
- wu.metadata, MetadataChangeProposalWrapper
1612
- ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
1613
- schema_metadata = wu.metadata.aspect
1614
- for field in schema_metadata.fields:
1615
- chart_fields.setdefault(field.fieldPath, field)
1616
-
1617
- yield wu
1618
-
1619
- charts = self._get_charts(report_token, query.get("token", ""))
1620
- # build charts
1621
- for i, chart in enumerate(charts):
1622
- yield from self.construct_chart_from_api_data(
1623
- i,
1624
- chart,
1625
- chart_fields,
1767
+ for space_token in self.space_tokens:
1768
+ for report_page in self._get_reports(space_token):
1769
+ for report in report_page:
1770
+ report_token = report.get("token", "")
1771
+
1772
+ queries = self._get_queries(report_token)
1773
+ for query in queries:
1774
+ query_mcps = self.construct_query_or_dataset(
1775
+ report_token,
1626
1776
  query,
1627
1777
  space_token=space_token,
1628
1778
  report_info=report,
1629
- query_name=query["name"],
1779
+ is_mode_dataset=False,
1630
1780
  )
1781
+ chart_fields: Dict[str, SchemaFieldClass] = {}
1782
+ for wu in query_mcps:
1783
+ if isinstance(
1784
+ wu.metadata, MetadataChangeProposalWrapper
1785
+ ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
1786
+ schema_metadata = wu.metadata.aspect
1787
+ for field in schema_metadata.fields:
1788
+ chart_fields.setdefault(field.fieldPath, field)
1789
+
1790
+ yield wu
1791
+
1792
+ charts = self._get_charts(report_token, query.get("token", ""))
1793
+ # build charts
1794
+ for i, chart in enumerate(charts):
1795
+ yield from self.construct_chart_from_api_data(
1796
+ i,
1797
+ chart,
1798
+ chart_fields,
1799
+ query,
1800
+ space_token=space_token,
1801
+ report_info=report,
1802
+ query_name=query["name"],
1803
+ )
1631
1804
 
1632
1805
  def emit_dataset_mces(self):
1633
1806
  """
1634
1807
  Emits MetadataChangeEvents (MCEs) for datasets within each space.
1635
1808
  """
1636
1809
  for space_token, _ in self.space_tokens.items():
1637
- datasets = self._get_datasets(space_token)
1638
-
1639
- for report in datasets:
1640
- report_token = report.get("token", "")
1641
- queries = self._get_queries(report_token)
1642
- for query in queries:
1643
- query_mcps = self.construct_query_or_dataset(
1644
- report_token,
1645
- query,
1646
- space_token=space_token,
1647
- report_info=report,
1648
- is_mode_dataset=True,
1649
- )
1650
- for wu in query_mcps:
1651
- yield wu
1810
+ for dataset_page in self._get_datasets(space_token):
1811
+ for report in dataset_page:
1812
+ report_token = report.get("token", "")
1813
+ queries = self._get_queries(report_token)
1814
+ for query in queries:
1815
+ query_mcps = self.construct_query_or_dataset(
1816
+ report_token,
1817
+ query,
1818
+ space_token=space_token,
1819
+ report_info=report,
1820
+ is_mode_dataset=True,
1821
+ )
1822
+ for wu in query_mcps:
1823
+ yield wu
1652
1824
 
1653
1825
  @classmethod
1654
1826
  def create(cls, config_dict: dict, ctx: PipelineContext) -> "ModeSource":
@@ -1667,6 +1839,12 @@ class ModeSource(StatefulIngestionSourceBase):
1667
1839
  yield from self.emit_dashboard_mces()
1668
1840
  yield from self.emit_dataset_mces()
1669
1841
  yield from self.emit_chart_mces()
1842
+ cache_info = self._get_request_json.cache_info()
1843
+ self.report.get_cache_hits = cache_info.hits
1844
+ self.report.get_cache_misses = cache_info.misses
1845
+ self.report.get_cache_size = cache_info.currsize
1846
+ memory_used = self._get_process_memory()
1847
+ self.report.process_memory_used_mb = round(memory_used["rss"], 2)
1670
1848
 
1671
1849
  def get_report(self) -> SourceReport:
1672
1850
  return self.report