acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -178,7 +178,9 @@ class SACSourceReport(StaleEntityRemovalSourceReport):
178
178
  SourceCapability.LINEAGE_COARSE,
179
179
  "Enabled by default (only for Live Data Models)",
180
180
  )
181
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
181
+ @capability(
182
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
183
+ )
182
184
  @capability(
183
185
  SourceCapability.SCHEMA_METADATA,
184
186
  "Enabled by default (only for Import Data Models)",
@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
33
33
  )
34
34
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
35
35
  from datahub.ingestion.api.workunit import MetadataWorkUnit
36
- from datahub.ingestion.source.common.subtypes import DatasetSubTypes
36
+ from datahub.ingestion.source.common.subtypes import (
37
+ DatasetSubTypes,
38
+ SourceCapabilityModifier,
39
+ )
37
40
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
38
41
  StaleEntityRemovalHandler,
39
42
  StaleEntityRemovalSourceReport,
@@ -107,30 +110,33 @@ class SalesforceConfig(
107
110
  auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
108
111
 
109
112
  # Username, Password Auth
110
- username: Optional[str] = Field(description="Salesforce username")
111
- password: Optional[str] = Field(description="Password for Salesforce user")
113
+ username: Optional[str] = Field(None, description="Salesforce username")
114
+ password: Optional[str] = Field(None, description="Password for Salesforce user")
112
115
  consumer_key: Optional[str] = Field(
113
- description="Consumer key for Salesforce JSON web token access"
116
+ None, description="Consumer key for Salesforce JSON web token access"
114
117
  )
115
118
  private_key: Optional[str] = Field(
116
- description="Private key as a string for Salesforce JSON web token access"
119
+ None, description="Private key as a string for Salesforce JSON web token access"
117
120
  )
118
121
  security_token: Optional[str] = Field(
119
- description="Security token for Salesforce username"
122
+ None, description="Security token for Salesforce username"
120
123
  )
121
124
  # client_id, client_secret not required
122
125
 
123
126
  # Direct - Instance URL, Access Token Auth
124
127
  instance_url: Optional[str] = Field(
125
- description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com"
128
+ None,
129
+ description="Salesforce instance url. e.g. https://MyDomainName.my.salesforce.com",
126
130
  )
127
131
  # Flag to indicate whether the instance is production or sandbox
128
132
  is_sandbox: bool = Field(
129
133
  default=False, description="Connect to Sandbox instance of your Salesforce"
130
134
  )
131
- access_token: Optional[str] = Field(description="Access token for instance url")
135
+ access_token: Optional[str] = Field(
136
+ None, description="Access token for instance url"
137
+ )
132
138
 
133
- ingest_tags: Optional[bool] = Field(
139
+ ingest_tags: bool = Field(
134
140
  default=False,
135
141
  description="Ingest Tags from source. This will override Tags entered from UI",
136
142
  )
@@ -144,7 +150,8 @@ class SalesforceConfig(
144
150
  description='Regex patterns for tables/schemas to describe domain_key domain key (domain_key can be any string like "sales".) There can be multiple domain keys specified.',
145
151
  )
146
152
  api_version: Optional[str] = Field(
147
- description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'"
153
+ None,
154
+ description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'",
148
155
  )
149
156
 
150
157
  profiling: SalesforceProfilingConfig = SalesforceProfilingConfig()
@@ -520,7 +527,7 @@ class SalesforceApi:
520
527
 
521
528
  @platform_name("Salesforce")
522
529
  @config_class(SalesforceConfig)
523
- @support_status(SupportStatus.INCUBATING)
530
+ @support_status(SupportStatus.CERTIFIED)
524
531
  @capability(
525
532
  capability_name=SourceCapability.PLATFORM_INSTANCE,
526
533
  description="Can be equivalent to Salesforce organization",
@@ -532,11 +539,11 @@ class SalesforceApi:
532
539
  @capability(
533
540
  capability_name=SourceCapability.DATA_PROFILING,
534
541
  description="Only table level profiling is supported via `profiling.enabled` config field",
542
+ subtype_modifier=[SourceCapabilityModifier.TABLE],
535
543
  )
536
544
  @capability(
537
545
  capability_name=SourceCapability.DELETION_DETECTION,
538
- description="Not supported yet",
539
- supported=False,
546
+ description="Enabled by default via stateful ingestion",
540
547
  )
541
548
  @capability(
542
549
  capability_name=SourceCapability.SCHEMA_METADATA,
@@ -546,6 +553,14 @@ class SalesforceApi:
546
553
  capability_name=SourceCapability.TAGS,
547
554
  description="Enabled by default",
548
555
  )
556
+ @capability(
557
+ capability_name=SourceCapability.LINEAGE_COARSE,
558
+ description="Extract table-level lineage for Salesforce objects",
559
+ subtype_modifier=[
560
+ SourceCapabilityModifier.SALESFORCE_CUSTOM_OBJECT,
561
+ SourceCapabilityModifier.SALESFORCE_STANDARD_OBJECT,
562
+ ],
563
+ )
549
564
  class SalesforceSource(StatefulIngestionSourceBase):
550
565
  def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
551
566
  super().__init__(config, ctx)
@@ -4,7 +4,6 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  import unittest
7
- import urllib.request
8
7
  from dataclasses import dataclass
9
8
  from os.path import basename, dirname
10
9
  from pathlib import Path
@@ -12,6 +11,7 @@ from typing import Any, Iterable, List, Optional, Union
12
11
  from urllib.parse import urlparse
13
12
 
14
13
  import jsonref
14
+ import requests
15
15
  from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
16
16
  from pydantic.fields import Field
17
17
 
@@ -91,19 +91,18 @@ class JsonSchemaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMix
91
91
  )
92
92
 
93
93
  @validator("path")
94
- def download_http_url_to_temp_file(v):
94
+ def download_http_url_to_temp_file(cls, v):
95
95
  if isinstance(v, AnyHttpUrl):
96
96
  try:
97
- with urllib.request.urlopen(v) as response:
98
- schema_dict = json.load(response)
99
- if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
100
- schema_dict["$id"] = str(v)
101
- with tempfile.NamedTemporaryFile(
102
- mode="w", delete=False
103
- ) as tmp_file:
104
- tmp_file.write(json.dumps(schema_dict))
105
- tmp_file.flush()
106
- return tmp_file.name
97
+ response = requests.get(str(v))
98
+ response.raise_for_status()
99
+ schema_dict = response.json()
100
+ if not JsonSchemaTranslator._get_id_from_any_schema(schema_dict):
101
+ schema_dict["$id"] = str(v)
102
+ with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
103
+ tmp_file.write(json.dumps(schema_dict))
104
+ tmp_file.flush()
105
+ return tmp_file.name
107
106
  except Exception as e:
108
107
  logger.error(
109
108
  f"Failed to localize url {v} due to {e}. Run with --debug to get full stacktrace"
@@ -353,7 +352,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
353
352
  if self.config.platform_instance:
354
353
  browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"
355
354
 
356
- if os.path.isdir(self.config.path):
355
+ if isinstance(self.config.path, Path) and self.config.path.is_dir():
357
356
  for root, _, files in os.walk(self.config.path, topdown=False):
358
357
  for file_name in [f for f in files if f.endswith(".json")]:
359
358
  try:
@@ -373,10 +372,11 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
373
372
 
374
373
  else:
375
374
  try:
375
+ assert isinstance(self.config.path, Path)
376
376
  yield from self._load_one_file(
377
377
  ref_loader,
378
378
  browse_prefix=browse_prefix,
379
- root_dir=Path(os.path.dirname(Path(self.config.path))),
379
+ root_dir=self.config.path.parent,
380
380
  file_name=str(self.config.path),
381
381
  )
382
382
  except Exception as e:
@@ -1,4 +1,4 @@
1
- from collections import Counter
1
+ from collections import Counter, defaultdict
2
2
  from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union
3
3
 
4
4
  from typing_extensions import TypedDict
@@ -84,7 +84,7 @@ def is_nullable_collection(
84
84
 
85
85
 
86
86
  def construct_schema(
87
- collection: Sequence[Dict[str, Any]], delimiter: str
87
+ collection: Sequence[Dict[str, Any]], delimiter: str = "."
88
88
  ) -> Dict[Tuple[str, ...], SchemaDescription]:
89
89
  """
90
90
  Construct (infer) a schema from a collection of documents.
@@ -104,9 +104,11 @@ def construct_schema(
104
104
  string to concatenate field names by
105
105
  """
106
106
 
107
- schema: Dict[Tuple[str, ...], BasicSchemaDescription] = {}
107
+ schema: Dict[Tuple[str, ...], BasicSchemaDescription] = defaultdict(
108
+ lambda: {"types": Counter(), "count": 0}
109
+ )
108
110
 
109
- def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> None:
111
+ def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> int:
110
112
  """
111
113
  Recursively update the schema with a document, which may/may not contain nested fields.
112
114
 
@@ -118,18 +120,24 @@ def construct_schema(
118
120
  prefix of fields that the document is under, pass an empty tuple when initializing
119
121
  """
120
122
 
123
+ # we want to make sure that parents of nested structures are included first, before their children, so that
124
+ # they are displayed properly in the UI, also in the event of trimming the list (which happens, for example,
125
+ # in mongodb ingestor)
126
+ max_count = 0
121
127
  for key, value in doc.items():
122
128
  new_parent_prefix = parent_prefix + (key,)
123
129
 
124
130
  # if nested value, look at the types within
125
131
  if isinstance(value, dict):
126
- append_to_schema(value, new_parent_prefix)
132
+ max_count = max(append_to_schema(value, new_parent_prefix), max_count)
127
133
  # if array of values, check what types are within
128
134
  if isinstance(value, list):
129
135
  for item in value:
130
136
  # if dictionary, add it as a nested object
131
137
  if isinstance(item, dict):
132
- append_to_schema(item, new_parent_prefix)
138
+ max_count = max(
139
+ append_to_schema(item, new_parent_prefix), max_count
140
+ )
133
141
 
134
142
  # don't record None values (counted towards nullable)
135
143
  if value is not None:
@@ -143,6 +151,14 @@ def construct_schema(
143
151
  # update the type count
144
152
  schema[new_parent_prefix]["types"].update({type(value): 1})
145
153
  schema[new_parent_prefix]["count"] += 1
154
+ max_count = max(schema[new_parent_prefix]["count"], max_count)
155
+
156
+ if parent_prefix != ():
157
+ schema[parent_prefix]["count"] = max(
158
+ schema[parent_prefix]["count"], max_count
159
+ )
160
+
161
+ return max_count
146
162
 
147
163
  for document in collection:
148
164
  append_to_schema(document, ())
@@ -1,14 +1,16 @@
1
1
  import logging
2
2
  from dataclasses import dataclass, field
3
- from typing import Dict, Optional
3
+ from typing import Dict, List, Optional
4
4
 
5
5
  import pydantic
6
+ from pydantic import BaseModel, Field
6
7
 
7
8
  from datahub.configuration.common import AllowDenyPattern
8
9
  from datahub.configuration.source_common import (
9
10
  EnvConfigMixin,
10
11
  PlatformInstanceConfigMixin,
11
12
  )
13
+ from datahub.ingestion.api.report import EntityFilterReport
12
14
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
13
15
  StaleEntityRemovalSourceReport,
14
16
  StatefulStaleMetadataRemovalConfig,
@@ -52,17 +54,82 @@ class Constant:
52
54
  DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
53
55
 
54
56
 
57
+ class WorkspaceCounts(BaseModel):
58
+ workbooks_count: int = 0
59
+ datasets_count: int = 0
60
+ elements_count: int = 0
61
+ pages_count: int = 0
62
+
63
+ def is_empty(self) -> bool:
64
+ return (
65
+ self.workbooks_count == 0
66
+ and self.datasets_count == 0
67
+ and self.elements_count == 0
68
+ and self.pages_count == 0
69
+ )
70
+
71
+ def as_obj(self) -> dict:
72
+ return {
73
+ "workbooks_count": self.workbooks_count,
74
+ "datasets_count": self.datasets_count,
75
+ "elements_count": self.elements_count,
76
+ "pages_count": self.pages_count,
77
+ }
78
+
79
+
80
+ class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
81
+ type: str = "workspace"
82
+
83
+ workspace_counts: Dict[str, WorkspaceCounts] = Field(
84
+ default_factory=dict,
85
+ description="Counts of workbooks, datasets, elements and pages in each workspace.",
86
+ )
87
+
88
+ def increment_workbooks_count(self, workspace_id: str) -> None:
89
+ if workspace_id not in self.workspace_counts:
90
+ self.workspace_counts[workspace_id] = WorkspaceCounts()
91
+ self.workspace_counts[workspace_id].workbooks_count += 1
92
+
93
+ def increment_datasets_count(self, workspace_id: str) -> None:
94
+ if workspace_id not in self.workspace_counts:
95
+ self.workspace_counts[workspace_id] = WorkspaceCounts()
96
+ self.workspace_counts[workspace_id].datasets_count += 1
97
+
98
+ def increment_elements_count(self, workspace_id: str) -> None:
99
+ if workspace_id not in self.workspace_counts:
100
+ self.workspace_counts[workspace_id] = WorkspaceCounts()
101
+ self.workspace_counts[workspace_id].elements_count += 1
102
+
103
+ def increment_pages_count(self, workspace_id: str) -> None:
104
+ if workspace_id not in self.workspace_counts:
105
+ self.workspace_counts[workspace_id] = WorkspaceCounts()
106
+ self.workspace_counts[workspace_id].pages_count += 1
107
+
108
+ def as_obj(self) -> dict:
109
+ return {
110
+ "filtered": self.dropped_entities.as_obj(),
111
+ "processed": self.processed_entities.as_obj(),
112
+ "workspace_counts": {
113
+ key: item.as_obj() for key, item in self.workspace_counts.items()
114
+ },
115
+ }
116
+
117
+
55
118
  @dataclass
56
119
  class SigmaSourceReport(StaleEntityRemovalSourceReport):
57
- number_of_workspaces: int = 0
120
+ workspaces: SigmaWorkspaceEntityFilterReport = field(
121
+ default_factory=SigmaWorkspaceEntityFilterReport
122
+ )
58
123
  non_accessible_workspaces_count: int = 0
59
- shared_entities_count: int = 0
60
- number_of_datasets: int = 0
61
- number_of_workbooks: int = 0
62
- number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
63
124
 
64
- def report_number_of_workspaces(self, number_of_workspaces: int) -> None:
65
- self.number_of_workspaces = number_of_workspaces
125
+ datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
126
+ datasets_without_workspace: int = 0
127
+
128
+ workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
129
+ workbooks_without_workspace: int = 0
130
+
131
+ number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
132
+ empty_workspaces: List[str] = field(default_factory=list)
66
133
 
67
134
 
68
135
  class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
@@ -1,3 +1,4 @@
1
+ from copy import deepcopy
1
2
  from datetime import datetime
2
3
  from typing import Dict, List, Optional
3
4
 
@@ -23,6 +24,8 @@ class Workspace(BaseModel):
23
24
 
24
25
  @root_validator(pre=True)
25
26
  def update_values(cls, values: Dict) -> Dict:
27
+ # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
28
+ values = deepcopy(values)
26
29
  # Update name if presonal workspace
27
30
  if values["name"] == "User Folder":
28
31
  values["name"] = "My documents"
@@ -30,11 +30,13 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
30
30
  from datahub.ingestion.source.common.subtypes import (
31
31
  BIContainerSubTypes,
32
32
  DatasetSubTypes,
33
+ SourceCapabilityModifier,
33
34
  )
34
35
  from datahub.ingestion.source.sigma.config import (
35
36
  PlatformDetail,
36
37
  SigmaSourceConfig,
37
38
  SigmaSourceReport,
39
+ WorkspaceCounts,
38
40
  )
39
41
  from datahub.ingestion.source.sigma.data_classes import (
40
42
  Element,
@@ -94,7 +96,11 @@ logger = logging.getLogger(__name__)
94
96
  @platform_name("Sigma")
95
97
  @config_class(SigmaSourceConfig)
96
98
  @support_status(SupportStatus.INCUBATING)
97
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
99
+ @capability(
100
+ SourceCapability.CONTAINERS,
101
+ "Enabled by default",
102
+ subtype_modifier=[SourceCapabilityModifier.SIGMA_WORKSPACE],
103
+ )
98
104
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
99
105
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default.")
100
106
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@@ -104,6 +110,7 @@ logger = logging.getLogger(__name__)
104
110
  SourceCapability.OWNERSHIP,
105
111
  "Enabled by default, configured using `ingest_owner`",
106
112
  )
113
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
107
114
  class SigmaSource(StatefulIngestionSourceBase, TestableSource):
108
115
  """
109
116
  This plugin extracts the following:
@@ -162,14 +169,18 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
162
169
 
163
170
  def _get_allowed_workspaces(self) -> List[Workspace]:
164
171
  all_workspaces = self.sigma_api.workspaces.values()
165
- allowed_workspaces = [
166
- workspace
167
- for workspace in all_workspaces
168
- if self.config.workspace_pattern.allowed(workspace.name)
169
- ]
170
172
  logger.info(f"Number of workspaces = {len(all_workspaces)}")
171
- self.reporter.report_number_of_workspaces(len(all_workspaces))
173
+
174
+ allowed_workspaces = []
175
+ for workspace in all_workspaces:
176
+ if self.config.workspace_pattern.allowed(workspace.name):
177
+ allowed_workspaces.append(workspace)
178
+ else:
179
+ self.reporter.workspaces.dropped(
180
+ f"{workspace.name} ({workspace.workspaceId})"
181
+ )
172
182
  logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
183
+
173
184
  return allowed_workspaces
174
185
 
175
186
  def _gen_workspace_workunit(
@@ -280,6 +291,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
280
291
  yield self._gen_dataset_properties(dataset_urn, dataset)
281
292
 
282
293
  if dataset.workspaceId:
294
+ self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
283
295
  yield from add_entity_to_container(
284
296
  container_key=self._gen_workspace_key(dataset.workspaceId),
285
297
  entity_type="dataset",
@@ -463,6 +475,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
463
475
  ).as_workunit()
464
476
 
465
477
  if workbook.workspaceId:
478
+ self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
479
+
466
480
  yield self._gen_entity_browsepath_aspect(
467
481
  entity_urn=chart_urn,
468
482
  parent_entity_urn=builder.make_container_urn(
@@ -520,6 +534,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
520
534
  all_input_fields: List[InputFieldClass] = []
521
535
 
522
536
  if workbook.workspaceId:
537
+ self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
523
538
  yield self._gen_entity_browsepath_aspect(
524
539
  entity_urn=dashboard_urn,
525
540
  parent_entity_urn=builder.make_container_urn(
@@ -609,6 +624,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
609
624
 
610
625
  paths = workbook.path.split("/")[1:]
611
626
  if workbook.workspaceId:
627
+ self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
628
+
612
629
  yield self._gen_entity_browsepath_aspect(
613
630
  entity_urn=dashboard_urn,
614
631
  parent_entity_urn=builder.make_container_urn(
@@ -658,7 +675,19 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
658
675
  yield from self._gen_workbook_workunit(workbook)
659
676
 
660
677
  for workspace in self._get_allowed_workspaces():
678
+ self.reporter.workspaces.processed(
679
+ f"{workspace.name} ({workspace.workspaceId})"
680
+ )
661
681
  yield from self._gen_workspace_workunit(workspace)
682
+ if self.reporter.workspaces.workspace_counts.get(
683
+ workspace.workspaceId, WorkspaceCounts()
684
+ ).is_empty():
685
+ logger.warning(
686
+ f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
687
+ )
688
+ self.reporter.empty_workspaces.append(
689
+ f"{workspace.name} ({workspace.workspaceId})"
690
+ )
662
691
  yield from self._gen_sigma_dataset_upstream_lineage_workunit()
663
692
 
664
693
  def get_report(self) -> SourceReport: