acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -37,9 +37,9 @@ from datahub.ingestion.api.source_helpers import (
37
37
  from datahub.ingestion.api.workunit import MetadataWorkUnit
38
38
  from datahub.ingestion.graph.client import get_default_graph
39
39
  from datahub.ingestion.graph.config import ClientMode
40
- from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
41
- FineGrainedLineageDownstreamType,
42
- FineGrainedLineageUpstreamType,
40
+ from datahub.metadata.schema_classes import (
41
+ FineGrainedLineageDownstreamTypeClass,
42
+ FineGrainedLineageUpstreamTypeClass,
43
43
  )
44
44
 
45
45
  logger = logging.getLogger(__name__)
@@ -49,7 +49,7 @@ class EntityConfig(EnvConfigMixin):
49
49
  name: str
50
50
  type: str
51
51
  platform: str
52
- platform_instance: Optional[str]
52
+ platform_instance: Optional[str] = None
53
53
 
54
54
  @validator("type")
55
55
  def type_must_be_supported(cls, v: str) -> str:
@@ -80,9 +80,9 @@ class FineGrainedLineageConfig(ConfigModel):
80
80
  @validator("upstreamType")
81
81
  def upstream_type_must_be_supported(cls, v: str) -> str:
82
82
  allowed_types = [
83
- FineGrainedLineageUpstreamType.FIELD_SET,
84
- FineGrainedLineageUpstreamType.DATASET,
85
- FineGrainedLineageUpstreamType.NONE,
83
+ FineGrainedLineageUpstreamTypeClass.FIELD_SET,
84
+ FineGrainedLineageUpstreamTypeClass.DATASET,
85
+ FineGrainedLineageUpstreamTypeClass.NONE,
86
86
  ]
87
87
  if v not in allowed_types:
88
88
  raise ValueError(
@@ -93,8 +93,8 @@ class FineGrainedLineageConfig(ConfigModel):
93
93
  @validator("downstreamType")
94
94
  def downstream_type_must_be_supported(cls, v: str) -> str:
95
95
  allowed_types = [
96
- FineGrainedLineageDownstreamType.FIELD_SET,
97
- FineGrainedLineageDownstreamType.FIELD,
96
+ FineGrainedLineageDownstreamTypeClass.FIELD_SET,
97
+ FineGrainedLineageDownstreamTypeClass.FIELD,
98
98
  ]
99
99
  if v not in allowed_types:
100
100
  raise ValueError(
@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
33
33
  )
34
34
  from datahub.ingestion.api.workunit import MetadataWorkUnit
35
35
  from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
36
- from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
36
+ from datahub.ingestion.source.common.subtypes import (
37
+ MLAssetSubTypes,
38
+ SourceCapabilityModifier,
39
+ )
37
40
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
38
41
  StaleEntityRemovalHandler,
39
42
  StaleEntityRemovalSourceReport,
@@ -133,11 +136,18 @@ class MLflowRegisteredModelStageInfo:
133
136
 
134
137
  @platform_name("MLflow")
135
138
  @config_class(MLflowConfig)
136
- @support_status(SupportStatus.TESTING)
139
+ @support_status(SupportStatus.INCUBATING)
137
140
  @capability(
138
141
  SourceCapability.DESCRIPTIONS,
139
142
  "Extract descriptions for MLflow Registered Models and Model Versions",
140
143
  )
144
+ @capability(
145
+ SourceCapability.CONTAINERS,
146
+ "Extract ML experiments",
147
+ subtype_modifier=[
148
+ SourceCapabilityModifier.MLFLOW_EXPERIMENT,
149
+ ],
150
+ )
141
151
  @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
142
152
  class MLflowSource(StatefulIngestionSourceBase):
143
153
  platform = "mlflow"
File without changes
@@ -0,0 +1,533 @@
1
+ import logging
2
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
3
+
4
+ from pydantic import Field
5
+
6
+ from datahub.configuration.common import ConfigModel
7
+ from datahub.emitter.mce_builder import make_dataset_urn
8
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
9
+ from datahub.ingestion.api.common import PipelineContext
10
+ from datahub.ingestion.api.decorators import (
11
+ SupportStatus,
12
+ config_class,
13
+ platform_name,
14
+ support_status,
15
+ )
16
+ from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
17
+ from datahub.ingestion.api.source_helpers import AutoSystemMetadata, auto_workunit
18
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
19
+ from datahub.ingestion.source.common.subtypes import DatasetSubTypes
20
+ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
21
+ DataHubMockDataReport,
22
+ )
23
+ from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
24
+ from datahub.metadata.schema_classes import (
25
+ CalendarIntervalClass,
26
+ DatasetLineageTypeClass,
27
+ DatasetProfileClass,
28
+ DatasetUsageStatisticsClass,
29
+ StatusClass,
30
+ SubTypesClass,
31
+ TimeWindowSizeClass,
32
+ UpstreamClass,
33
+ UpstreamLineageClass,
34
+ )
35
+ from datahub.sdk.entity import Entity
36
+ from datahub.utilities.str_enum import StrEnum
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ PLATFORM_NAME = "fake"
41
+
42
+
43
class SubTypePattern(StrEnum):
    """Strategies the mock source can use to pick a subtype for each generated table."""

    # Cycle through the configured subtype list, indexed by the table's index in its level.
    ALTERNATING = "alternating"
    # Every table gets DatasetSubTypes.TABLE.
    ALL_TABLE = "all_table"
    # Every table gets DatasetSubTypes.VIEW.
    ALL_VIEW = "all_view"
    # Subtype is looked up from a level -> subtype mapping.
    LEVEL_BASED = "level_based"
48
+
49
+
50
class LineageConfigGen1(ConfigModel):
    """
    Configuration for generating mock lineage data for testing purposes.

    This configuration controls how the mock data source generates a hierarchical
    lineage graph with multiple levels of upstream/downstream relationships.

    The lineage graph is structured as follows:
    - Level 0: 1 table (root)
    - Level 1: lineage_fan_out tables (each connected to the root)
    - Level 2+: If lineage_fan_out_after_first_hop is set, every table of the
      previous level gets that many downstream tables; otherwise the level holds
      lineage_fan_out^level tables (lineage_fan_out per table of the previous level)
    - ... and so on for lineage_hops levels

    Examples:
    - With lineage_fan_out=2, lineage_hops=1: Creates 3 tables total
      (1 root + 2 downstream) with 2 lineage relationships
    - With lineage_fan_out=3, lineage_hops=2: Creates 13 tables total
      (1 + 3 + 9) with 12 lineage relationships
    - With lineage_fan_out=4, lineage_hops=1: Creates 5 tables total
      (1 + 4) with 4 lineage relationships
    - With lineage_fan_out=3, lineage_hops=3, lineage_fan_out_after_first_hop=2:
      Creates 1 + 3 + 6 + 12 = 22 tables total (instead of 1 + 3 + 9 + 27 = 40;
      growth is still multiplicative per level, just with a smaller factor)

    Table naming convention: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
    """

    # Off by default: the source only runs this generator when enabled is True.
    enabled: bool = Field(
        default=False,
        description="Whether this source is enabled",
    )

    # Forwarded as the last argument of TableNamingHelper.generate_table_name.
    table_name_prefix: Optional[str] = Field(
        default=None,
        description="Prefix to add to the table name. This is useful for testing purposes.",
    )

    emit_lineage: bool = Field(
        default=True,
        description="Whether to emit lineage data for testing purposes. When False, no lineage data is generated regardless of other settings.",
    )
    emit_usage: bool = Field(
        default=True,
        description="Whether to emit usage data for testing purposes. When False, no usage data is generated regardless of other settings.",
    )

    lineage_fan_out: int = Field(
        default=3,
        description="Number of downstream tables that each upstream table connects to. This controls the 'width' of the lineage graph. Higher values create more parallel downstream tables per level.",
    )

    lineage_hops: int = Field(
        default=2,
        description="Number of hops (levels) in the lineage graph. This controls the 'depth' of the lineage graph. Level 0 is the root table, and each subsequent level contains downstream tables. Higher values create deeper lineage chains.",
    )

    lineage_fan_out_after_first_hop: Optional[int] = Field(
        default=None,
        description="Optional limit on fanout for hops after the first hop. When set, prevents exponential growth by limiting the number of downstream tables per upstream table at levels 2 and beyond. When None, uses the standard exponential growth (lineage_fan_out^level).",
    )

    subtype_pattern: SubTypePattern = Field(
        default=SubTypePattern.ALTERNATING,
        description="Pattern for determining SubTypes. Options: 'alternating', 'all_table', 'all_view', 'level_based'",
    )

    # Only consulted by the 'alternating' pattern, where it is indexed by the
    # table's index within its level modulo the list length.
    subtype_types: List[str] = Field(
        default=["Table", "View"],
        description="List of types to use in alternating pattern. Defaults to ['Table', 'View'].",
    )

    # Only consulted by the 'level_based' pattern; levels missing from the
    # mapping fall back to DatasetSubTypes.TABLE.
    level_subtypes: Dict[int, str] = Field(
        default={0: "Table", 1: "View", 2: "Table"},
        description="Mapping of level to subtype for level_based pattern",
    )
125
+
126
+
127
class DataHubMockDataConfig(ConfigModel):
    """
    Top-level configuration of the mock data source.

    Besides the nested gen_1 lineage generator settings, it exposes knobs that
    make the source raise or add synthetic error/warning/info entries to its
    report, so that report handling can be exercised in tests.
    """

    enabled: bool = Field(
        default=True,
        description="Whether this source is enabled",
    )
    # When True, workunit generation raises a plain Exception before emitting anything.
    throw_uncaught_exceptions: bool = Field(
        default=False,
        description="Whether to throw an uncaught exception for testing",
    )
    # The three counters below control how many synthetic entries of each
    # severity are added to the report.
    num_errors: int = Field(
        default=0,
        description="Number of errors to add in report for testing",
    )
    num_warnings: int = Field(
        default=0,
        description="Number of warnings to add in report for testing",
    )
    num_info: int = Field(
        default=0,
        description="Number of info to add in report for testing",
    )

    # default_factory gives each config instance its own LineageConfigGen1.
    gen_1: LineageConfigGen1 = Field(
        default_factory=LineageConfigGen1,
        description="Configuration for lineage data generation",
    )
153
+
154
+
155
+ @platform_name(PLATFORM_NAME)
156
+ @config_class(DataHubMockDataConfig)
157
+ @support_status(SupportStatus.TESTING)
158
+ class DataHubMockDataSource(Source):
159
+ """
160
+ This source is for generating mock data for testing purposes.
161
+ Expect breaking changes as we iterate on the mock data source.
162
+ """
163
+
164
def __init__(self, ctx: PipelineContext, config: DataHubMockDataConfig) -> None:
    """Keep the pipeline context and config, and start with an empty report."""
    self.ctx = ctx
    self.config = config
    self.report = DataHubMockDataReport()
168
+
169
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """
    Return the source's workunits, post-processed only by the system-metadata stamp.

    The items produced by get_workunits_internal() are normalised with
    auto_workunit and then passed through exactly one workunit processor.

    NOTE(review): presumably this replaces the base Source pipeline so that no
    other default processor adds implicit aspects - confirm against
    Source.get_workunits.
    """
    stamp_only = [AutoSystemMetadata(self.ctx).stamp]
    raw_stream = auto_workunit(self.get_workunits_internal())
    return self._apply_workunit_processors(stamp_only, raw_stream)
174
+
175
def get_workunits_internal(
    self,
) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
    """
    Populate the report with the configured synthetic entries, then emit mock data.

    Steps, in order:
    1. Raise a plain Exception if throw_uncaught_exceptions is set.
    2. Add num_errors failures, num_warnings warnings and num_info info
       entries to the report.
    3. If gen_1 is enabled, yield every workunit from _data_gen_1(), recording
       the first URN seen and reporting each workunit.

    Because this is a generator, nothing (including the exception) happens
    until the result is first iterated.
    """
    cfg = self.config

    if cfg.throw_uncaught_exceptions:
        raise Exception("This is a test exception")

    # range() is empty for non-positive counts, so no explicit guard is needed.
    for idx in range(cfg.num_errors):
        self.report.failure(
            message="This is test error message",
            title="Test Error",
            context=f"This is test error {idx}",
        )

    for idx in range(cfg.num_warnings):
        self.report.warning(
            message="This is test warning",
            title="Test Warning",
            context=f"This is test warning {idx}",
            log_category=StructuredLogCategory.LINEAGE,
        )

    for idx in range(cfg.num_info):
        self.report.info(
            message="This is test info",
            title="Test Info",
            context=f"This is test info {idx}",
        )

    if not cfg.gen_1.enabled:
        return

    for wu in self._data_gen_1():
        if self.report.first_urn_seen is None:
            self.report.first_urn_seen = wu.get_urn()
        self.report.report_workunit(wu)
        yield wu
217
+
218
+ def _calculate_lineage_tables(
219
+ self, fan_out: int, hops: int, fan_out_after_first: Optional[int] = None
220
+ ) -> Tuple[int, List[int]]:
221
+ """
222
+ Calculate the total number of tables and tables at each level for lineage generation.
223
+
224
+ Args:
225
+ fan_out: Number of downstream tables per upstream table at level 1
226
+ hops: Number of hops (levels) in the lineage graph
227
+ fan_out_after_first: Optional limit on fanout for hops after the first hop
228
+
229
+ Returns:
230
+ Tuple of (total_tables, tables_at_levels) where tables_at_levels is a list
231
+ containing the number of tables at each level (index 0 = level 0, etc.)
232
+ """
233
+ tables_to_be_created = 0
234
+ tables_at_levels: List[int] = []
235
+
236
+ for i in range(hops + 1):
237
+ if i == 0:
238
+ # Level 0: always 1 table
239
+ tables_at_level = 1
240
+ elif i == 1:
241
+ # Level 1: uses lineage_fan_out
242
+ tables_at_level = fan_out
243
+ else:
244
+ # Level 2+: use fan_out_after_first_hop if set, otherwise exponential growth
245
+ if fan_out_after_first is not None:
246
+ # Each table at previous level creates fan_out_after_first tables
247
+ tables_at_level = tables_at_levels[i - 1] * fan_out_after_first
248
+ else:
249
+ # Original exponential behavior
250
+ tables_at_level = fan_out**i
251
+
252
+ tables_at_levels.append(tables_at_level)
253
+ tables_to_be_created += tables_at_level
254
+
255
+ return tables_to_be_created, tables_at_levels
256
+
257
+ def _calculate_fanout_for_level(
258
+ self, level: int, fan_out: int, fan_out_after_first: Optional[int] = None
259
+ ) -> int:
260
+ """
261
+ Calculate the fanout (number of downstream tables) for a specific level.
262
+
263
+ Args:
264
+ level: The current level (0-based)
265
+ fan_out: Number of downstream tables per upstream table at level 1
266
+ fan_out_after_first: Optional limit on fanout for hops after the first hop
267
+
268
+ Returns:
269
+ The number of downstream tables that each table at this level should connect to
270
+ """
271
+ if level == 0:
272
+ # Level 0: uses the standard fan_out
273
+ return fan_out
274
+ else:
275
+ # Level 1+: use fan_out_after_first if set, otherwise use fan_out
276
+ return fan_out_after_first if fan_out_after_first is not None else fan_out
277
+
278
def _determine_subtype(
    self,
    table_level: int,
    table_index: int,
    subtype_pattern: SubTypePattern,
    subtype_types: List[str],
    level_subtypes: Dict[int, str],
) -> str:
    """
    Determine subtype based on configured pattern.

    Args:
        table_level: Level of the table in the lineage graph
        table_index: Index of the table within its level
        subtype_pattern: Pattern for determining subtypes
        subtype_types: List of types to use in alternating pattern
        level_subtypes: Mapping of level to subtype for level_based pattern

    Returns:
        The determined subtype. DatasetSubTypes.TABLE is the fallback for a
        level missing from level_subtypes, for an empty subtype_types list
        and for any unrecognised pattern.
    """
    if subtype_pattern == SubTypePattern.ALTERNATING:
        if not subtype_types:
            # An empty list would make the modulo below raise
            # ZeroDivisionError; use the same default as the other branches.
            return DatasetSubTypes.TABLE
        return subtype_types[table_index % len(subtype_types)]

    if subtype_pattern == SubTypePattern.LEVEL_BASED:
        return level_subtypes.get(table_level, DatasetSubTypes.TABLE)

    if subtype_pattern == SubTypePattern.ALL_VIEW:
        return DatasetSubTypes.VIEW

    # ALL_TABLE, and the default for anything unrecognised.
    return DatasetSubTypes.TABLE
309
+
310
def _get_subtypes_aspect(
    self,
    table_name: str,
    table_level: int,
    table_index: int,
    subtype_pattern: SubTypePattern,
    subtype_types: List[str],
    level_subtypes: Dict[int, str],
) -> MetadataWorkUnit:
    """
    Build the SubTypes workunit for one mock table.

    Args:
        table_name: Name of the table
        table_level: Level of the table in the lineage graph
        table_index: Index of the table within its level
        subtype_pattern: Pattern for determining subtypes
        subtype_types: List of types to use in alternating pattern
        level_subtypes: Mapping of level to subtype for level_based pattern

    Returns:
        MetadataWorkUnit carrying a SubTypes aspect with a single type name,
        as chosen by _determine_subtype
    """
    resolved_subtype = self._determine_subtype(
        table_level=table_level,
        table_index=table_index,
        subtype_pattern=subtype_pattern,
        subtype_types=subtype_types,
        level_subtypes=level_subtypes,
    )

    return MetadataChangeProposalWrapper(
        entityUrn=make_dataset_urn(platform=PLATFORM_NAME, name=table_name),
        entityType="dataset",
        aspect=SubTypesClass(typeNames=[resolved_subtype]),
    ).as_workunit()
345
+
346
def _data_gen_1(self) -> Iterable[MetadataWorkUnit]:
    """
    Yield the workunits of every mock table, level by level.

    For each table this emits, in order: status, subtypes and profile
    aspects, then a usage aspect if emit_usage is set, then the lineage
    aspects towards its downstream tables if emit_lineage is set.
    """
    gen_1 = self.config.gen_1
    fan_out = gen_1.lineage_fan_out
    hops = gen_1.lineage_hops
    fan_out_after_first = gen_1.lineage_fan_out_after_first_hop
    prefix = gen_1.table_name_prefix

    logger.info(
        f"Generating lineage data with fan_out={fan_out}, hops={hops}, fan_out_after_first={fan_out_after_first}"
    )

    tables_to_be_created, tables_at_levels = self._calculate_lineage_tables(
        fan_out, hops, fan_out_after_first
    )

    logger.info(f"About to create {tables_to_be_created} datasets mock data")

    # tables_at_levels has one entry per level 0..hops.
    for level, level_size in enumerate(tables_at_levels):
        for index in range(level_size):
            name = TableNamingHelper.generate_table_name(
                hops, fan_out, level, index, prefix
            )

            yield self._get_status_aspect(name)
            yield self._get_subtypes_aspect(
                name,
                level,
                index,
                gen_1.subtype_pattern,
                gen_1.subtype_types,
                gen_1.level_subtypes,
            )
            yield self._get_profile_aspect(name)

            if gen_1.emit_usage:
                yield self._get_usage_aspect(name)

            if gen_1.emit_lineage:
                yield from self._generate_lineage_for_table(
                    table_name=name,
                    table_level=level,
                    table_index=index,
                    hops=hops,
                    fan_out=fan_out,
                    fan_out_after_first=fan_out_after_first,
                    tables_at_levels=tables_at_levels,
                    table_name_prefix=prefix,
                )
398
+
399
def _generate_lineage_for_table(
    self,
    table_name: str,
    table_level: int,
    table_index: int,
    hops: int,
    fan_out: int,
    fan_out_after_first: Optional[int],
    tables_at_levels: List[int],
    table_name_prefix: Optional[str],
) -> Iterable[MetadataWorkUnit]:
    """
    Yield the lineage workunits that link one table to its downstream tables.

    Tables on the last level (table_level == hops) have no downstream level
    and yield nothing.
    """
    if table_level >= hops:
        # No level below this one.
        return

    yield from self._generate_downstream_lineage(
        upstream_table_name=table_name,
        upstream_table_index=table_index,
        upstream_table_level=table_level,
        current_fan_out=self._calculate_fanout_for_level(
            table_level, fan_out, fan_out_after_first
        ),
        hops=hops,
        fan_out=fan_out,
        tables_at_levels=tables_at_levels,
        table_name_prefix=table_name_prefix,
    )
429
+
430
def _generate_downstream_lineage(
    self,
    upstream_table_name: str,
    upstream_table_index: int,
    upstream_table_level: int,
    current_fan_out: int,
    hops: int,
    fan_out: int,
    tables_at_levels: List[int],
    table_name_prefix: Optional[str],
) -> Iterable[MetadataWorkUnit]:
    """
    Yield one upstream-lineage workunit per downstream table of an upstream table.

    The upstream table with index k owns the contiguous block of
    current_fan_out indices starting at k * current_fan_out on the next
    level, clipped to the number of tables that level actually holds.
    """
    child_level = upstream_table_level + 1
    child_level_size = tables_at_levels[child_level]

    first_child = upstream_table_index * current_fan_out
    # Exclusive upper bound; never past the end of the downstream level.
    stop_child = min(first_child + current_fan_out, child_level_size)

    for child_index in range(first_child, stop_child):
        child_name = TableNamingHelper.generate_table_name(
            hops, fan_out, child_level, child_index, table_name_prefix
        )
        yield self._get_upstream_aspect(
            upstream_table=upstream_table_name,
            downstream_table=child_name,
        )
459
+
460
def _get_status_aspect(self, table: str) -> MetadataWorkUnit:
    """Build a workunit with a Status aspect (removed=False) for the given table."""
    return MetadataChangeProposalWrapper(
        entityUrn=make_dataset_urn(platform=PLATFORM_NAME, name=table),
        entityType="dataset",
        aspect=StatusClass(removed=False),
    ).as_workunit()
471
+
472
def _get_upstream_aspect(
    self, upstream_table: str, downstream_table: str
) -> MetadataWorkUnit:
    """
    Build a workunit declaring upstream_table as an upstream of downstream_table.

    The aspect is attached to the downstream dataset and lists exactly one
    upstream, with lineage type TRANSFORMED.
    """
    downstream_urn = make_dataset_urn(platform=PLATFORM_NAME, name=downstream_table)
    upstream_urn = make_dataset_urn(platform=PLATFORM_NAME, name=upstream_table)

    single_upstream = UpstreamClass(
        dataset=upstream_urn,
        type=DatasetLineageTypeClass.TRANSFORMED,
    )
    lineage = UpstreamLineageClass(upstreams=[single_upstream])

    return MetadataChangeProposalWrapper(
        entityUrn=downstream_urn,
        entityType="dataset",
        aspect=lineage,
    ).as_workunit()
494
+
495
def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
    """
    Build a workunit with a fixed DatasetProfile for the given table.

    Every table gets the same constant profile: timestamp 0, 100 rows,
    10 columns and 1000 bytes.
    """
    fixed_profile = DatasetProfileClass(
        timestampMillis=0,
        rowCount=100,
        columnCount=10,
        sizeInBytes=1000,
    )
    return MetadataChangeProposalWrapper(
        entityUrn=make_dataset_urn(platform=PLATFORM_NAME, name=table),
        entityType="dataset",
        aspect=fixed_profile,
    ).as_workunit()
511
+
512
def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
    """
    Build a workunit with an empty DatasetUsageStatistics for the given table.

    The statistics use timestamp 0 and a DAY granularity, with all counts set
    to zero and all lists empty.
    """
    empty_usage = DatasetUsageStatisticsClass(
        timestampMillis=0,
        eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
        uniqueUserCount=0,
        totalSqlQueries=0,
        topSqlQueries=[],
        userCounts=[],
        fieldCounts=[],
    )
    return MetadataChangeProposalWrapper(
        entityUrn=make_dataset_urn(platform=PLATFORM_NAME, name=table),
        entityType="dataset",
        aspect=empty_usage,
    ).as_workunit()
531
+
532
def get_report(self) -> SourceReport:
    """Return the report object this source populates while generating workunits."""
    return self.report
@@ -0,0 +1,12 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from datahub.ingestion.api.source import SourceReport
5
+
6
+
7
@dataclass
class DataHubMockDataReport(SourceReport):
    """SourceReport extended with the first URN seen, for the mock data source."""

    # None until a value is assigned; the field's own metadata carries its description.
    first_urn_seen: Optional[str] = field(
        default=None,
        metadata={"description": "The first URN encountered during ingestion"},
    )