acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (226) hide show
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,401 @@
1
+ {
2
+ "entities": {
3
+ "dataJob": {
4
+ "dataJobInputOutput": {
5
+ "aspect": "dataJobInputOutput",
6
+ "fields": [
7
+ {
8
+ "name": "inputDatasets",
9
+ "path": "inputDatasets",
10
+ "isLineage": true,
11
+ "relationship": {
12
+ "name": "Consumes",
13
+ "entityTypes": [
14
+ "dataset"
15
+ ],
16
+ "isLineage": true
17
+ }
18
+ },
19
+ {
20
+ "name": "inputDatasetEdges",
21
+ "path": "inputDatasetEdges",
22
+ "isLineage": true,
23
+ "relationship": {
24
+ "name": "Consumes",
25
+ "entityTypes": [
26
+ "dataset"
27
+ ],
28
+ "isLineage": true
29
+ }
30
+ },
31
+ {
32
+ "name": "outputDatasets",
33
+ "path": "outputDatasets",
34
+ "isLineage": true,
35
+ "relationship": {
36
+ "name": "Produces",
37
+ "entityTypes": [
38
+ "dataset"
39
+ ],
40
+ "isLineage": true
41
+ }
42
+ },
43
+ {
44
+ "name": "outputDatasetEdges",
45
+ "path": "outputDatasetEdges",
46
+ "isLineage": true,
47
+ "relationship": {
48
+ "name": "Produces",
49
+ "entityTypes": [
50
+ "dataset"
51
+ ],
52
+ "isLineage": true
53
+ }
54
+ },
55
+ {
56
+ "name": "inputDatajobs",
57
+ "path": "inputDatajobs",
58
+ "isLineage": true,
59
+ "relationship": {
60
+ "name": "DownstreamOf",
61
+ "entityTypes": [
62
+ "dataJob"
63
+ ],
64
+ "isLineage": true
65
+ }
66
+ },
67
+ {
68
+ "name": "inputDatajobEdges",
69
+ "path": "inputDatajobEdges",
70
+ "isLineage": true,
71
+ "relationship": {
72
+ "name": "DownstreamOf",
73
+ "entityTypes": [
74
+ "dataJob"
75
+ ],
76
+ "isLineage": true
77
+ }
78
+ }
79
+ ]
80
+ }
81
+ },
82
+ "dataProcessInstance": {
83
+ "dataProcessInstanceOutput": {
84
+ "aspect": "dataProcessInstanceOutput",
85
+ "fields": [
86
+ {
87
+ "name": "outputEdges",
88
+ "path": "outputEdges",
89
+ "isLineage": true,
90
+ "relationship": {
91
+ "name": "DataProcessInstanceProduces",
92
+ "entityTypes": [
93
+ "dataset",
94
+ "mlModel",
95
+ "dataProcessInstance"
96
+ ],
97
+ "isLineage": true
98
+ }
99
+ }
100
+ ]
101
+ },
102
+ "dataProcessInstanceInput": {
103
+ "aspect": "dataProcessInstanceInput",
104
+ "fields": [
105
+ {
106
+ "name": "inputEdges",
107
+ "path": "inputEdges",
108
+ "isLineage": true,
109
+ "relationship": {
110
+ "name": "DataProcessInstanceConsumes",
111
+ "entityTypes": [
112
+ "dataset",
113
+ "mlModel",
114
+ "dataProcessInstance"
115
+ ],
116
+ "isLineage": true
117
+ }
118
+ }
119
+ ]
120
+ }
121
+ },
122
+ "dataProcess": {
123
+ "dataProcessInfo": {
124
+ "aspect": "dataProcessInfo",
125
+ "fields": [
126
+ {
127
+ "name": "inputs",
128
+ "path": "inputs",
129
+ "isLineage": true,
130
+ "relationship": {
131
+ "name": "Consumes",
132
+ "entityTypes": [
133
+ "dataset"
134
+ ],
135
+ "isLineage": true
136
+ }
137
+ },
138
+ {
139
+ "name": "outputs",
140
+ "path": "outputs",
141
+ "isLineage": true,
142
+ "relationship": {
143
+ "name": "Consumes",
144
+ "entityTypes": [
145
+ "dataset"
146
+ ],
147
+ "isLineage": true
148
+ }
149
+ }
150
+ ]
151
+ }
152
+ },
153
+ "dataset": {
154
+ "upstreamLineage": {
155
+ "aspect": "upstreamLineage",
156
+ "fields": [
157
+ {
158
+ "name": "dataset",
159
+ "path": "upstreams.dataset",
160
+ "isLineage": true,
161
+ "relationship": {
162
+ "name": "DownstreamOf",
163
+ "entityTypes": [
164
+ "dataset"
165
+ ],
166
+ "isLineage": true
167
+ }
168
+ }
169
+ ]
170
+ }
171
+ },
172
+ "chart": {
173
+ "chartInfo": {
174
+ "aspect": "chartInfo",
175
+ "fields": [
176
+ {
177
+ "name": "inputs",
178
+ "path": "inputs",
179
+ "isLineage": true,
180
+ "relationship": {
181
+ "name": "Consumes",
182
+ "entityTypes": [
183
+ "dataset"
184
+ ],
185
+ "isLineage": true
186
+ }
187
+ },
188
+ {
189
+ "name": "inputEdges",
190
+ "path": "inputEdges",
191
+ "isLineage": true,
192
+ "relationship": {
193
+ "name": "Consumes",
194
+ "entityTypes": [
195
+ "dataset"
196
+ ],
197
+ "isLineage": true
198
+ }
199
+ }
200
+ ]
201
+ }
202
+ },
203
+ "dashboard": {
204
+ "dashboardInfo": {
205
+ "aspect": "dashboardInfo",
206
+ "fields": [
207
+ {
208
+ "name": "charts",
209
+ "path": "charts",
210
+ "isLineage": true,
211
+ "relationship": {
212
+ "name": "Contains",
213
+ "entityTypes": [
214
+ "chart"
215
+ ],
216
+ "isLineage": true
217
+ }
218
+ },
219
+ {
220
+ "name": "chartEdges",
221
+ "path": "chartEdges",
222
+ "isLineage": true,
223
+ "relationship": {
224
+ "name": "Contains",
225
+ "entityTypes": [
226
+ "chart"
227
+ ],
228
+ "isLineage": true
229
+ }
230
+ },
231
+ {
232
+ "name": "datasets",
233
+ "path": "datasets",
234
+ "isLineage": true,
235
+ "relationship": {
236
+ "name": "Consumes",
237
+ "entityTypes": [
238
+ "dataset"
239
+ ],
240
+ "isLineage": true
241
+ }
242
+ },
243
+ {
244
+ "name": "datasetEdges",
245
+ "path": "datasetEdges",
246
+ "isLineage": true,
247
+ "relationship": {
248
+ "name": "Consumes",
249
+ "entityTypes": [
250
+ "dataset"
251
+ ],
252
+ "isLineage": true
253
+ }
254
+ },
255
+ {
256
+ "name": "dashboards",
257
+ "path": "dashboards",
258
+ "isLineage": true,
259
+ "relationship": {
260
+ "name": "DashboardContainsDashboard",
261
+ "entityTypes": [
262
+ "dashboard"
263
+ ],
264
+ "isLineage": true
265
+ }
266
+ }
267
+ ]
268
+ }
269
+ },
270
+ "mlModelGroup": {
271
+ "mlModelGroupProperties": {
272
+ "aspect": "mlModelGroupProperties",
273
+ "fields": [
274
+ {
275
+ "name": "trainingJobs",
276
+ "path": "trainingJobs",
277
+ "isLineage": true,
278
+ "relationship": {
279
+ "name": "TrainedBy",
280
+ "entityTypes": [
281
+ "dataJob",
282
+ "dataProcessInstance"
283
+ ],
284
+ "isLineage": true
285
+ }
286
+ },
287
+ {
288
+ "name": "downstreamJobs",
289
+ "path": "downstreamJobs",
290
+ "isLineage": true,
291
+ "relationship": {
292
+ "name": "UsedBy",
293
+ "entityTypes": [
294
+ "dataJob",
295
+ "dataProcessInstance"
296
+ ],
297
+ "isLineage": true
298
+ }
299
+ }
300
+ ]
301
+ }
302
+ },
303
+ "mlFeature": {
304
+ "mlFeatureProperties": {
305
+ "aspect": "mlFeatureProperties",
306
+ "fields": [
307
+ {
308
+ "name": "sources",
309
+ "path": "sources",
310
+ "isLineage": true,
311
+ "relationship": {
312
+ "name": "DerivedFrom",
313
+ "entityTypes": [
314
+ "dataset"
315
+ ],
316
+ "isLineage": true
317
+ }
318
+ }
319
+ ]
320
+ }
321
+ },
322
+ "mlPrimaryKey": {
323
+ "mlPrimaryKeyProperties": {
324
+ "aspect": "mlPrimaryKeyProperties",
325
+ "fields": [
326
+ {
327
+ "name": "sources",
328
+ "path": "sources",
329
+ "isLineage": true,
330
+ "relationship": {
331
+ "name": "DerivedFrom",
332
+ "entityTypes": [
333
+ "dataset"
334
+ ],
335
+ "isLineage": true
336
+ }
337
+ }
338
+ ]
339
+ }
340
+ },
341
+ "mlModel": {
342
+ "mlModelProperties": {
343
+ "aspect": "mlModelProperties",
344
+ "fields": [
345
+ {
346
+ "name": "trainingJobs",
347
+ "path": "trainingJobs",
348
+ "isLineage": true,
349
+ "relationship": {
350
+ "name": "TrainedBy",
351
+ "entityTypes": [
352
+ "dataJob",
353
+ "dataProcessInstance"
354
+ ],
355
+ "isLineage": true
356
+ }
357
+ },
358
+ {
359
+ "name": "downstreamJobs",
360
+ "path": "downstreamJobs",
361
+ "isLineage": true,
362
+ "relationship": {
363
+ "name": "UsedBy",
364
+ "entityTypes": [
365
+ "dataJob",
366
+ "dataProcessInstance"
367
+ ],
368
+ "isLineage": true
369
+ }
370
+ },
371
+ {
372
+ "name": "mlFeatures",
373
+ "path": "mlFeatures",
374
+ "isLineage": true,
375
+ "relationship": {
376
+ "name": "Consumes",
377
+ "entityTypes": [
378
+ "mlFeature"
379
+ ],
380
+ "isLineage": true
381
+ }
382
+ },
383
+ {
384
+ "name": "groups",
385
+ "path": "groups",
386
+ "isLineage": true,
387
+ "relationship": {
388
+ "name": "MemberOf",
389
+ "entityTypes": [
390
+ "mlModelGroup"
391
+ ],
392
+ "isLineage": true
393
+ }
394
+ }
395
+ ]
396
+ }
397
+ }
398
+ },
399
+ "generated_by": "metadata-ingestion/scripts/modeldocgen.py",
400
+ "generated_at": "2025-07-01T10:49:03.713749+00:00"
401
+ }
@@ -0,0 +1,177 @@
1
+ import json
2
+ import logging
3
+ from dataclasses import dataclass
4
+ from functools import lru_cache
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # Global cache for lineage data to avoid repeated file reads
11
+ _lineage_data: Optional["LineageData"] = None
12
+
13
+
14
@dataclass
class Field:
    """A single aspect field entry as described in lineage.json."""

    # Field name as recorded under the "name" key of the JSON entry.
    name: str
    # Value of the "path" key of the JSON entry (e.g. "upstreams.dataset").
    path: str
    # Value of the "isLineage" key of the JSON entry.
    isLineage: bool
    # Raw "relationship" mapping from the JSON entry, or None when absent.
    relationship: Optional[Dict]
20
+
21
+
22
@dataclass
class Aspect:
    """A named aspect together with the fields listed for it in lineage.json."""

    # Aspect name (the key under which the aspect appears for its entity).
    name: str
    # Parsed entries of the aspect's "fields" list, in file order.
    fields: List[Field]
26
+
27
+
28
@dataclass
class Entity:
    """An entity type and the aspects recorded for it in lineage.json."""

    # Entity type name (e.g. "dataset", "dataJob").
    name: str
    # Aspect name -> parsed Aspect for this entity.
    aspects: Dict[str, Aspect]
32
+
33
+
34
@dataclass
class LineageData:
    """Parsed contents of the autogenerated lineage.json file."""

    # Entity type name -> parsed Entity (which in turn holds its aspects).
    entities: Dict[str, Entity]
    # Value of the top-level "generated_by" key ("" when missing).
    generated_by: str
    # Value of the top-level "generated_at" key ("" when missing).
    generated_at: str
40
+
41
+
42
def get_lineage_data() -> LineageData:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the parsed lineage metadata, building it on first use.

    The result is memoized in the module-level ``_lineage_data`` variable, so the
    JSON payload is loaded and converted only once until that variable is reset.
    """
    global _lineage_data

    if _lineage_data is None:
        payload = _load_lineage_data()

        parsed_entities: Dict[str, Entity] = {}
        for entity_name, aspect_map in payload.get("entities", {}).items():
            parsed_entities[entity_name] = Entity(
                name=entity_name,
                aspects={
                    aspect_name: Aspect(
                        name=aspect_name,
                        fields=[
                            Field(
                                name=spec["name"],
                                path=spec["path"],
                                isLineage=spec["isLineage"],
                                # "relationship" is optional in the JSON entries.
                                relationship=spec.get("relationship", None),
                            )
                            for spec in aspect_payload.get("fields", [])
                        ],
                    )
                    for aspect_name, aspect_payload in aspect_map.items()
                },
            )

        _lineage_data = LineageData(
            entities=parsed_entities,
            generated_by=payload.get("generated_by", ""),
            generated_at=payload.get("generated_at", ""),
        )

    return _lineage_data
79
+
80
+
81
def get_all_aspect_names() -> List[str]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the names of every aspect that appears in the lineage data.

    Names are collected across *all* entities (not just the first one),
    de-duplicated, and returned in first-seen order. An empty list is
    returned when no lineage data is available.
    """
    entities = get_lineage_data().entities
    # dict.fromkeys acts as an insertion-ordered set, so an aspect name shared
    # by several entities is reported once.
    unique_names = dict.fromkeys(
        aspect_name
        for entity in entities.values()
        for aspect_name in entity.aspects
    )
    return list(unique_names)
90
+
91
+
92
def _load_lineage_data() -> Dict:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Load lineage data from the autogenerated lineage.json file that sits next
    to this module.

    Returns:
        Dict containing the lineage information. An empty dict is returned
        (and a message is logged) when the file does not exist or when its
        contents are not valid JSON; neither condition raises.
    """
    # Resolve lineage.json relative to this file.
    lineage_file = Path(__file__).parent / "lineage.json"

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(lineage_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        logger.warning(
            f"Lineage file not found: {lineage_file}. "
            "This may indicate a packaging issue. Lineage detection will be disabled."
        )
        return {}
    except json.JSONDecodeError as e:
        logger.error(
            f"Failed to parse lineage.json: {e}. Lineage detection will be disabled."
        )
        return {}
123
+
124
+
125
def _get_fields(entity_type: str, aspect_name: str) -> List[Dict]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the fields recorded for ``aspect_name`` of ``entity_type`` as plain
    dicts with the keys "name", "path", "isLineage" and "relationship".
    An empty list is returned when the entity or the aspect is unknown.
    """
    entity = get_lineage_data().entities.get(entity_type)
    aspect = entity.aspects.get(aspect_name) if entity else None
    if not aspect:
        return []

    rows: List[Dict] = []
    for spec in aspect.fields:
        rows.append(
            {
                "name": spec.name,
                "path": spec.path,
                "isLineage": spec.isLineage,
                "relationship": spec.relationship,
            }
        )
    return rows
147
+
148
+
149
def _get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return only those field dicts of the given entity/aspect whose
    "isLineage" value is truthy.
    """
    lineage_fields: List[Dict] = []
    for candidate in _get_fields(entity_type, aspect_name):
        if candidate.get("isLineage", False):
            lineage_fields.append(candidate)
    return lineage_fields
158
+
159
+
160
@lru_cache(maxsize=128)
def is_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Tell whether the given aspect of the given entity type has at least one
    lineage field. Results are memoized per (entity_type, aspect_name) pair.
    """
    # A non-empty list of lineage fields means the aspect carries lineage.
    return bool(_get_lineage_fields(entity_type, aspect_name))
166
+
167
+
168
def clear_cache() -> None:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Clear the internal caches of lineage data.

    This resets both the memoized parsed lineage data and the memoized
    results of ``is_lineage_aspect``, so the next lookup re-reads lineage.json.

    This is useful for testing or when the lineage.json file has been updated.
    """
    global _lineage_data
    _lineage_data = None
    # is_lineage_aspect is wrapped in lru_cache; without this, answers computed
    # from the previous lineage data would keep being served.
    is_lineage_aspect.cache_clear()
@@ -125,7 +125,7 @@ class AvroToMceSchemaConverter:
125
125
  self._prefix_name_stack: PrefixNameStack = [self.version_string]
126
126
  # Tracks the fields on the current path.
127
127
  self._fields_stack: FieldStack = []
128
- # Tracks the record types seen so far. Used to prevent infinite recursion with recursive types.
128
+ # Stack of record types currently being processed. Used to prevent infinite recursion with recursive types.
129
129
  self._record_types_seen: List[str] = []
130
130
  # If part of the key-schema or value-schema.
131
131
  self._is_key_schema = is_key_schema
@@ -522,10 +522,12 @@ class AvroToMceSchemaConverter:
522
522
  # Handle recursive record definitions
523
523
  recurse: bool = True
524
524
  if isinstance(schema, avro.schema.RecordSchema):
525
- if schema.fullname not in self._record_types_seen:
526
- self._record_types_seen.append(schema.fullname)
527
- else:
525
+ # Only prevent recursion if we're currently processing this record type (true recursion)
526
+ # Allow reuse of the same record type in different contexts
527
+ if schema.fullname in self._record_types_seen:
528
528
  recurse = False
529
+ else:
530
+ self._record_types_seen.append(schema.fullname)
529
531
 
530
532
  # Adjust actual schema if needed
531
533
  actual_schema = self._get_underlying_type_if_option_as_union(schema, schema)
@@ -559,6 +561,13 @@ class AvroToMceSchemaConverter:
559
561
  for sub_schema in self._get_sub_schemas(actual_schema):
560
562
  yield from self._to_mce_fields(sub_schema)
561
563
 
564
+ # Clean up the processing stack
565
+ if (
566
+ isinstance(schema, avro.schema.RecordSchema)
567
+ and schema.fullname in self._record_types_seen
568
+ ):
569
+ self._record_types_seen.remove(schema.fullname)
570
+
562
571
  def _gen_non_nested_to_mce_fields(
563
572
  self, schema: SchemaOrField
564
573
  ) -> Iterable[SchemaField]:
@@ -90,6 +90,11 @@ class ClassificationHandler:
90
90
 
91
91
  def get_classifiers(self) -> List[Classifier]:
92
92
  classifiers = []
93
+ if (
94
+ not isinstance(self.config, ClassificationSourceConfigMixin)
95
+ or self.config.classification is None
96
+ ):
97
+ return classifiers
93
98
 
94
99
  for classifier in self.config.classification.classifiers:
95
100
  classifier_class = classifier_registry.get(classifier.type)