acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3431 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +47 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +7 -7
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/source.py +19 -3
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +19 -9
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
- datahub/sql_parsing/sqlglot_lineage.py +40 -13
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
{
|
|
2
|
+
"entities": {
|
|
3
|
+
"dataJob": {
|
|
4
|
+
"dataJobInputOutput": {
|
|
5
|
+
"aspect": "dataJobInputOutput",
|
|
6
|
+
"fields": [
|
|
7
|
+
{
|
|
8
|
+
"name": "inputDatasets",
|
|
9
|
+
"path": "inputDatasets",
|
|
10
|
+
"isLineage": true,
|
|
11
|
+
"relationship": {
|
|
12
|
+
"name": "Consumes",
|
|
13
|
+
"entityTypes": [
|
|
14
|
+
"dataset"
|
|
15
|
+
],
|
|
16
|
+
"isLineage": true
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "inputDatasetEdges",
|
|
21
|
+
"path": "inputDatasetEdges",
|
|
22
|
+
"isLineage": true,
|
|
23
|
+
"relationship": {
|
|
24
|
+
"name": "Consumes",
|
|
25
|
+
"entityTypes": [
|
|
26
|
+
"dataset"
|
|
27
|
+
],
|
|
28
|
+
"isLineage": true
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"name": "outputDatasets",
|
|
33
|
+
"path": "outputDatasets",
|
|
34
|
+
"isLineage": true,
|
|
35
|
+
"relationship": {
|
|
36
|
+
"name": "Produces",
|
|
37
|
+
"entityTypes": [
|
|
38
|
+
"dataset"
|
|
39
|
+
],
|
|
40
|
+
"isLineage": true
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"name": "outputDatasetEdges",
|
|
45
|
+
"path": "outputDatasetEdges",
|
|
46
|
+
"isLineage": true,
|
|
47
|
+
"relationship": {
|
|
48
|
+
"name": "Produces",
|
|
49
|
+
"entityTypes": [
|
|
50
|
+
"dataset"
|
|
51
|
+
],
|
|
52
|
+
"isLineage": true
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"name": "inputDatajobs",
|
|
57
|
+
"path": "inputDatajobs",
|
|
58
|
+
"isLineage": true,
|
|
59
|
+
"relationship": {
|
|
60
|
+
"name": "DownstreamOf",
|
|
61
|
+
"entityTypes": [
|
|
62
|
+
"dataJob"
|
|
63
|
+
],
|
|
64
|
+
"isLineage": true
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"name": "inputDatajobEdges",
|
|
69
|
+
"path": "inputDatajobEdges",
|
|
70
|
+
"isLineage": true,
|
|
71
|
+
"relationship": {
|
|
72
|
+
"name": "DownstreamOf",
|
|
73
|
+
"entityTypes": [
|
|
74
|
+
"dataJob"
|
|
75
|
+
],
|
|
76
|
+
"isLineage": true
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
}
|
|
81
|
+
},
|
|
82
|
+
"dataProcessInstance": {
|
|
83
|
+
"dataProcessInstanceOutput": {
|
|
84
|
+
"aspect": "dataProcessInstanceOutput",
|
|
85
|
+
"fields": [
|
|
86
|
+
{
|
|
87
|
+
"name": "outputEdges",
|
|
88
|
+
"path": "outputEdges",
|
|
89
|
+
"isLineage": true,
|
|
90
|
+
"relationship": {
|
|
91
|
+
"name": "DataProcessInstanceProduces",
|
|
92
|
+
"entityTypes": [
|
|
93
|
+
"dataset",
|
|
94
|
+
"mlModel",
|
|
95
|
+
"dataProcessInstance"
|
|
96
|
+
],
|
|
97
|
+
"isLineage": true
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
]
|
|
101
|
+
},
|
|
102
|
+
"dataProcessInstanceInput": {
|
|
103
|
+
"aspect": "dataProcessInstanceInput",
|
|
104
|
+
"fields": [
|
|
105
|
+
{
|
|
106
|
+
"name": "inputEdges",
|
|
107
|
+
"path": "inputEdges",
|
|
108
|
+
"isLineage": true,
|
|
109
|
+
"relationship": {
|
|
110
|
+
"name": "DataProcessInstanceConsumes",
|
|
111
|
+
"entityTypes": [
|
|
112
|
+
"dataset",
|
|
113
|
+
"mlModel",
|
|
114
|
+
"dataProcessInstance"
|
|
115
|
+
],
|
|
116
|
+
"isLineage": true
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
]
|
|
120
|
+
}
|
|
121
|
+
},
|
|
122
|
+
"dataProcess": {
|
|
123
|
+
"dataProcessInfo": {
|
|
124
|
+
"aspect": "dataProcessInfo",
|
|
125
|
+
"fields": [
|
|
126
|
+
{
|
|
127
|
+
"name": "inputs",
|
|
128
|
+
"path": "inputs",
|
|
129
|
+
"isLineage": true,
|
|
130
|
+
"relationship": {
|
|
131
|
+
"name": "Consumes",
|
|
132
|
+
"entityTypes": [
|
|
133
|
+
"dataset"
|
|
134
|
+
],
|
|
135
|
+
"isLineage": true
|
|
136
|
+
}
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
"name": "outputs",
|
|
140
|
+
"path": "outputs",
|
|
141
|
+
"isLineage": true,
|
|
142
|
+
"relationship": {
|
|
143
|
+
"name": "Consumes",
|
|
144
|
+
"entityTypes": [
|
|
145
|
+
"dataset"
|
|
146
|
+
],
|
|
147
|
+
"isLineage": true
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
]
|
|
151
|
+
}
|
|
152
|
+
},
|
|
153
|
+
"dataset": {
|
|
154
|
+
"upstreamLineage": {
|
|
155
|
+
"aspect": "upstreamLineage",
|
|
156
|
+
"fields": [
|
|
157
|
+
{
|
|
158
|
+
"name": "dataset",
|
|
159
|
+
"path": "upstreams.dataset",
|
|
160
|
+
"isLineage": true,
|
|
161
|
+
"relationship": {
|
|
162
|
+
"name": "DownstreamOf",
|
|
163
|
+
"entityTypes": [
|
|
164
|
+
"dataset"
|
|
165
|
+
],
|
|
166
|
+
"isLineage": true
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
]
|
|
170
|
+
}
|
|
171
|
+
},
|
|
172
|
+
"chart": {
|
|
173
|
+
"chartInfo": {
|
|
174
|
+
"aspect": "chartInfo",
|
|
175
|
+
"fields": [
|
|
176
|
+
{
|
|
177
|
+
"name": "inputs",
|
|
178
|
+
"path": "inputs",
|
|
179
|
+
"isLineage": true,
|
|
180
|
+
"relationship": {
|
|
181
|
+
"name": "Consumes",
|
|
182
|
+
"entityTypes": [
|
|
183
|
+
"dataset"
|
|
184
|
+
],
|
|
185
|
+
"isLineage": true
|
|
186
|
+
}
|
|
187
|
+
},
|
|
188
|
+
{
|
|
189
|
+
"name": "inputEdges",
|
|
190
|
+
"path": "inputEdges",
|
|
191
|
+
"isLineage": true,
|
|
192
|
+
"relationship": {
|
|
193
|
+
"name": "Consumes",
|
|
194
|
+
"entityTypes": [
|
|
195
|
+
"dataset"
|
|
196
|
+
],
|
|
197
|
+
"isLineage": true
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
]
|
|
201
|
+
}
|
|
202
|
+
},
|
|
203
|
+
"dashboard": {
|
|
204
|
+
"dashboardInfo": {
|
|
205
|
+
"aspect": "dashboardInfo",
|
|
206
|
+
"fields": [
|
|
207
|
+
{
|
|
208
|
+
"name": "charts",
|
|
209
|
+
"path": "charts",
|
|
210
|
+
"isLineage": true,
|
|
211
|
+
"relationship": {
|
|
212
|
+
"name": "Contains",
|
|
213
|
+
"entityTypes": [
|
|
214
|
+
"chart"
|
|
215
|
+
],
|
|
216
|
+
"isLineage": true
|
|
217
|
+
}
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"name": "chartEdges",
|
|
221
|
+
"path": "chartEdges",
|
|
222
|
+
"isLineage": true,
|
|
223
|
+
"relationship": {
|
|
224
|
+
"name": "Contains",
|
|
225
|
+
"entityTypes": [
|
|
226
|
+
"chart"
|
|
227
|
+
],
|
|
228
|
+
"isLineage": true
|
|
229
|
+
}
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"name": "datasets",
|
|
233
|
+
"path": "datasets",
|
|
234
|
+
"isLineage": true,
|
|
235
|
+
"relationship": {
|
|
236
|
+
"name": "Consumes",
|
|
237
|
+
"entityTypes": [
|
|
238
|
+
"dataset"
|
|
239
|
+
],
|
|
240
|
+
"isLineage": true
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
{
|
|
244
|
+
"name": "datasetEdges",
|
|
245
|
+
"path": "datasetEdges",
|
|
246
|
+
"isLineage": true,
|
|
247
|
+
"relationship": {
|
|
248
|
+
"name": "Consumes",
|
|
249
|
+
"entityTypes": [
|
|
250
|
+
"dataset"
|
|
251
|
+
],
|
|
252
|
+
"isLineage": true
|
|
253
|
+
}
|
|
254
|
+
},
|
|
255
|
+
{
|
|
256
|
+
"name": "dashboards",
|
|
257
|
+
"path": "dashboards",
|
|
258
|
+
"isLineage": true,
|
|
259
|
+
"relationship": {
|
|
260
|
+
"name": "DashboardContainsDashboard",
|
|
261
|
+
"entityTypes": [
|
|
262
|
+
"dashboard"
|
|
263
|
+
],
|
|
264
|
+
"isLineage": true
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
]
|
|
268
|
+
}
|
|
269
|
+
},
|
|
270
|
+
"mlModelGroup": {
|
|
271
|
+
"mlModelGroupProperties": {
|
|
272
|
+
"aspect": "mlModelGroupProperties",
|
|
273
|
+
"fields": [
|
|
274
|
+
{
|
|
275
|
+
"name": "trainingJobs",
|
|
276
|
+
"path": "trainingJobs",
|
|
277
|
+
"isLineage": true,
|
|
278
|
+
"relationship": {
|
|
279
|
+
"name": "TrainedBy",
|
|
280
|
+
"entityTypes": [
|
|
281
|
+
"dataJob",
|
|
282
|
+
"dataProcessInstance"
|
|
283
|
+
],
|
|
284
|
+
"isLineage": true
|
|
285
|
+
}
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
"name": "downstreamJobs",
|
|
289
|
+
"path": "downstreamJobs",
|
|
290
|
+
"isLineage": true,
|
|
291
|
+
"relationship": {
|
|
292
|
+
"name": "UsedBy",
|
|
293
|
+
"entityTypes": [
|
|
294
|
+
"dataJob",
|
|
295
|
+
"dataProcessInstance"
|
|
296
|
+
],
|
|
297
|
+
"isLineage": true
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
]
|
|
301
|
+
}
|
|
302
|
+
},
|
|
303
|
+
"mlFeature": {
|
|
304
|
+
"mlFeatureProperties": {
|
|
305
|
+
"aspect": "mlFeatureProperties",
|
|
306
|
+
"fields": [
|
|
307
|
+
{
|
|
308
|
+
"name": "sources",
|
|
309
|
+
"path": "sources",
|
|
310
|
+
"isLineage": true,
|
|
311
|
+
"relationship": {
|
|
312
|
+
"name": "DerivedFrom",
|
|
313
|
+
"entityTypes": [
|
|
314
|
+
"dataset"
|
|
315
|
+
],
|
|
316
|
+
"isLineage": true
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
]
|
|
320
|
+
}
|
|
321
|
+
},
|
|
322
|
+
"mlPrimaryKey": {
|
|
323
|
+
"mlPrimaryKeyProperties": {
|
|
324
|
+
"aspect": "mlPrimaryKeyProperties",
|
|
325
|
+
"fields": [
|
|
326
|
+
{
|
|
327
|
+
"name": "sources",
|
|
328
|
+
"path": "sources",
|
|
329
|
+
"isLineage": true,
|
|
330
|
+
"relationship": {
|
|
331
|
+
"name": "DerivedFrom",
|
|
332
|
+
"entityTypes": [
|
|
333
|
+
"dataset"
|
|
334
|
+
],
|
|
335
|
+
"isLineage": true
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
]
|
|
339
|
+
}
|
|
340
|
+
},
|
|
341
|
+
"mlModel": {
|
|
342
|
+
"mlModelProperties": {
|
|
343
|
+
"aspect": "mlModelProperties",
|
|
344
|
+
"fields": [
|
|
345
|
+
{
|
|
346
|
+
"name": "trainingJobs",
|
|
347
|
+
"path": "trainingJobs",
|
|
348
|
+
"isLineage": true,
|
|
349
|
+
"relationship": {
|
|
350
|
+
"name": "TrainedBy",
|
|
351
|
+
"entityTypes": [
|
|
352
|
+
"dataJob",
|
|
353
|
+
"dataProcessInstance"
|
|
354
|
+
],
|
|
355
|
+
"isLineage": true
|
|
356
|
+
}
|
|
357
|
+
},
|
|
358
|
+
{
|
|
359
|
+
"name": "downstreamJobs",
|
|
360
|
+
"path": "downstreamJobs",
|
|
361
|
+
"isLineage": true,
|
|
362
|
+
"relationship": {
|
|
363
|
+
"name": "UsedBy",
|
|
364
|
+
"entityTypes": [
|
|
365
|
+
"dataJob",
|
|
366
|
+
"dataProcessInstance"
|
|
367
|
+
],
|
|
368
|
+
"isLineage": true
|
|
369
|
+
}
|
|
370
|
+
},
|
|
371
|
+
{
|
|
372
|
+
"name": "mlFeatures",
|
|
373
|
+
"path": "mlFeatures",
|
|
374
|
+
"isLineage": true,
|
|
375
|
+
"relationship": {
|
|
376
|
+
"name": "Consumes",
|
|
377
|
+
"entityTypes": [
|
|
378
|
+
"mlFeature"
|
|
379
|
+
],
|
|
380
|
+
"isLineage": true
|
|
381
|
+
}
|
|
382
|
+
},
|
|
383
|
+
{
|
|
384
|
+
"name": "groups",
|
|
385
|
+
"path": "groups",
|
|
386
|
+
"isLineage": true,
|
|
387
|
+
"relationship": {
|
|
388
|
+
"name": "MemberOf",
|
|
389
|
+
"entityTypes": [
|
|
390
|
+
"mlModelGroup"
|
|
391
|
+
],
|
|
392
|
+
"isLineage": true
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
]
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
},
|
|
399
|
+
"generated_by": "metadata-ingestion/scripts/modeldocgen.py",
|
|
400
|
+
"generated_at": "2025-07-01T10:49:03.713749+00:00"
|
|
401
|
+
}
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
|
+
from functools import lru_cache
|
|
3
4
|
from pathlib import Path
|
|
4
|
-
from typing import
|
|
5
|
-
|
|
6
|
-
from datahub.utilities.urns.urn import guess_entity_type
|
|
5
|
+
from typing import Dict, List, Optional
|
|
7
6
|
|
|
8
7
|
logger = logging.getLogger(__name__)
|
|
9
8
|
|
|
@@ -18,10 +17,9 @@ def _load_lineage_data() -> Dict:
|
|
|
18
17
|
Load lineage data from the autogenerated lineage.json file.
|
|
19
18
|
|
|
20
19
|
Returns:
|
|
21
|
-
Dict containing the lineage information
|
|
20
|
+
Dict containing the lineage information, or empty dict if file doesn't exist
|
|
22
21
|
|
|
23
22
|
Raises:
|
|
24
|
-
FileNotFoundError: If lineage.json doesn't exist
|
|
25
23
|
json.JSONDecodeError: If lineage.json is malformed
|
|
26
24
|
"""
|
|
27
25
|
global _lineage_data
|
|
@@ -34,151 +32,55 @@ def _load_lineage_data() -> Dict:
|
|
|
34
32
|
lineage_file = current_file.parent / "lineage.json"
|
|
35
33
|
|
|
36
34
|
if not lineage_file.exists():
|
|
37
|
-
|
|
35
|
+
logger.warning(
|
|
36
|
+
f"Lineage file not found: {lineage_file}. "
|
|
37
|
+
"This may indicate a packaging issue. Lineage detection will be disabled."
|
|
38
|
+
)
|
|
39
|
+
_lineage_data = {}
|
|
40
|
+
return _lineage_data
|
|
38
41
|
|
|
39
42
|
try:
|
|
40
43
|
with open(lineage_file, "r") as f:
|
|
41
44
|
_lineage_data = json.load(f)
|
|
42
45
|
return _lineage_data
|
|
43
46
|
except json.JSONDecodeError as e:
|
|
44
|
-
|
|
45
|
-
f"Failed to parse lineage.json: {e}
|
|
46
|
-
)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
|
|
50
|
-
"""
|
|
51
|
-
This is experimental internal API subject to breaking changes without prior notice.
|
|
52
|
-
|
|
53
|
-
Get lineage fields for a specific entity type and aspect.
|
|
54
|
-
|
|
55
|
-
Args:
|
|
56
|
-
entity_type: The entity type (e.g., 'dataset', 'dataJob')
|
|
57
|
-
aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
|
|
58
|
-
|
|
59
|
-
Returns:
|
|
60
|
-
List of lineage field dictionaries, each containing:
|
|
61
|
-
- name: field name
|
|
62
|
-
- path: dot-notation path to the field
|
|
63
|
-
- isLineage: boolean indicating if it's lineage
|
|
64
|
-
- relationship: relationship information
|
|
65
|
-
|
|
66
|
-
Raises:
|
|
67
|
-
FileNotFoundError: If lineage.json doesn't exist
|
|
68
|
-
json.JSONDecodeError: If lineage.json is malformed
|
|
69
|
-
"""
|
|
70
|
-
lineage_data = _load_lineage_data()
|
|
71
|
-
|
|
72
|
-
entity_data = lineage_data.get("entities", {}).get(entity_type, {})
|
|
73
|
-
aspect_data = entity_data.get(aspect_name, {})
|
|
74
|
-
|
|
75
|
-
return aspect_data.get("fields", [])
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
|
|
79
|
-
"""
|
|
80
|
-
This is experimental internal API subject to breaking changes without prior notice.
|
|
81
|
-
|
|
82
|
-
Check if a specific field path is lineage-related.
|
|
83
|
-
|
|
84
|
-
Args:
|
|
85
|
-
urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
|
|
86
|
-
aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
|
|
87
|
-
field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
|
|
88
|
-
|
|
89
|
-
Returns:
|
|
90
|
-
True if the field is lineage-related, False otherwise
|
|
91
|
-
|
|
92
|
-
Raises:
|
|
93
|
-
FileNotFoundError: If lineage.json doesn't exist
|
|
94
|
-
json.JSONDecodeError: If lineage.json is malformed
|
|
95
|
-
AssertionError: If URN doesn't start with 'urn:li:'
|
|
96
|
-
"""
|
|
97
|
-
entity_type = guess_entity_type(urn)
|
|
98
|
-
lineage_fields = get_lineage_fields(entity_type, aspect_name)
|
|
99
|
-
|
|
100
|
-
for field in lineage_fields:
|
|
101
|
-
if field.get("path") == field_path:
|
|
102
|
-
return field.get("isLineage", False)
|
|
103
|
-
|
|
104
|
-
return False
|
|
47
|
+
logger.error(
|
|
48
|
+
f"Failed to parse lineage.json: {e}. Lineage detection will be disabled."
|
|
49
|
+
)
|
|
50
|
+
_lineage_data = {}
|
|
51
|
+
return _lineage_data
|
|
105
52
|
|
|
106
53
|
|
|
107
|
-
def
|
|
54
|
+
def _get_fields(entity_type: str, aspect_name: str) -> List[Dict]:
|
|
108
55
|
"""
|
|
109
56
|
This is experimental internal API subject to breaking changes without prior notice.
|
|
110
|
-
|
|
111
|
-
Check if an aspect has any lineage fields.
|
|
112
|
-
|
|
113
|
-
Args:
|
|
114
|
-
urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
|
|
115
|
-
aspect: The aspect object
|
|
116
|
-
|
|
117
|
-
Returns:
|
|
118
|
-
True if the aspect has lineage fields, False otherwise
|
|
119
|
-
|
|
120
|
-
Raises:
|
|
121
|
-
FileNotFoundError: If lineage.json doesn't exist
|
|
122
|
-
json.JSONDecodeError: If lineage.json is malformed
|
|
123
|
-
AssertionError: If URN doesn't start with 'urn:li:'
|
|
124
57
|
"""
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
58
|
+
return (
|
|
59
|
+
_load_lineage_data()
|
|
60
|
+
.get("entities", {})
|
|
61
|
+
.get(entity_type, {})
|
|
62
|
+
.get(aspect_name, {})
|
|
63
|
+
.get("fields", [])
|
|
129
64
|
)
|
|
130
65
|
|
|
131
|
-
lineage_fields = get_lineage_fields(entity_type, aspect_name)
|
|
132
|
-
return len(lineage_fields) > 0
|
|
133
|
-
|
|
134
66
|
|
|
135
|
-
def
|
|
67
|
+
def _get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
|
|
136
68
|
"""
|
|
137
69
|
This is experimental internal API subject to breaking changes without prior notice.
|
|
138
|
-
|
|
139
|
-
Check if an aspect has any lineage fields.
|
|
140
|
-
|
|
141
|
-
Args:
|
|
142
|
-
entity_type: The entity type (e.g., 'dataset', 'dataJob')
|
|
143
|
-
aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
|
|
144
|
-
|
|
145
|
-
Returns:
|
|
146
|
-
True if the aspect has lineage fields, False otherwise
|
|
147
|
-
|
|
148
|
-
Raises:
|
|
149
|
-
FileNotFoundError: If lineage.json doesn't exist
|
|
150
|
-
json.JSONDecodeError: If lineage.json is malformed
|
|
151
70
|
"""
|
|
152
|
-
|
|
153
|
-
|
|
71
|
+
return [
|
|
72
|
+
field
|
|
73
|
+
for field in _get_fields(entity_type, aspect_name)
|
|
74
|
+
if field.get("isLineage", False)
|
|
75
|
+
]
|
|
154
76
|
|
|
155
77
|
|
|
156
|
-
|
|
78
|
+
@lru_cache(maxsize=128)
|
|
79
|
+
def is_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
|
|
157
80
|
"""
|
|
158
81
|
This is experimental internal API subject to breaking changes without prior notice.
|
|
159
|
-
|
|
160
|
-
Get all aspects that have lineage fields for a given entity type.
|
|
161
|
-
|
|
162
|
-
Args:
|
|
163
|
-
entity_type: The entity type (e.g., 'dataset', 'dataJob')
|
|
164
|
-
|
|
165
|
-
Returns:
|
|
166
|
-
Set of aspect names that have lineage fields
|
|
167
|
-
|
|
168
|
-
Raises:
|
|
169
|
-
FileNotFoundError: If lineage.json doesn't exist
|
|
170
|
-
json.JSONDecodeError: If lineage.json is malformed
|
|
171
82
|
"""
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
entity_data = lineage_data.get("entities", {}).get(entity_type, {})
|
|
175
|
-
lineage_aspects = set()
|
|
176
|
-
|
|
177
|
-
for aspect_name, aspect_data in entity_data.items():
|
|
178
|
-
if aspect_data.get("fields"):
|
|
179
|
-
lineage_aspects.add(aspect_name)
|
|
180
|
-
|
|
181
|
-
return lineage_aspects
|
|
83
|
+
return len(_get_lineage_fields(entity_type, aspect_name)) > 0
|
|
182
84
|
|
|
183
85
|
|
|
184
86
|
def clear_cache() -> None:
|
|
@@ -125,7 +125,7 @@ class AvroToMceSchemaConverter:
|
|
|
125
125
|
self._prefix_name_stack: PrefixNameStack = [self.version_string]
|
|
126
126
|
# Tracks the fields on the current path.
|
|
127
127
|
self._fields_stack: FieldStack = []
|
|
128
|
-
#
|
|
128
|
+
# Stack of record types currently being processed. Used to prevent infinite recursion with recursive types.
|
|
129
129
|
self._record_types_seen: List[str] = []
|
|
130
130
|
# If part of the key-schema or value-schema.
|
|
131
131
|
self._is_key_schema = is_key_schema
|
|
@@ -522,10 +522,12 @@ class AvroToMceSchemaConverter:
|
|
|
522
522
|
# Handle recursive record definitions
|
|
523
523
|
recurse: bool = True
|
|
524
524
|
if isinstance(schema, avro.schema.RecordSchema):
|
|
525
|
-
if
|
|
526
|
-
|
|
527
|
-
|
|
525
|
+
# Only prevent recursion if we're currently processing this record type (true recursion)
|
|
526
|
+
# Allow reuse of the same record type in different contexts
|
|
527
|
+
if schema.fullname in self._record_types_seen:
|
|
528
528
|
recurse = False
|
|
529
|
+
else:
|
|
530
|
+
self._record_types_seen.append(schema.fullname)
|
|
529
531
|
|
|
530
532
|
# Adjust actual schema if needed
|
|
531
533
|
actual_schema = self._get_underlying_type_if_option_as_union(schema, schema)
|
|
@@ -559,6 +561,13 @@ class AvroToMceSchemaConverter:
|
|
|
559
561
|
for sub_schema in self._get_sub_schemas(actual_schema):
|
|
560
562
|
yield from self._to_mce_fields(sub_schema)
|
|
561
563
|
|
|
564
|
+
# Clean up the processing stack
|
|
565
|
+
if (
|
|
566
|
+
isinstance(schema, avro.schema.RecordSchema)
|
|
567
|
+
and schema.fullname in self._record_types_seen
|
|
568
|
+
):
|
|
569
|
+
self._record_types_seen.remove(schema.fullname)
|
|
570
|
+
|
|
562
571
|
def _gen_non_nested_to_mce_fields(
|
|
563
572
|
self, schema: SchemaOrField
|
|
564
573
|
) -> Iterable[SchemaField]:
|
|
@@ -1576,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
1576
1576
|
env: str = DEFAULT_ENV,
|
|
1577
1577
|
default_db: Optional[str] = None,
|
|
1578
1578
|
default_schema: Optional[str] = None,
|
|
1579
|
-
|
|
1579
|
+
override_dialect: Optional[str] = None,
|
|
1580
1580
|
) -> "SqlParsingResult":
|
|
1581
1581
|
from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
|
|
1582
1582
|
|
|
@@ -1590,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
|
|
|
1590
1590
|
schema_resolver=schema_resolver,
|
|
1591
1591
|
default_db=default_db,
|
|
1592
1592
|
default_schema=default_schema,
|
|
1593
|
-
|
|
1593
|
+
override_dialect=override_dialect,
|
|
1594
1594
|
)
|
|
1595
1595
|
|
|
1596
1596
|
def create_tag(self, tag_name: str) -> str:
|