acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
{
|
|
2
|
+
"entities": {
|
|
3
|
+
"dataJob": {
|
|
4
|
+
"dataJobInputOutput": {
|
|
5
|
+
"aspect": "dataJobInputOutput",
|
|
6
|
+
"fields": [
|
|
7
|
+
{
|
|
8
|
+
"name": "inputDatasets",
|
|
9
|
+
"path": "inputDatasets",
|
|
10
|
+
"isLineage": true,
|
|
11
|
+
"relationship": {
|
|
12
|
+
"name": "Consumes",
|
|
13
|
+
"entityTypes": [
|
|
14
|
+
"dataset"
|
|
15
|
+
],
|
|
16
|
+
"isLineage": true
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "inputDatasetEdges",
|
|
21
|
+
"path": "inputDatasetEdges",
|
|
22
|
+
"isLineage": true,
|
|
23
|
+
"relationship": {
|
|
24
|
+
"name": "Consumes",
|
|
25
|
+
"entityTypes": [
|
|
26
|
+
"dataset"
|
|
27
|
+
],
|
|
28
|
+
"isLineage": true
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"name": "outputDatasets",
|
|
33
|
+
"path": "outputDatasets",
|
|
34
|
+
"isLineage": true,
|
|
35
|
+
"relationship": {
|
|
36
|
+
"name": "Produces",
|
|
37
|
+
"entityTypes": [
|
|
38
|
+
"dataset"
|
|
39
|
+
],
|
|
40
|
+
"isLineage": true
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"name": "outputDatasetEdges",
|
|
45
|
+
"path": "outputDatasetEdges",
|
|
46
|
+
"isLineage": true,
|
|
47
|
+
"relationship": {
|
|
48
|
+
"name": "Produces",
|
|
49
|
+
"entityTypes": [
|
|
50
|
+
"dataset"
|
|
51
|
+
],
|
|
52
|
+
"isLineage": true
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"name": "inputDatajobs",
|
|
57
|
+
"path": "inputDatajobs",
|
|
58
|
+
"isLineage": true,
|
|
59
|
+
"relationship": {
|
|
60
|
+
"name": "DownstreamOf",
|
|
61
|
+
"entityTypes": [
|
|
62
|
+
"dataJob"
|
|
63
|
+
],
|
|
64
|
+
"isLineage": true
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"name": "inputDatajobEdges",
|
|
69
|
+
"path": "inputDatajobEdges",
|
|
70
|
+
"isLineage": true,
|
|
71
|
+
"relationship": {
|
|
72
|
+
"name": "DownstreamOf",
|
|
73
|
+
"entityTypes": [
|
|
74
|
+
"dataJob"
|
|
75
|
+
],
|
|
76
|
+
"isLineage": true
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
}
|
|
81
|
+
},
|
|
82
|
+
"dataProcessInstance": {
|
|
83
|
+
"dataProcessInstanceOutput": {
|
|
84
|
+
"aspect": "dataProcessInstanceOutput",
|
|
85
|
+
"fields": [
|
|
86
|
+
{
|
|
87
|
+
"name": "outputEdges",
|
|
88
|
+
"path": "outputEdges",
|
|
89
|
+
"isLineage": true,
|
|
90
|
+
"relationship": {
|
|
91
|
+
"name": "DataProcessInstanceProduces",
|
|
92
|
+
"entityTypes": [
|
|
93
|
+
"dataset",
|
|
94
|
+
"mlModel",
|
|
95
|
+
"dataProcessInstance"
|
|
96
|
+
],
|
|
97
|
+
"isLineage": true
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
]
|
|
101
|
+
},
|
|
102
|
+
"dataProcessInstanceInput": {
|
|
103
|
+
"aspect": "dataProcessInstanceInput",
|
|
104
|
+
"fields": [
|
|
105
|
+
{
|
|
106
|
+
"name": "inputEdges",
|
|
107
|
+
"path": "inputEdges",
|
|
108
|
+
"isLineage": true,
|
|
109
|
+
"relationship": {
|
|
110
|
+
"name": "DataProcessInstanceConsumes",
|
|
111
|
+
"entityTypes": [
|
|
112
|
+
"dataset",
|
|
113
|
+
"mlModel",
|
|
114
|
+
"dataProcessInstance"
|
|
115
|
+
],
|
|
116
|
+
"isLineage": true
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
]
|
|
120
|
+
}
|
|
121
|
+
},
|
|
122
|
+
"dataProcess": {
|
|
123
|
+
"dataProcessInfo": {
|
|
124
|
+
"aspect": "dataProcessInfo",
|
|
125
|
+
"fields": [
|
|
126
|
+
{
|
|
127
|
+
"name": "inputs",
|
|
128
|
+
"path": "inputs",
|
|
129
|
+
"isLineage": true,
|
|
130
|
+
"relationship": {
|
|
131
|
+
"name": "Consumes",
|
|
132
|
+
"entityTypes": [
|
|
133
|
+
"dataset"
|
|
134
|
+
],
|
|
135
|
+
"isLineage": true
|
|
136
|
+
}
|
|
137
|
+
},
|
|
138
|
+
{
|
|
139
|
+
"name": "outputs",
|
|
140
|
+
"path": "outputs",
|
|
141
|
+
"isLineage": true,
|
|
142
|
+
"relationship": {
|
|
143
|
+
"name": "Consumes",
|
|
144
|
+
"entityTypes": [
|
|
145
|
+
"dataset"
|
|
146
|
+
],
|
|
147
|
+
"isLineage": true
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
]
|
|
151
|
+
}
|
|
152
|
+
},
|
|
153
|
+
"dataset": {
|
|
154
|
+
"upstreamLineage": {
|
|
155
|
+
"aspect": "upstreamLineage",
|
|
156
|
+
"fields": [
|
|
157
|
+
{
|
|
158
|
+
"name": "dataset",
|
|
159
|
+
"path": "upstreams.dataset",
|
|
160
|
+
"isLineage": true,
|
|
161
|
+
"relationship": {
|
|
162
|
+
"name": "DownstreamOf",
|
|
163
|
+
"entityTypes": [
|
|
164
|
+
"dataset"
|
|
165
|
+
],
|
|
166
|
+
"isLineage": true
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
]
|
|
170
|
+
}
|
|
171
|
+
},
|
|
172
|
+
"chart": {
|
|
173
|
+
"chartInfo": {
|
|
174
|
+
"aspect": "chartInfo",
|
|
175
|
+
"fields": [
|
|
176
|
+
{
|
|
177
|
+
"name": "inputs",
|
|
178
|
+
"path": "inputs",
|
|
179
|
+
"isLineage": true,
|
|
180
|
+
"relationship": {
|
|
181
|
+
"name": "Consumes",
|
|
182
|
+
"entityTypes": [
|
|
183
|
+
"dataset"
|
|
184
|
+
],
|
|
185
|
+
"isLineage": true
|
|
186
|
+
}
|
|
187
|
+
},
|
|
188
|
+
{
|
|
189
|
+
"name": "inputEdges",
|
|
190
|
+
"path": "inputEdges",
|
|
191
|
+
"isLineage": true,
|
|
192
|
+
"relationship": {
|
|
193
|
+
"name": "Consumes",
|
|
194
|
+
"entityTypes": [
|
|
195
|
+
"dataset"
|
|
196
|
+
],
|
|
197
|
+
"isLineage": true
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
]
|
|
201
|
+
}
|
|
202
|
+
},
|
|
203
|
+
"dashboard": {
|
|
204
|
+
"dashboardInfo": {
|
|
205
|
+
"aspect": "dashboardInfo",
|
|
206
|
+
"fields": [
|
|
207
|
+
{
|
|
208
|
+
"name": "charts",
|
|
209
|
+
"path": "charts",
|
|
210
|
+
"isLineage": true,
|
|
211
|
+
"relationship": {
|
|
212
|
+
"name": "Contains",
|
|
213
|
+
"entityTypes": [
|
|
214
|
+
"chart"
|
|
215
|
+
],
|
|
216
|
+
"isLineage": true
|
|
217
|
+
}
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"name": "chartEdges",
|
|
221
|
+
"path": "chartEdges",
|
|
222
|
+
"isLineage": true,
|
|
223
|
+
"relationship": {
|
|
224
|
+
"name": "Contains",
|
|
225
|
+
"entityTypes": [
|
|
226
|
+
"chart"
|
|
227
|
+
],
|
|
228
|
+
"isLineage": true
|
|
229
|
+
}
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"name": "datasets",
|
|
233
|
+
"path": "datasets",
|
|
234
|
+
"isLineage": true,
|
|
235
|
+
"relationship": {
|
|
236
|
+
"name": "Consumes",
|
|
237
|
+
"entityTypes": [
|
|
238
|
+
"dataset"
|
|
239
|
+
],
|
|
240
|
+
"isLineage": true
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
{
|
|
244
|
+
"name": "datasetEdges",
|
|
245
|
+
"path": "datasetEdges",
|
|
246
|
+
"isLineage": true,
|
|
247
|
+
"relationship": {
|
|
248
|
+
"name": "Consumes",
|
|
249
|
+
"entityTypes": [
|
|
250
|
+
"dataset"
|
|
251
|
+
],
|
|
252
|
+
"isLineage": true
|
|
253
|
+
}
|
|
254
|
+
},
|
|
255
|
+
{
|
|
256
|
+
"name": "dashboards",
|
|
257
|
+
"path": "dashboards",
|
|
258
|
+
"isLineage": true,
|
|
259
|
+
"relationship": {
|
|
260
|
+
"name": "DashboardContainsDashboard",
|
|
261
|
+
"entityTypes": [
|
|
262
|
+
"dashboard"
|
|
263
|
+
],
|
|
264
|
+
"isLineage": true
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
]
|
|
268
|
+
}
|
|
269
|
+
},
|
|
270
|
+
"mlModelGroup": {
|
|
271
|
+
"mlModelGroupProperties": {
|
|
272
|
+
"aspect": "mlModelGroupProperties",
|
|
273
|
+
"fields": [
|
|
274
|
+
{
|
|
275
|
+
"name": "trainingJobs",
|
|
276
|
+
"path": "trainingJobs",
|
|
277
|
+
"isLineage": true,
|
|
278
|
+
"relationship": {
|
|
279
|
+
"name": "TrainedBy",
|
|
280
|
+
"entityTypes": [
|
|
281
|
+
"dataJob",
|
|
282
|
+
"dataProcessInstance"
|
|
283
|
+
],
|
|
284
|
+
"isLineage": true
|
|
285
|
+
}
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
"name": "downstreamJobs",
|
|
289
|
+
"path": "downstreamJobs",
|
|
290
|
+
"isLineage": true,
|
|
291
|
+
"relationship": {
|
|
292
|
+
"name": "UsedBy",
|
|
293
|
+
"entityTypes": [
|
|
294
|
+
"dataJob",
|
|
295
|
+
"dataProcessInstance"
|
|
296
|
+
],
|
|
297
|
+
"isLineage": true
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
]
|
|
301
|
+
}
|
|
302
|
+
},
|
|
303
|
+
"mlFeature": {
|
|
304
|
+
"mlFeatureProperties": {
|
|
305
|
+
"aspect": "mlFeatureProperties",
|
|
306
|
+
"fields": [
|
|
307
|
+
{
|
|
308
|
+
"name": "sources",
|
|
309
|
+
"path": "sources",
|
|
310
|
+
"isLineage": true,
|
|
311
|
+
"relationship": {
|
|
312
|
+
"name": "DerivedFrom",
|
|
313
|
+
"entityTypes": [
|
|
314
|
+
"dataset"
|
|
315
|
+
],
|
|
316
|
+
"isLineage": true
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
]
|
|
320
|
+
}
|
|
321
|
+
},
|
|
322
|
+
"mlPrimaryKey": {
|
|
323
|
+
"mlPrimaryKeyProperties": {
|
|
324
|
+
"aspect": "mlPrimaryKeyProperties",
|
|
325
|
+
"fields": [
|
|
326
|
+
{
|
|
327
|
+
"name": "sources",
|
|
328
|
+
"path": "sources",
|
|
329
|
+
"isLineage": true,
|
|
330
|
+
"relationship": {
|
|
331
|
+
"name": "DerivedFrom",
|
|
332
|
+
"entityTypes": [
|
|
333
|
+
"dataset"
|
|
334
|
+
],
|
|
335
|
+
"isLineage": true
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
]
|
|
339
|
+
}
|
|
340
|
+
},
|
|
341
|
+
"mlModel": {
|
|
342
|
+
"mlModelProperties": {
|
|
343
|
+
"aspect": "mlModelProperties",
|
|
344
|
+
"fields": [
|
|
345
|
+
{
|
|
346
|
+
"name": "trainingJobs",
|
|
347
|
+
"path": "trainingJobs",
|
|
348
|
+
"isLineage": true,
|
|
349
|
+
"relationship": {
|
|
350
|
+
"name": "TrainedBy",
|
|
351
|
+
"entityTypes": [
|
|
352
|
+
"dataJob",
|
|
353
|
+
"dataProcessInstance"
|
|
354
|
+
],
|
|
355
|
+
"isLineage": true
|
|
356
|
+
}
|
|
357
|
+
},
|
|
358
|
+
{
|
|
359
|
+
"name": "downstreamJobs",
|
|
360
|
+
"path": "downstreamJobs",
|
|
361
|
+
"isLineage": true,
|
|
362
|
+
"relationship": {
|
|
363
|
+
"name": "UsedBy",
|
|
364
|
+
"entityTypes": [
|
|
365
|
+
"dataJob",
|
|
366
|
+
"dataProcessInstance"
|
|
367
|
+
],
|
|
368
|
+
"isLineage": true
|
|
369
|
+
}
|
|
370
|
+
},
|
|
371
|
+
{
|
|
372
|
+
"name": "mlFeatures",
|
|
373
|
+
"path": "mlFeatures",
|
|
374
|
+
"isLineage": true,
|
|
375
|
+
"relationship": {
|
|
376
|
+
"name": "Consumes",
|
|
377
|
+
"entityTypes": [
|
|
378
|
+
"mlFeature"
|
|
379
|
+
],
|
|
380
|
+
"isLineage": true
|
|
381
|
+
}
|
|
382
|
+
},
|
|
383
|
+
{
|
|
384
|
+
"name": "groups",
|
|
385
|
+
"path": "groups",
|
|
386
|
+
"isLineage": true,
|
|
387
|
+
"relationship": {
|
|
388
|
+
"name": "MemberOf",
|
|
389
|
+
"entityTypes": [
|
|
390
|
+
"mlModelGroup"
|
|
391
|
+
],
|
|
392
|
+
"isLineage": true
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
]
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
},
|
|
399
|
+
"generated_by": "metadata-ingestion/scripts/modeldocgen.py",
|
|
400
|
+
"generated_at": "2025-07-01T10:49:03.713749+00:00"
|
|
401
|
+
}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# Global cache for lineage data to avoid repeated file reads
|
|
11
|
+
_lineage_data: Optional["LineageData"] = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Field:
    """One field entry of an aspect, as listed under "fields" in lineage.json."""

    # Value of the entry's "name" key.
    name: str
    # Value of the entry's "path" key.
    path: str
    # Value of the entry's "isLineage" key.
    isLineage: bool
    # Value of the entry's "relationship" key, or None when the key is absent.
    relationship: Optional[Dict]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Aspect:
    """A named aspect together with the fields recorded for it."""

    # Aspect name (the key it is stored under inside its entity).
    name: str
    # Field entries parsed from the aspect's "fields" list.
    fields: List[Field]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class Entity:
    """A named entity type together with its aspects."""

    # Entity type name (the key it is stored under in "entities").
    name: str
    # aspect name -> Aspect
    aspects: Dict[str, Aspect]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class LineageData:
    """Parsed representation of the whole lineage.json document."""

    # entity name -> Entity
    entities: Dict[str, Entity]
    # Value of the top-level "generated_by" key ("" when missing).
    generated_by: str
    # Value of the top-level "generated_at" key ("" when missing).
    generated_at: str
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_lineage_data() -> LineageData:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the parsed lineage data. The first call loads and parses the raw
    data; later calls return the instance cached in the module-level
    ``_lineage_data`` variable.
    """
    global _lineage_data

    if _lineage_data is None:
        raw_data = _load_lineage_data()

        parsed_entities = {}
        for entity_name, entity_data in raw_data.get("entities", {}).items():
            # Each entity maps aspect name -> {"fields": [...]}.
            parsed_aspects = {
                aspect_name: Aspect(
                    name=aspect_name,
                    fields=[
                        Field(
                            name=raw_field["name"],
                            path=raw_field["path"],
                            isLineage=raw_field["isLineage"],
                            relationship=raw_field.get("relationship", None),
                        )
                        for raw_field in aspect_data.get("fields", [])
                    ],
                )
                for aspect_name, aspect_data in entity_data.items()
            }
            parsed_entities[entity_name] = Entity(
                name=entity_name, aspects=parsed_aspects
            )

        _lineage_data = LineageData(
            entities=parsed_entities,
            generated_by=raw_data.get("generated_by", ""),
            generated_at=raw_data.get("generated_at", ""),
        )

    return _lineage_data
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_all_aspect_names() -> List[str]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the names of all aspects present in the lineage data, across every
    entity type. Each name appears once, in first-seen order.

    Returns:
        List of aspect names; empty when no lineage data is available.
    """
    entities = get_lineage_data().entities
    # Previously only the first entity's aspects were returned, which dropped
    # the aspects of every other entity type. dict.fromkeys de-duplicates while
    # preserving insertion order.
    return list(
        dict.fromkeys(
            aspect_name
            for entity in entities.values()
            for aspect_name in entity.aspects
        )
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _load_lineage_data() -> Dict:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Load lineage data from the autogenerated lineage.json file located in the
    same directory as this module.

    Returns:
        Dict containing the lineage information. An empty dict is returned,
        and a message is logged, when the file does not exist or when its
        content is not valid JSON; ``json.JSONDecodeError`` is handled here
        and is not propagated to the caller.
    """
    # Get the path to lineage.json relative to this file
    lineage_file = Path(__file__).parent / "lineage.json"

    if not lineage_file.exists():
        logger.warning(
            f"Lineage file not found: {lineage_file}. "
            "This may indicate a packaging issue. Lineage detection will be disabled."
        )
        return {}

    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's locale encoding.
        with open(lineage_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        logger.error(
            f"Failed to parse lineage.json: {e}. Lineage detection will be disabled."
        )
        return {}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _get_fields(entity_type: str, aspect_name: str) -> List[Dict]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the fields recorded for ``aspect_name`` of ``entity_type`` as plain
    dicts with the keys "name", "path", "isLineage" and "relationship".
    An empty list is returned when the entity or the aspect is unknown.
    """
    entity = get_lineage_data().entities.get(entity_type)
    aspect = entity.aspects.get(aspect_name) if entity else None
    if not aspect:
        return []

    rows: List[Dict] = []
    for item in aspect.fields:
        rows.append(
            {
                "name": item.name,
                "path": item.path,
                "isLineage": item.isLineage,
                "relationship": item.relationship,
            }
        )
    return rows
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return only those field dicts of the given entity/aspect whose
    "isLineage" value is truthy.
    """
    selected: List[Dict] = []
    for candidate in _get_fields(entity_type, aspect_name):
        if candidate.get("isLineage", False):
            selected.append(candidate)
    return selected
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@lru_cache(maxsize=128)
def is_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Tell whether the given aspect of the given entity type has at least one
    lineage field. Results are memoized per (entity_type, aspect_name) pair.
    """
    lineage_fields = _get_lineage_fields(entity_type, aspect_name)
    return bool(lineage_fields)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def clear_cache() -> None:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Clear the internal caches of lineage data: the parsed module-level
    ``_lineage_data`` and the memoized results of ``is_lineage_aspect``.

    This is useful for testing or when the lineage.json file has been updated.
    """
    global _lineage_data
    _lineage_data = None
    # is_lineage_aspect is wrapped in lru_cache and its answers are derived
    # from the data dropped above; without this call it would keep returning
    # stale results after a "clear".
    is_lineage_aspect.cache_clear()
|
|
@@ -125,7 +125,7 @@ class AvroToMceSchemaConverter:
|
|
|
125
125
|
self._prefix_name_stack: PrefixNameStack = [self.version_string]
|
|
126
126
|
# Tracks the fields on the current path.
|
|
127
127
|
self._fields_stack: FieldStack = []
|
|
128
|
-
#
|
|
128
|
+
# Stack of record types currently being processed. Used to prevent infinite recursion with recursive types.
|
|
129
129
|
self._record_types_seen: List[str] = []
|
|
130
130
|
# If part of the key-schema or value-schema.
|
|
131
131
|
self._is_key_schema = is_key_schema
|
|
@@ -522,10 +522,12 @@ class AvroToMceSchemaConverter:
|
|
|
522
522
|
# Handle recursive record definitions
|
|
523
523
|
recurse: bool = True
|
|
524
524
|
if isinstance(schema, avro.schema.RecordSchema):
|
|
525
|
-
if
|
|
526
|
-
|
|
527
|
-
|
|
525
|
+
# Only prevent recursion if we're currently processing this record type (true recursion)
|
|
526
|
+
# Allow reuse of the same record type in different contexts
|
|
527
|
+
if schema.fullname in self._record_types_seen:
|
|
528
528
|
recurse = False
|
|
529
|
+
else:
|
|
530
|
+
self._record_types_seen.append(schema.fullname)
|
|
529
531
|
|
|
530
532
|
# Adjust actual schema if needed
|
|
531
533
|
actual_schema = self._get_underlying_type_if_option_as_union(schema, schema)
|
|
@@ -559,6 +561,13 @@ class AvroToMceSchemaConverter:
|
|
|
559
561
|
for sub_schema in self._get_sub_schemas(actual_schema):
|
|
560
562
|
yield from self._to_mce_fields(sub_schema)
|
|
561
563
|
|
|
564
|
+
# Clean up the processing stack
|
|
565
|
+
if (
|
|
566
|
+
isinstance(schema, avro.schema.RecordSchema)
|
|
567
|
+
and schema.fullname in self._record_types_seen
|
|
568
|
+
):
|
|
569
|
+
self._record_types_seen.remove(schema.fullname)
|
|
570
|
+
|
|
562
571
|
def _gen_non_nested_to_mce_fields(
|
|
563
572
|
self, schema: SchemaOrField
|
|
564
573
|
) -> Iterable[SchemaField]:
|
|
@@ -90,6 +90,11 @@ class ClassificationHandler:
|
|
|
90
90
|
|
|
91
91
|
def get_classifiers(self) -> List[Classifier]:
|
|
92
92
|
classifiers = []
|
|
93
|
+
if (
|
|
94
|
+
not isinstance(self.config, ClassificationSourceConfigMixin)
|
|
95
|
+
or self.config.classification is None
|
|
96
|
+
):
|
|
97
|
+
return classifiers
|
|
93
98
|
|
|
94
99
|
for classifier in self.config.classification.classifiers:
|
|
95
100
|
classifier_class = classifier_registry.get(classifier.type)
|