acryl-datahub 1.3.0.1rc5__py3-none-any.whl → 1.3.0.1rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/METADATA +2439 -2439
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/RECORD +23 -20
- datahub/_version.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
- datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
- datahub/metadata/_internal_schema_classes.py +223 -0
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +206 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +228 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "record",
|
|
3
|
+
"Aspect": {
|
|
4
|
+
"name": "dataHubFileInfo"
|
|
5
|
+
},
|
|
6
|
+
"name": "DataHubFileInfo",
|
|
7
|
+
"namespace": "com.linkedin.pegasus2avro.file",
|
|
8
|
+
"fields": [
|
|
9
|
+
{
|
|
10
|
+
"type": {
|
|
11
|
+
"type": "record",
|
|
12
|
+
"name": "BucketStorageLocation",
|
|
13
|
+
"namespace": "com.linkedin.pegasus2avro.file",
|
|
14
|
+
"fields": [
|
|
15
|
+
{
|
|
16
|
+
"Searchable": {
|
|
17
|
+
"fieldType": "KEYWORD"
|
|
18
|
+
},
|
|
19
|
+
"type": "string",
|
|
20
|
+
"name": "storageBucket",
|
|
21
|
+
"doc": "The storage bucket this file is stored in"
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"Searchable": {
|
|
25
|
+
"fieldType": "KEYWORD"
|
|
26
|
+
},
|
|
27
|
+
"type": "string",
|
|
28
|
+
"name": "storageKey",
|
|
29
|
+
"doc": "The key for where this file is stored inside of the given bucket"
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"doc": "Information where a file is stored"
|
|
33
|
+
},
|
|
34
|
+
"name": "bucketStorageLocation",
|
|
35
|
+
"doc": "Info about where a file is stored"
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"Searchable": {
|
|
39
|
+
"fieldType": "TEXT_PARTIAL"
|
|
40
|
+
},
|
|
41
|
+
"type": "string",
|
|
42
|
+
"name": "originalFileName",
|
|
43
|
+
"doc": "The original filename as uploaded by the user"
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"Searchable": {
|
|
47
|
+
"fieldType": "KEYWORD"
|
|
48
|
+
},
|
|
49
|
+
"type": "string",
|
|
50
|
+
"name": "mimeType",
|
|
51
|
+
"doc": "MIME type of the file (e.g., image/png, application/pdf)"
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"type": "long",
|
|
55
|
+
"name": "sizeInBytes",
|
|
56
|
+
"doc": "Size of the file in bytes"
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"Searchable": {
|
|
60
|
+
"fieldType": "KEYWORD"
|
|
61
|
+
},
|
|
62
|
+
"type": {
|
|
63
|
+
"type": "enum",
|
|
64
|
+
"symbolDocs": {
|
|
65
|
+
"ASSET_DOCUMENTATION": "File uploaded for entity documentation"
|
|
66
|
+
},
|
|
67
|
+
"name": "FileUploadScenario",
|
|
68
|
+
"namespace": "com.linkedin.pegasus2avro.file",
|
|
69
|
+
"symbols": [
|
|
70
|
+
"ASSET_DOCUMENTATION"
|
|
71
|
+
]
|
|
72
|
+
},
|
|
73
|
+
"name": "scenario",
|
|
74
|
+
"doc": "The scenario/context in which this file was uploaded"
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
"Relationship": {
|
|
78
|
+
"entityTypes": [
|
|
79
|
+
"dataset",
|
|
80
|
+
"chart",
|
|
81
|
+
"container",
|
|
82
|
+
"dashboard",
|
|
83
|
+
"dataFlow",
|
|
84
|
+
"dataJob",
|
|
85
|
+
"glossaryTerm",
|
|
86
|
+
"glossaryNode",
|
|
87
|
+
"mlModel",
|
|
88
|
+
"mlFeature",
|
|
89
|
+
"notebook",
|
|
90
|
+
"mlFeatureTable",
|
|
91
|
+
"mlPrimaryKey",
|
|
92
|
+
"mlModelGroup",
|
|
93
|
+
"domain",
|
|
94
|
+
"dataProduct"
|
|
95
|
+
],
|
|
96
|
+
"name": "ReferencedBy"
|
|
97
|
+
},
|
|
98
|
+
"Searchable": {
|
|
99
|
+
"fieldType": "URN"
|
|
100
|
+
},
|
|
101
|
+
"java": {
|
|
102
|
+
"class": "com.linkedin.pegasus2avro.common.urn.Urn"
|
|
103
|
+
},
|
|
104
|
+
"type": [
|
|
105
|
+
"null",
|
|
106
|
+
"string"
|
|
107
|
+
],
|
|
108
|
+
"name": "referencedByAsset",
|
|
109
|
+
"default": null,
|
|
110
|
+
"doc": "Optional URN of the entity this file is associated with (e.g., the dataset whose docs contain this file)",
|
|
111
|
+
"Urn": "Urn",
|
|
112
|
+
"entityTypes": [
|
|
113
|
+
"dataset",
|
|
114
|
+
"chart",
|
|
115
|
+
"container",
|
|
116
|
+
"dashboard",
|
|
117
|
+
"dataFlow",
|
|
118
|
+
"dataJob",
|
|
119
|
+
"glossaryTerm",
|
|
120
|
+
"glossaryNode",
|
|
121
|
+
"mlModel",
|
|
122
|
+
"mlFeature",
|
|
123
|
+
"notebook",
|
|
124
|
+
"mlFeatureTable",
|
|
125
|
+
"mlPrimaryKey",
|
|
126
|
+
"mlModelGroup",
|
|
127
|
+
"domain",
|
|
128
|
+
"dataProduct"
|
|
129
|
+
]
|
|
130
|
+
},
|
|
131
|
+
{
|
|
132
|
+
"Relationship": {
|
|
133
|
+
"entityTypes": [
|
|
134
|
+
"schemaField"
|
|
135
|
+
],
|
|
136
|
+
"name": "ReferencedBy"
|
|
137
|
+
},
|
|
138
|
+
"Searchable": {
|
|
139
|
+
"fieldType": "URN"
|
|
140
|
+
},
|
|
141
|
+
"java": {
|
|
142
|
+
"class": "com.linkedin.pegasus2avro.common.urn.Urn"
|
|
143
|
+
},
|
|
144
|
+
"type": [
|
|
145
|
+
"null",
|
|
146
|
+
"string"
|
|
147
|
+
],
|
|
148
|
+
"name": "schemaField",
|
|
149
|
+
"default": null,
|
|
150
|
+
"doc": "The dataset schema field urn this file is referenced by",
|
|
151
|
+
"Urn": "Urn",
|
|
152
|
+
"entityTypes": [
|
|
153
|
+
"schemaField"
|
|
154
|
+
]
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
"Searchable": {
|
|
158
|
+
"/actor": {
|
|
159
|
+
"fieldName": "createdBy",
|
|
160
|
+
"fieldType": "URN"
|
|
161
|
+
},
|
|
162
|
+
"/time": {
|
|
163
|
+
"fieldName": "createdAt",
|
|
164
|
+
"fieldType": "DATETIME"
|
|
165
|
+
}
|
|
166
|
+
},
|
|
167
|
+
"type": {
|
|
168
|
+
"type": "record",
|
|
169
|
+
"name": "AuditStamp",
|
|
170
|
+
"namespace": "com.linkedin.pegasus2avro.common",
|
|
171
|
+
"fields": [
|
|
172
|
+
{
|
|
173
|
+
"type": "long",
|
|
174
|
+
"name": "time",
|
|
175
|
+
"doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
|
|
176
|
+
},
|
|
177
|
+
{
|
|
178
|
+
"java": {
|
|
179
|
+
"class": "com.linkedin.pegasus2avro.common.urn.Urn"
|
|
180
|
+
},
|
|
181
|
+
"type": "string",
|
|
182
|
+
"name": "actor",
|
|
183
|
+
"doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
|
|
184
|
+
"Urn": "Urn"
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
"java": {
|
|
188
|
+
"class": "com.linkedin.pegasus2avro.common.urn.Urn"
|
|
189
|
+
},
|
|
190
|
+
"type": [
|
|
191
|
+
"null",
|
|
192
|
+
"string"
|
|
193
|
+
],
|
|
194
|
+
"name": "impersonator",
|
|
195
|
+
"default": null,
|
|
196
|
+
"doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
|
|
197
|
+
"Urn": "Urn"
|
|
198
|
+
},
|
|
199
|
+
{
|
|
200
|
+
"type": [
|
|
201
|
+
"null",
|
|
202
|
+
"string"
|
|
203
|
+
],
|
|
204
|
+
"name": "message",
|
|
205
|
+
"default": null,
|
|
206
|
+
"doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
|
|
207
|
+
}
|
|
208
|
+
],
|
|
209
|
+
"doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
|
|
210
|
+
},
|
|
211
|
+
"name": "created",
|
|
212
|
+
"doc": "Timestamp when this file was created and by whom"
|
|
213
|
+
},
|
|
214
|
+
{
|
|
215
|
+
"Searchable": {
|
|
216
|
+
"fieldType": "KEYWORD"
|
|
217
|
+
},
|
|
218
|
+
"type": [
|
|
219
|
+
"null",
|
|
220
|
+
"string"
|
|
221
|
+
],
|
|
222
|
+
"name": "contentHash",
|
|
223
|
+
"default": null,
|
|
224
|
+
"doc": "SHA-256 hash of file contents"
|
|
225
|
+
}
|
|
226
|
+
],
|
|
227
|
+
"doc": "Information about a DataHub file - a file stored in S3 for use within DataHub platform features like documentation, home pages, and announcements."
|
|
228
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"type": "record",
|
|
3
|
+
"Aspect": {
|
|
4
|
+
"name": "dataHubFileKey",
|
|
5
|
+
"keyForEntity": "dataHubFile",
|
|
6
|
+
"entityCategory": "core",
|
|
7
|
+
"entityAspects": [
|
|
8
|
+
"dataHubFileInfo"
|
|
9
|
+
]
|
|
10
|
+
},
|
|
11
|
+
"name": "DataHubFileKey",
|
|
12
|
+
"namespace": "com.linkedin.pegasus2avro.metadata.key",
|
|
13
|
+
"fields": [
|
|
14
|
+
{
|
|
15
|
+
"type": "string",
|
|
16
|
+
"name": "id",
|
|
17
|
+
"doc": "Unique id for the file."
|
|
18
|
+
}
|
|
19
|
+
],
|
|
20
|
+
"doc": "Key for a DataHubFile"
|
|
21
|
+
}
|
|
@@ -1340,11 +1340,25 @@ class SqlParsingAggregator(Closeable):
|
|
|
1340
1340
|
upstreams.setdefault(upstream, query.query_id)
|
|
1341
1341
|
|
|
1342
1342
|
for lineage_info in query.column_lineage:
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1343
|
+
if (
|
|
1344
|
+
not lineage_info.downstream.column
|
|
1345
|
+
or not lineage_info.downstream.column.strip()
|
|
1346
|
+
):
|
|
1347
|
+
logger.debug(
|
|
1348
|
+
f"Skipping lineage entry with empty downstream column in query {query.query_id}"
|
|
1347
1349
|
)
|
|
1350
|
+
continue
|
|
1351
|
+
|
|
1352
|
+
for upstream_ref in lineage_info.upstreams:
|
|
1353
|
+
if upstream_ref.column and upstream_ref.column.strip():
|
|
1354
|
+
cll[lineage_info.downstream.column].setdefault(
|
|
1355
|
+
SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
|
|
1356
|
+
query.query_id,
|
|
1357
|
+
)
|
|
1358
|
+
else:
|
|
1359
|
+
logger.debug(
|
|
1360
|
+
f"Skipping empty column reference in lineage for query {query.query_id}"
|
|
1361
|
+
)
|
|
1348
1362
|
|
|
1349
1363
|
# Finally, we can build our lineage edge.
|
|
1350
1364
|
required_queries = OrderedSet[QueryId]()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|