acryl-datahub 1.3.0.1rc4__py3-none-any.whl → 1.3.0.1rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (31)
  1. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/METADATA +2637 -2633
  2. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/RECORD +31 -28
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/source/aws/aws_common.py +161 -0
  5. datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
  6. datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +5 -3
  8. datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
  9. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  10. datahub/ingestion/source/redshift/usage.py +2 -2
  11. datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
  12. datahub/ingestion/source/snowflake/snowflake_queries.py +46 -6
  13. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
  14. datahub/ingestion/source/sql/mysql.py +101 -4
  15. datahub/ingestion/source/sql/postgres.py +81 -4
  16. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  17. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  18. datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
  19. datahub/metadata/_internal_schema_classes.py +772 -546
  20. datahub/metadata/_urns/urn_defs.py +1751 -1695
  21. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  22. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  23. datahub/metadata/schema.avsc +18450 -18242
  24. datahub/metadata/schemas/DataHubFileInfo.avsc +228 -0
  25. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  26. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +3 -1
  27. datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
  28. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/WHEEL +0 -0
  29. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/entry_points.txt +0 -0
  30. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/licenses/LICENSE +0 -0
  31. {acryl_datahub-1.3.0.1rc4.dist-info → acryl_datahub-1.3.0.1rc6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,228 @@
1
+ {
2
+ "type": "record",
3
+ "Aspect": {
4
+ "name": "dataHubFileInfo"
5
+ },
6
+ "name": "DataHubFileInfo",
7
+ "namespace": "com.linkedin.pegasus2avro.file",
8
+ "fields": [
9
+ {
10
+ "type": {
11
+ "type": "record",
12
+ "name": "BucketStorageLocation",
13
+ "namespace": "com.linkedin.pegasus2avro.file",
14
+ "fields": [
15
+ {
16
+ "Searchable": {
17
+ "fieldType": "KEYWORD"
18
+ },
19
+ "type": "string",
20
+ "name": "storageBucket",
21
+ "doc": "The storage bucket this file is stored in"
22
+ },
23
+ {
24
+ "Searchable": {
25
+ "fieldType": "KEYWORD"
26
+ },
27
+ "type": "string",
28
+ "name": "storageKey",
29
+ "doc": "The key for where this file is stored inside of the given bucket"
30
+ }
31
+ ],
32
+ "doc": "Information where a file is stored"
33
+ },
34
+ "name": "bucketStorageLocation",
35
+ "doc": "Info about where a file is stored"
36
+ },
37
+ {
38
+ "Searchable": {
39
+ "fieldType": "TEXT_PARTIAL"
40
+ },
41
+ "type": "string",
42
+ "name": "originalFileName",
43
+ "doc": "The original filename as uploaded by the user"
44
+ },
45
+ {
46
+ "Searchable": {
47
+ "fieldType": "KEYWORD"
48
+ },
49
+ "type": "string",
50
+ "name": "mimeType",
51
+ "doc": "MIME type of the file (e.g., image/png, application/pdf)"
52
+ },
53
+ {
54
+ "type": "long",
55
+ "name": "sizeInBytes",
56
+ "doc": "Size of the file in bytes"
57
+ },
58
+ {
59
+ "Searchable": {
60
+ "fieldType": "KEYWORD"
61
+ },
62
+ "type": {
63
+ "type": "enum",
64
+ "symbolDocs": {
65
+ "ASSET_DOCUMENTATION": "File uploaded for entity documentation"
66
+ },
67
+ "name": "FileUploadScenario",
68
+ "namespace": "com.linkedin.pegasus2avro.file",
69
+ "symbols": [
70
+ "ASSET_DOCUMENTATION"
71
+ ]
72
+ },
73
+ "name": "scenario",
74
+ "doc": "The scenario/context in which this file was uploaded"
75
+ },
76
+ {
77
+ "Relationship": {
78
+ "entityTypes": [
79
+ "dataset",
80
+ "chart",
81
+ "container",
82
+ "dashboard",
83
+ "dataFlow",
84
+ "dataJob",
85
+ "glossaryTerm",
86
+ "glossaryNode",
87
+ "mlModel",
88
+ "mlFeature",
89
+ "notebook",
90
+ "mlFeatureTable",
91
+ "mlPrimaryKey",
92
+ "mlModelGroup",
93
+ "domain",
94
+ "dataProduct"
95
+ ],
96
+ "name": "ReferencedBy"
97
+ },
98
+ "Searchable": {
99
+ "fieldType": "URN"
100
+ },
101
+ "java": {
102
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
103
+ },
104
+ "type": [
105
+ "null",
106
+ "string"
107
+ ],
108
+ "name": "referencedByAsset",
109
+ "default": null,
110
+ "doc": "Optional URN of the entity this file is associated with (e.g., the dataset whose docs contain this file)",
111
+ "Urn": "Urn",
112
+ "entityTypes": [
113
+ "dataset",
114
+ "chart",
115
+ "container",
116
+ "dashboard",
117
+ "dataFlow",
118
+ "dataJob",
119
+ "glossaryTerm",
120
+ "glossaryNode",
121
+ "mlModel",
122
+ "mlFeature",
123
+ "notebook",
124
+ "mlFeatureTable",
125
+ "mlPrimaryKey",
126
+ "mlModelGroup",
127
+ "domain",
128
+ "dataProduct"
129
+ ]
130
+ },
131
+ {
132
+ "Relationship": {
133
+ "entityTypes": [
134
+ "schemaField"
135
+ ],
136
+ "name": "ReferencedBy"
137
+ },
138
+ "Searchable": {
139
+ "fieldType": "URN"
140
+ },
141
+ "java": {
142
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
143
+ },
144
+ "type": [
145
+ "null",
146
+ "string"
147
+ ],
148
+ "name": "schemaField",
149
+ "default": null,
150
+ "doc": "The dataset schema field urn this file is referenced by",
151
+ "Urn": "Urn",
152
+ "entityTypes": [
153
+ "schemaField"
154
+ ]
155
+ },
156
+ {
157
+ "Searchable": {
158
+ "/actor": {
159
+ "fieldName": "createdBy",
160
+ "fieldType": "URN"
161
+ },
162
+ "/time": {
163
+ "fieldName": "createdAt",
164
+ "fieldType": "DATETIME"
165
+ }
166
+ },
167
+ "type": {
168
+ "type": "record",
169
+ "name": "AuditStamp",
170
+ "namespace": "com.linkedin.pegasus2avro.common",
171
+ "fields": [
172
+ {
173
+ "type": "long",
174
+ "name": "time",
175
+ "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
176
+ },
177
+ {
178
+ "java": {
179
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
180
+ },
181
+ "type": "string",
182
+ "name": "actor",
183
+ "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
184
+ "Urn": "Urn"
185
+ },
186
+ {
187
+ "java": {
188
+ "class": "com.linkedin.pegasus2avro.common.urn.Urn"
189
+ },
190
+ "type": [
191
+ "null",
192
+ "string"
193
+ ],
194
+ "name": "impersonator",
195
+ "default": null,
196
+ "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
197
+ "Urn": "Urn"
198
+ },
199
+ {
200
+ "type": [
201
+ "null",
202
+ "string"
203
+ ],
204
+ "name": "message",
205
+ "default": null,
206
+ "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
207
+ }
208
+ ],
209
+ "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
210
+ },
211
+ "name": "created",
212
+ "doc": "Timestamp when this file was created and by whom"
213
+ },
214
+ {
215
+ "Searchable": {
216
+ "fieldType": "KEYWORD"
217
+ },
218
+ "type": [
219
+ "null",
220
+ "string"
221
+ ],
222
+ "name": "contentHash",
223
+ "default": null,
224
+ "doc": "SHA-256 hash of file contents"
225
+ }
226
+ ],
227
+ "doc": "Information about a DataHub file - a file stored in S3 for use within DataHub platform features like documentation, home pages, and announcements."
228
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "type": "record",
3
+ "Aspect": {
4
+ "name": "dataHubFileKey",
5
+ "keyForEntity": "dataHubFile",
6
+ "entityCategory": "core",
7
+ "entityAspects": [
8
+ "dataHubFileInfo"
9
+ ]
10
+ },
11
+ "name": "DataHubFileKey",
12
+ "namespace": "com.linkedin.pegasus2avro.metadata.key",
13
+ "fields": [
14
+ {
15
+ "type": "string",
16
+ "name": "id",
17
+ "doc": "Unique id for the file."
18
+ }
19
+ ],
20
+ "doc": "Key for a DataHubFile"
21
+ }
@@ -26,6 +26,7 @@
26
26
  "HIERARCHY": "A module displaying a hierarchy to navigate",
27
27
  "LINK": "Link type module",
28
28
  "OWNED_ASSETS": "Module displaying assets owned by a user",
29
+ "PLATFORMS": "Module displaying the platforms in an instance",
29
30
  "RELATED_TERMS": "Module displaying the related terms of a given glossary term",
30
31
  "RICH_TEXT": "Module containing rich text to be rendered"
31
32
  },
@@ -41,7 +42,8 @@
41
42
  "ASSETS",
42
43
  "CHILD_HIERARCHY",
43
44
  "DATA_PRODUCTS",
44
- "RELATED_TERMS"
45
+ "RELATED_TERMS",
46
+ "PLATFORMS"
45
47
  ],
46
48
  "doc": "Enum containing the types of page modules that there are"
47
49
  },
@@ -1340,11 +1340,25 @@ class SqlParsingAggregator(Closeable):
1340
1340
  upstreams.setdefault(upstream, query.query_id)
1341
1341
 
1342
1342
  for lineage_info in query.column_lineage:
1343
- for upstream_ref in lineage_info.upstreams:
1344
- cll[lineage_info.downstream.column].setdefault(
1345
- SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
1346
- query.query_id,
1343
+ if (
1344
+ not lineage_info.downstream.column
1345
+ or not lineage_info.downstream.column.strip()
1346
+ ):
1347
+ logger.debug(
1348
+ f"Skipping lineage entry with empty downstream column in query {query.query_id}"
1347
1349
  )
1350
+ continue
1351
+
1352
+ for upstream_ref in lineage_info.upstreams:
1353
+ if upstream_ref.column and upstream_ref.column.strip():
1354
+ cll[lineage_info.downstream.column].setdefault(
1355
+ SchemaFieldUrn(upstream_ref.table, upstream_ref.column),
1356
+ query.query_id,
1357
+ )
1358
+ else:
1359
+ logger.debug(
1360
+ f"Skipping empty column reference in lineage for query {query.query_id}"
1361
+ )
1348
1362
 
1349
1363
  # Finally, we can build our lineage edge.
1350
1364
  required_queries = OrderedSet[QueryId]()