acryl-datahub 0.15.0rc4__py3-none-any.whl → 0.15.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/METADATA +2504 -2474
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/RECORD +24 -22
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +11 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_source.py +3 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +21 -8
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +1 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/sql_types.py +72 -7
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/utilities/urn_encoder.py +2 -1
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc4.dist-info → acryl_datahub-0.15.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/neo4j/neo4j_source.py ADDED

@@ -0,0 +1,331 @@
import logging
import time
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Type, Union

import pandas as pd
from neo4j import GraphDatabase
from pydantic.fields import Field

from datahub.configuration.source_common import EnvConfigMixin
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SupportStatus,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
from datahub.metadata.schema_classes import (
    AuditStampClass,
    BooleanTypeClass,
    DatasetPropertiesClass,
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    OtherSchemaClass,
    SchemaFieldClass,
    SchemaMetadataClass,
    StringTypeClass,
    SubTypesClass,
    UnionTypeClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

_type_mapping: Dict[Union[Type, str], Type] = {
    "list": UnionTypeClass,
    "boolean": BooleanTypeClass,
    "integer": NumberTypeClass,
    "local_date_time": DateTypeClass,
    "float": NumberTypeClass,
    "string": StringTypeClass,
    "date": DateTypeClass,
    "node": StringTypeClass,
    "relationship": StringTypeClass,
}


class Neo4jConfig(EnvConfigMixin):
    username: str = Field(description="Neo4j Username")
    password: str = Field(description="Neo4j Password")
    uri: str = Field(description="The URI for the Neo4j server")
    env: str = Field(description="Neo4j env")


@dataclass
class Neo4jSourceReport(SourceReport):
    obj_failures: int = 0
    obj_created: int = 0


@platform_name("Neo4j", id="neo4j")
@config_class(Neo4jConfig)
@support_status(SupportStatus.CERTIFIED)
class Neo4jSource(Source):
    NODE = "node"
    RELATIONSHIP = "relationship"
    PLATFORM = "neo4j"

    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
        self.ctx = ctx
        self.config = config
        self.report = Neo4jSourceReport()

    @classmethod
    def create(cls, config_dict, ctx):
        config = Neo4jConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
        return SchemaFieldDataType(type=type_class())

    def get_schema_field_class(
        self, col_name: str, col_type: str, **kwargs: Any
    ) -> SchemaFieldClass:
        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
            col_type = self.NODE
        else:
            col_type = col_type
        return SchemaFieldClass(
            fieldPath=col_name,
            type=self.get_field_type(col_type),
            nativeDataType=col_type,
            description=col_type.upper()
            if col_type in (self.NODE, self.RELATIONSHIP)
            else col_type,
            lastModified=AuditStampClass(
                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
            ),
        )

    def add_properties(
        self,
        dataset: str,
        description: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
    ) -> MetadataChangeProposalWrapper:
        dataset_properties = DatasetPropertiesClass(
            description=description,
            customProperties=custom_properties,
        )
        return MetadataChangeProposalWrapper(
            entityUrn=make_dataset_urn(
                platform=self.PLATFORM, name=dataset, env=self.config.env
            ),
            aspect=dataset_properties,
        )

    def generate_neo4j_object(
        self, dataset: str, columns: list, obj_type: Optional[str] = None
    ) -> MetadataChangeProposalWrapper:
        try:
            fields = [
                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
                for d in columns
                for key, value in d.items()
            ]
            mcp = MetadataChangeProposalWrapper(
                entityUrn=make_dataset_urn(
                    platform=self.PLATFORM, name=dataset, env=self.config.env
                ),
                aspect=SchemaMetadataClass(
                    schemaName=dataset,
                    platform=make_data_platform_urn(self.PLATFORM),
                    version=0,
                    hash="",
                    platformSchema=OtherSchemaClass(rawSchema=""),
                    lastModified=AuditStampClass(
                        time=round(time.time() * 1000),
                        actor="urn:li:corpuser:ingestion",
                    ),
                    fields=fields,
                ),
            )
            self.report.obj_created += 1
        except Exception as e:
            log.error(e)
            self.report.obj_failures += 1
        return mcp

    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
        driver = GraphDatabase.driver(
            self.config.uri, auth=(self.config.username, self.config.password)
        )
        """
        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
        with two columns: key and value. The key represents the Neo4j object, while the value contains the
        corresponding metadata.

        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
        relationships.

        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.

        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
        single dataframe, which will be used to create the DataHub objects.

        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
        """
        try:
            log.info(f"{query}")
            with driver.session() as session:
                result = session.run(query)
                data = [record for record in result]
                log.info("Closing Neo4j driver")
                driver.close()

                node_df = self.process_nodes(data)
                rel_df = self.process_relationships(data, node_df)

                union_cols = ["key", "obj_type", "property_data_types", "description"]
                df = pd.concat([node_df[union_cols], rel_df[union_cols]])
        except Exception as e:
            self.report.failure(
                message="Failed to get neo4j metadata",
                exc=e,
            )

        return df

    def process_nodes(self, data: list) -> pd.DataFrame:
        nodes = [record for record in data if record["value"]["type"] == self.NODE]
        node_df = pd.DataFrame(
            nodes,
            columns=["key", "value"],
        )
        node_df["obj_type"] = node_df["value"].apply(
            lambda record: self.get_obj_type(record)
        )
        node_df["relationships"] = node_df["value"].apply(
            lambda record: self.get_relationships(record)
        )
        node_df["properties"] = node_df["value"].apply(
            lambda record: self.get_properties(record)
        )
        node_df["property_data_types"] = node_df["properties"].apply(
            lambda record: self.get_property_data_types(record)
        )
        node_df["description"] = node_df.apply(
            lambda record: self.get_node_description(record, node_df), axis=1
        )
        return node_df

    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
        rels = [
            record for record in data if record["value"]["type"] == self.RELATIONSHIP
        ]
        rel_df = pd.DataFrame(rels, columns=["key", "value"])
        rel_df["obj_type"] = rel_df["value"].apply(
            lambda record: self.get_obj_type(record)
        )
        rel_df["properties"] = rel_df["value"].apply(
            lambda record: self.get_properties(record)
        )
        rel_df["property_data_types"] = rel_df["properties"].apply(
            lambda record: self.get_property_data_types(record)
        )
        rel_df["description"] = rel_df.apply(
            lambda record: self.get_rel_descriptions(record, node_df), axis=1
        )
        return rel_df

    def get_obj_type(self, record: dict) -> str:
        return record["type"]

    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
        descriptions = []
        for _, row in df.iterrows():
            relationships = row.get("relationships", {})
            for relationship, props in relationships.items():
                if record["key"] == relationship:
                    if props["direction"] == "in":
                        for prop in props["labels"]:
                            descriptions.append(
                                f"({row['key']})-[{record['key']}]->({prop})"
                            )
        return "\n".join(descriptions)

    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
        descriptions = []
        for _, row in df.iterrows():
            if record["key"] == row["key"]:
                for relationship, props in row["relationships"].items():
                    direction = props["direction"]
                    for node in set(props["labels"]):
                        if direction == "in":
                            descriptions.append(
                                f"({row['key']})<-[{relationship}]-({node})"
                            )
                        elif direction == "out":
                            descriptions.append(
                                f"({row['key']})-[{relationship}]->({node})"
                            )

        return "\n".join(descriptions)

    def get_property_data_types(self, record: dict) -> List[dict]:
        return [{k: v["type"]} for k, v in record.items()]

    def get_properties(self, record: dict) -> str:
        return record["properties"]

    def get_relationships(self, record: dict) -> dict:
        return record.get("relationships", None)

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        df = self.get_neo4j_metadata(
            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
        )
        for index, row in df.iterrows():
            try:
                yield MetadataWorkUnit(
                    id=row["key"],
                    mcp=self.generate_neo4j_object(
                        columns=row["property_data_types"],
                        dataset=row["key"],
                    ),
                    is_primary_source=True,
                )

                yield MetadataWorkUnit(
                    id=row["key"],
                    mcp=MetadataChangeProposalWrapper(
                        entityUrn=make_dataset_urn(
                            platform=self.PLATFORM,
                            name=row["key"],
                            env=self.config.env,
                        ),
                        aspect=SubTypesClass(
                            typeNames=[
                                DatasetSubTypes.NEO4J_NODE
                                if row["obj_type"] == self.NODE
                                else DatasetSubTypes.NEO4J_RELATIONSHIP
                            ]
                        ),
                    ),
                )

                yield MetadataWorkUnit(
                    id=row["key"],
                    mcp=self.add_properties(
                        dataset=row["key"],
                        custom_properties=None,
                        description=row["description"],
                    ),
                )

            except Exception as e:
                raise e

    def get_report(self):
        return self.report
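For orientation, here is a minimal usage sketch (not part of the diff) showing how the new source could be created and run programmatically. The URI and credentials are placeholders, and iterating the work units requires a reachable Neo4j instance with the APOC procedures installed, since the source runs `CALL apoc.meta.schema()`.

# Minimal sketch, assuming a reachable Neo4j instance with APOC installed.
# The connection details below are placeholders, not values from the release.
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.neo4j.neo4j_source import Neo4jSource

source = Neo4jSource.create(
    config_dict={
        "uri": "bolt://localhost:7687",  # placeholder
        "username": "neo4j",             # placeholder
        "password": "password",          # placeholder
        "env": "PROD",
    },
    ctx=PipelineContext(run_id="neo4j-metadata-demo"),
)

# Each node label / relationship type becomes a dataset with schema,
# subtype, and dataset-properties aspects.
for wu in source.get_workunits_internal():
    print(wu.id)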
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED

@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
datahub/ingestion/source/sql/athena.py CHANGED

@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
         - Profiling when enabled.
     """
 
-
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
        super().__init__(config, ctx, "athena")
        self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-
-
-
-
-
-        return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-
-
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
            max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
            ret = self.cursor.execute(max_partition_query)
            max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                partitions=partitions,
                max_partition=max_partition,
            )
-            return partitions
 
-        return
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
             return (
                 max_partition,
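To make the new partition handling concrete, the following sketch (not part of the diff) reproduces the string-building logic with a hypothetical table partitioned by year and month; the schema, table, and key names are made up, and only the formatting shown above is exercised, not a live Athena connection.

# Illustrative only: hypothetical schema, table, and partition keys.
def _casted_partition_key(key: str) -> str:
    # Same casting rule as AthenaSource._casted_partition_key above.
    return f"CAST({key} as VARCHAR)"

schema, table = "web", "events"   # hypothetical
partitions = ["year", "month"]    # hypothetical partition keys

# Concatenated, VARCHAR-cast partition key used to find the max partition.
part_concat = " || '-' || ".join(_casted_partition_key(k) for k in partitions)
max_partition_query = (
    f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
    f"where {part_concat} = (select max({part_concat}) from "
    f'"{schema}"."{table}$partitions")'
)

print(part_concat)
# CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)
print(max_partition_query)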
datahub/ingestion/source/sql/sql_types.py CHANGED

@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-
-
-
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-#
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -430,3 +444,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = _merged_mapping.get(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
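A brief, hedged sketch of how the new resolve_sql_type helper behaves, using only keys that are visible in the mapping above; the commented results follow directly from that code.

# Minimal sketch of the new helper; only keys visible in the diff are used.
from datahub.ingestion.source.sql.sql_types import resolve_sql_type

print(type(resolve_sql_type("boolean")))      # BooleanType, via _merged_mapping
print(type(resolve_sql_type("struct")))       # RecordType, via _merged_mapping
print(resolve_sql_type(None))                 # None: no column type given
print(resolve_sql_type("not-a-real-type"))    # None: falls through every branch
# Platform hints only kick in for types missing from the merged mapping, e.g.
# resolve_sql_type("varchar(16)", platform="trino") delegates to the existing
# resolve_trino_modified_type resolver (behaviour depends on that resolver).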
datahub/ingestion/source/unity/proxy_types.py CHANGED

@@ -33,6 +33,7 @@ from datahub.metadata.schema_classes import (
 
 logger = logging.getLogger(__name__)
 
+# TODO: (maybe) Replace with standardized types in sql_types.py
 DATA_TYPE_REGISTRY: dict = {
     ColumnTypeName.BOOLEAN: BooleanTypeClass,
     ColumnTypeName.BYTE: BytesTypeClass,
datahub/utilities/urn_encoder.py CHANGED

@@ -4,7 +4,8 @@ from typing import List
 # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage.
 # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts
 # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes.
-
+# Also see https://datahubproject.io/docs/what/urn/#restrictions
+RESERVED_CHARS = {",", "(", ")", "␟"}
 RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"})
 
 
File without changes

File without changes