acryl-datahub 0.15.0rc4__py3-none-any.whl → 0.15.0rc6__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in that registry.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -0,0 +1,331 @@
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import pandas as pd
+from neo4j import GraphDatabase
+from pydantic.fields import Field
+
+from datahub.configuration.source_common import EnvConfigMixin
+from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    BooleanTypeClass,
+    DatasetPropertiesClass,
+    DateTypeClass,
+    NullTypeClass,
+    NumberTypeClass,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+    StringTypeClass,
+    SubTypesClass,
+    UnionTypeClass,
+)
+
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+_type_mapping: Dict[Union[Type, str], Type] = {
+    "list": UnionTypeClass,
+    "boolean": BooleanTypeClass,
+    "integer": NumberTypeClass,
+    "local_date_time": DateTypeClass,
+    "float": NumberTypeClass,
+    "string": StringTypeClass,
+    "date": DateTypeClass,
+    "node": StringTypeClass,
+    "relationship": StringTypeClass,
+}
+
+
+class Neo4jConfig(EnvConfigMixin):
+    username: str = Field(description="Neo4j Username")
+    password: str = Field(description="Neo4j Password")
+    uri: str = Field(description="The URI for the Neo4j server")
+    env: str = Field(description="Neo4j env")
+
+
+@dataclass
+class Neo4jSourceReport(SourceReport):
+    obj_failures: int = 0
+    obj_created: int = 0
+
+
+@platform_name("Neo4j", id="neo4j")
+@config_class(Neo4jConfig)
+@support_status(SupportStatus.CERTIFIED)
+class Neo4jSource(Source):
+    NODE = "node"
+    RELATIONSHIP = "relationship"
+    PLATFORM = "neo4j"
+
+    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = Neo4jSourceReport()
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = Neo4jConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
+        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
+        return SchemaFieldDataType(type=type_class())
+
+    def get_schema_field_class(
+        self, col_name: str, col_type: str, **kwargs: Any
+    ) -> SchemaFieldClass:
+        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
+            col_type = self.NODE
+        else:
+            col_type = col_type
+        return SchemaFieldClass(
+            fieldPath=col_name,
+            type=self.get_field_type(col_type),
+            nativeDataType=col_type,
+            description=col_type.upper()
+            if col_type in (self.NODE, self.RELATIONSHIP)
+            else col_type,
+            lastModified=AuditStampClass(
+                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
+            ),
+        )
+
+    def add_properties(
+        self,
+        dataset: str,
+        description: Optional[str] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> MetadataChangeProposalWrapper:
+        dataset_properties = DatasetPropertiesClass(
+            description=description,
+            customProperties=custom_properties,
+        )
+        return MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(
+                platform=self.PLATFORM, name=dataset, env=self.config.env
+            ),
+            aspect=dataset_properties,
+        )
+
+    def generate_neo4j_object(
+        self, dataset: str, columns: list, obj_type: Optional[str] = None
+    ) -> MetadataChangeProposalWrapper:
+        try:
+            fields = [
+                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+                for d in columns
+                for key, value in d.items()
+            ]
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn(
+                    platform=self.PLATFORM, name=dataset, env=self.config.env
+                ),
+                aspect=SchemaMetadataClass(
+                    schemaName=dataset,
+                    platform=make_data_platform_urn(self.PLATFORM),
+                    version=0,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                    lastModified=AuditStampClass(
+                        time=round(time.time() * 1000),
+                        actor="urn:li:corpuser:ingestion",
+                    ),
+                    fields=fields,
+                ),
+            )
+            self.report.obj_created += 1
+        except Exception as e:
+            log.error(e)
+            self.report.obj_failures += 1
+        return mcp
+
+    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+        driver = GraphDatabase.driver(
+            self.config.uri, auth=(self.config.username, self.config.password)
+        )
+        """
+        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
+        with two columns: key and value. The key represents the Neo4j object, while the value contains the
+        corresponding metadata.
+
+        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
+        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
+        relationships.
+
+        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
+        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+
+        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
+        single dataframe, which will be used to create the DataHub objects.
+
+        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+        """
+        try:
+            log.info(f"{query}")
+            with driver.session() as session:
+                result = session.run(query)
+                data = [record for record in result]
+                log.info("Closing Neo4j driver")
+                driver.close()
+
+            node_df = self.process_nodes(data)
+            rel_df = self.process_relationships(data, node_df)
+
+            union_cols = ["key", "obj_type", "property_data_types", "description"]
+            df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+        except Exception as e:
+            self.report.failure(
+                message="Failed to get neo4j metadata",
+                exc=e,
+            )
+
+        return df
+
+    def process_nodes(self, data: list) -> pd.DataFrame:
+        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        node_df = pd.DataFrame(
+            nodes,
+            columns=["key", "value"],
+        )
+        node_df["obj_type"] = node_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        node_df["relationships"] = node_df["value"].apply(
+            lambda record: self.get_relationships(record)
+        )
+        node_df["properties"] = node_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        node_df["property_data_types"] = node_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        node_df["description"] = node_df.apply(
+            lambda record: self.get_node_description(record, node_df), axis=1
+        )
+        return node_df
+
+    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
+        rels = [
+            record for record in data if record["value"]["type"] == self.RELATIONSHIP
+        ]
+        rel_df = pd.DataFrame(rels, columns=["key", "value"])
+        rel_df["obj_type"] = rel_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        rel_df["properties"] = rel_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        rel_df["property_data_types"] = rel_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        rel_df["description"] = rel_df.apply(
+            lambda record: self.get_rel_descriptions(record, node_df), axis=1
+        )
+        return rel_df
+
+    def get_obj_type(self, record: dict) -> str:
+        return record["type"]
+
+    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            relationships = row.get("relationships", {})
+            for relationship, props in relationships.items():
+                if record["key"] == relationship:
+                    if props["direction"] == "in":
+                        for prop in props["labels"]:
+                            descriptions.append(
+                                f"({row['key']})-[{record['key']}]->({prop})"
+                            )
+        return "\n".join(descriptions)
+
+    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            if record["key"] == row["key"]:
+                for relationship, props in row["relationships"].items():
+                    direction = props["direction"]
+                    for node in set(props["labels"]):
+                        if direction == "in":
+                            descriptions.append(
+                                f"({row['key']})<-[{relationship}]-({node})"
+                            )
+                        elif direction == "out":
+                            descriptions.append(
+                                f"({row['key']})-[{relationship}]->({node})"
+                            )
+
+        return "\n".join(descriptions)
+
+    def get_property_data_types(self, record: dict) -> List[dict]:
+        return [{k: v["type"]} for k, v in record.items()]
+
+    def get_properties(self, record: dict) -> str:
+        return record["properties"]
+
+    def get_relationships(self, record: dict) -> dict:
+        return record.get("relationships", None)
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        df = self.get_neo4j_metadata(
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+        )
+        for index, row in df.iterrows():
+            try:
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.generate_neo4j_object(
+                        columns=row["property_data_types"],
+                        dataset=row["key"],
+                    ),
+                    is_primary_source=True,
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=make_dataset_urn(
+                            platform=self.PLATFORM,
+                            name=row["key"],
+                            env=self.config.env,
+                        ),
+                        aspect=SubTypesClass(
+                            typeNames=[
+                                DatasetSubTypes.NEO4J_NODE
+                                if row["obj_type"] == self.NODE
+                                else DatasetSubTypes.NEO4J_RELATIONSHIP
+                            ]
+                        ),
+                    ),
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.add_properties(
+                        dataset=row["key"],
+                        custom_properties=None,
+                        description=row["description"],
+                    ),
+                )
+
+            except Exception as e:
+                raise e
+
+    def get_report(self):
+        return self.report
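The hunk above adds the new Neo4j ingestion source. Below is a minimal usage sketch, not part of the diff: the import path is assumed from the package layout, the URI and credentials are placeholders, and a live Neo4j instance with the APOC plugin installed is required for the apoc.meta.schema query to return anything.

# Hypothetical wiring of the new source in plain Python.
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.neo4j.neo4j_source import Neo4jSource  # assumed module path

source = Neo4jSource.create(
    {
        "uri": "bolt://localhost:7687",  # placeholder
        "username": "neo4j",             # placeholder
        "password": "password",          # placeholder
        "env": "PROD",
    },
    PipelineContext(run_id="neo4j-smoke-test"),
)

# Each node label / relationship type becomes a dataset, emitted as schema,
# subtype, and description work units by get_workunits_internal().
for wu in source.get_workunits_internal():
    print(wu.id)
print(source.get_report())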
@@ -15,6 +15,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 
+# TODO: Replace with standardized types in sql_types.py
 FIELD_TYPE_MAPPING: Dict[
     str,
     Type[
@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     ```
     """
 
+    # TODO: Replace with standardized types in sql_types.py
     REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
         str,
         Type[
@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecuto
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
     - Profiling when enabled.
     """
 
-    table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "athena")
         self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
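The change above also moves table_partition_cache from a class attribute to an instance attribute created in __init__. A minimal sketch, not from the package, of why that matters: a mutable dict assigned at class scope is shared by every instance, so two sources running in one process would have seen each other's cached partitions.

# Illustration only; the class names here are invented.
class SharedCache:
    cache: dict = {}  # class attribute: one dict shared by all instances


class PerInstanceCache:
    def __init__(self) -> None:
        self.cache: dict = {}  # instance attribute: a fresh dict per instance


a, b = SharedCache(), SharedCache()
a.cache["db.table"] = "year=2024"
assert b.cache == {"db.table": "year=2024"}  # state leaks across instances

c, d = PerInstanceCache(), PerInstanceCache()
c.cache["db.table"] = "year=2024"
assert d.cache == {}  # isolated, matching the new __init__-based cache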
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-    # Overwrite to get partitions
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-        partitions = []
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return []
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-        if metadata.partition_keys:
-            for key in metadata.partition_keys:
-                if key.name:
-                    partitions.append(key.name)
-
-            if not partitions:
-                return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-            # We create an artificial concatenated partition key to be able to query max partition easier
-            part_concat = "|| '-' ||".join(partitions)
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artificial concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
             max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                 partitions=partitions,
                 max_partition=max_partition,
             )
-            return partitions
 
-        return []
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'")
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
             return (
@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-# these can be obtained by running `select format_type(oid, null),* from pg_type;`
-# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
-# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-# see https://www.npgsql.org/dev/types.html for helpful type annotations
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -430,3 +444,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = _merged_mapping.get(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
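A short usage sketch, not part of the package, of the new resolve_sql_type helper. The import path is assumed from context (the sql_types module the TODOs above point at), and the lookups only exercise _merged_mapping entries visible in this hunk.

from datahub.ingestion.source.sql.sql_types import resolve_sql_type  # assumed path
from datahub.metadata.com.linkedin.pegasus2avro.schema import BooleanType, DateType

assert isinstance(resolve_sql_type("boolean"), BooleanType)
assert isinstance(resolve_sql_type("timestamp with time zone"), DateType)
assert resolve_sql_type(None) is None            # no type string -> None
assert resolve_sql_type("made_up_type") is None  # unknown types fall through to None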
@@ -33,6 +33,7 @@ from datahub.metadata.schema_classes import (
 
 logger = logging.getLogger(__name__)
 
+# TODO: (maybe) Replace with standardized types in sql_types.py
 DATA_TYPE_REGISTRY: dict = {
     ColumnTypeName.BOOLEAN: BooleanTypeClass,
     ColumnTypeName.BYTE: BytesTypeClass,
@@ -4,7 +4,8 @@ from typing import List
 # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage.
 # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts
 # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes.
-RESERVED_CHARS = {",", "(", ")"}
+# Also see https://datahubproject.io/docs/what/urn/#restrictions
+RESERVED_CHARS = {",", "(", ")", "␟"}
 RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"})
 