acryl-datahub 0.15.0rc5__py3-none-any.whl → 0.15.0rc7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (43)
  1. {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/METADATA +2456 -2426
  2. {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/RECORD +43 -41
  3. {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/entry_points.txt +1 -0
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  6. datahub/cli/put_cli.py +1 -1
  7. datahub/cli/specific/dataproduct_cli.py +1 -1
  8. datahub/emitter/mcp_patch_builder.py +43 -0
  9. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  10. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  11. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  12. datahub/ingestion/source/common/subtypes.py +2 -0
  13. datahub/ingestion/source/csv_enricher.py +1 -1
  14. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  15. datahub/ingestion/source/dremio/dremio_api.py +11 -0
  16. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  17. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  18. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  19. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  20. datahub/ingestion/source/elastic_search.py +1 -1
  21. datahub/ingestion/source/gc/dataprocess_cleanup.py +6 -1
  22. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +1 -1
  23. datahub/ingestion/source/ge_data_profiler.py +23 -1
  24. datahub/ingestion/source/neo4j/__init__.py +0 -0
  25. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  26. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  27. datahub/ingestion/source/redshift/redshift.py +1 -0
  28. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +1 -0
  29. datahub/ingestion/source/sql/athena.py +46 -22
  30. datahub/ingestion/source/sql/sql_types.py +85 -8
  31. datahub/ingestion/source/unity/proxy_types.py +1 -0
  32. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  33. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  34. datahub/integrations/assertion/common.py +1 -1
  35. datahub/lite/duckdb_lite.py +12 -17
  36. datahub/specific/chart.py +0 -39
  37. datahub/specific/dashboard.py +0 -39
  38. datahub/specific/datajob.py +3 -47
  39. datahub/utilities/urn_encoder.py +2 -1
  40. datahub/utilities/urns/_urn_base.py +1 -1
  41. datahub/utilities/urns/structured_properties_urn.py +1 -1
  42. {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/WHEEL +0 -0
  43. {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/top_level.txt +0 -0
@@ -774,3 +774,14 @@ class DremioAPIOperations:
                 containers.extend(future.result())
 
         return containers
+
+    def get_context_for_vds(self, resource_id: str) -> str:
+        context_array = self.get(
+            url=f"/catalog/{resource_id}",
+        ).get("sqlContext")
+        if context_array:
+            return ".".join(
+                f'"{part}"' if "." in part else f"{part}" for part in context_array
+            )
+        else:
+            return ""
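The helper added above joins a view's `sqlContext` array into a dotted schema path, quoting only segments that themselves contain a dot. A minimal standalone sketch of that quoting rule, using a hypothetical `sqlContext` value:

    # Hypothetical sqlContext value returned by GET /catalog/{resource_id} for a view.
    context_array = ["my.space", "analytics", "curated"]

    # Same joining rule as get_context_for_vds: quote only parts that contain a dot.
    default_schema = ".".join(
        f'"{part}"' if "." in part else f"{part}" for part in context_array
    )
    print(default_schema)  # '"my.space".analytics.curated'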
@@ -142,6 +142,7 @@ class DremioAspects:
         platform: str,
         ui_url: str,
         env: str,
+        ingest_owner: bool,
         domain: Optional[str] = None,
         platform_instance: Optional[str] = None,
     ):
@@ -150,6 +151,7 @@ class DremioAspects:
         self.env = env
         self.domain = domain
         self.ui_url = ui_url
+        self.ingest_owner = ingest_owner
 
     def get_container_key(
         self, name: Optional[str], path: Optional[List[str]]
@@ -426,21 +428,23 @@ class DremioAspects:
         return f'{self.ui_url}/{container_type}/{dataset_url_path}"{dataset.resource_name}"'
 
     def _create_ownership(self, dataset: DremioDataset) -> Optional[OwnershipClass]:
-        if not dataset.owner:
-            return None
-        owner = (
-            make_user_urn(dataset.owner)
-            if dataset.owner_type == "USER"
-            else make_group_urn(dataset.owner)
-        )
-        return OwnershipClass(
-            owners=[
-                OwnerClass(
-                    owner=owner,
-                    type=OwnershipTypeClass.TECHNICAL_OWNER,
-                )
-            ]
-        )
+        if self.ingest_owner and dataset.owner:
+            owner_urn = (
+                make_user_urn(dataset.owner)
+                if dataset.owner_type == "USER"
+                else make_group_urn(dataset.owner)
+            )
+            ownership: OwnershipClass = OwnershipClass(
+                owners=[
+                    OwnerClass(
+                        owner=owner_urn,
+                        type=OwnershipTypeClass.TECHNICAL_OWNER,
+                    )
+                ]
+            )
+            return ownership
+
+        return None
 
     def _create_glossary_terms(self, entity: DremioDataset) -> GlossaryTermsClass:
         return GlossaryTermsClass(
@@ -174,3 +174,8 @@ class DremioSourceConfig(
         default=False,
         description="Whether to include query-based lineage information.",
     )
+
+    ingest_owner: bool = Field(
+        default=True,
+        description="Ingest Owner from source. This will override Owner info entered from UI",
+    )
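The flag defaults to true, so existing recipes keep ingesting Dremio owners unless it is switched off. A sketch of a recipe-style source config with the new flag disabled, shown as the equivalent Python dict (connection settings are omitted placeholders):

    # Hypothetical Dremio source config fragment; only the new option is shown.
    dremio_source_config = {
        # ... connection settings (host, credentials, etc.) ...
        "ingest_owner": False,  # skip source owners so owners entered in the DataHub UI are preserved
    }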
@@ -200,6 +200,7 @@ class DremioDataset:
     columns: List[DremioDatasetColumn]
     sql_definition: Optional[str]
     dataset_type: DremioDatasetType
+    default_schema: Optional[str]
     owner: Optional[str]
     owner_type: Optional[str]
     created: str
@@ -235,6 +236,9 @@ class DremioDataset:
 
         if self.sql_definition:
            self.dataset_type = DremioDatasetType.VIEW
+            self.default_schema = api_operations.get_context_for_vds(
+                resource_id=self.resource_id
+            )
         else:
            self.dataset_type = DremioDatasetType.TABLE
 
@@ -97,6 +97,7 @@ class DremioSource(StatefulIngestionSourceBase):
     - Ownership and Glossary Terms:
         - Metadata related to ownership of datasets, extracted from Dremio’s ownership model.
         - Glossary terms and business metadata associated with datasets, providing additional context to the data.
+        - Note: Ownership information will only be available for the Cloud and Enterprise editions, it will not be available for the Community edition.
 
     - Optional SQL Profiling (if enabled):
         - Table, row, and column statistics can be profiled and ingested via optional SQL queries.
@@ -123,6 +124,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.dremio_aspects = DremioAspects(
             platform=self.get_platform(),
             domain=self.config.domain,
+            ingest_owner=self.config.ingest_owner,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
             ui_url=dremio_api.ui_url,
@@ -394,10 +396,12 @@ class DremioSource(StatefulIngestionSourceBase):
         ):
             yield dremio_mcp
             # Check if the emitted aspect is SchemaMetadataClass
-            if isinstance(dremio_mcp.metadata, SchemaMetadataClass):
+            if isinstance(
+                dremio_mcp.metadata, MetadataChangeProposalWrapper
+            ) and isinstance(dremio_mcp.metadata.aspect, SchemaMetadataClass):
                 self.sql_parsing_aggregator.register_schema(
                     urn=dataset_urn,
-                    schema=dremio_mcp.metadata,
+                    schema=dremio_mcp.metadata.aspect,
                 )
 
         if dataset_info.dataset_type == DremioDatasetType.VIEW:
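The corrected check accounts for the work unit's `metadata` being a `MetadataChangeProposalWrapper` whose `aspect` carries the schema, rather than the schema aspect itself. A minimal sketch of the same unwrap-then-check pattern in isolation:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import SchemaMetadataClass

    def extract_schema_aspect(metadata):
        # Mirror the fixed condition: unwrap the MCP wrapper before inspecting the aspect.
        if isinstance(metadata, MetadataChangeProposalWrapper) and isinstance(
            metadata.aspect, SchemaMetadataClass
        ):
            return metadata.aspect
        return None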
@@ -415,6 +419,7 @@ class DremioSource(StatefulIngestionSourceBase):
                 view_urn=dataset_urn,
                 view_definition=dataset_info.sql_definition,
                 default_db=self.default_db,
+                default_schema=dataset_info.default_schema,
             )
 
         elif dataset_info.dataset_type == DremioDatasetType.TABLE:
@@ -227,7 +227,7 @@ def collapse_name(name: str, collapse_urns: CollapseUrns) -> str:
 def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     if len(collapse_urns.urns_suffix_regex) == 0:
         return urn
-    urn_obj = DatasetUrn.create_from_string(urn)
+    urn_obj = DatasetUrn.from_string(urn)
     name = collapse_name(name=urn_obj.get_dataset_name(), collapse_urns=collapse_urns)
     data_platform_urn = urn_obj.get_data_platform_urn()
     return str(
@@ -277,7 +277,12 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
-        dpis.sort(key=lambda x: x["created"]["time"], reverse=True)
+        dpis.sort(
+            key=lambda x: x["created"]["time"]
+            if x["created"] and x["created"]["time"]
+            else 0,
+            reverse=True,
+        )
 
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             if self.config.keep_last_n:
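The sort key now tolerates DPIs whose `created` audit stamp is missing, instead of raising on `None`. A standalone sketch with hypothetical records:

    # Hypothetical DPI records as returned by fetch_dpis; one lacks a usable "created" stamp.
    dpis = [
        {"urn": "urn:li:dataProcessInstance:a", "created": {"time": 1700000000000}},
        {"urn": "urn:li:dataProcessInstance:b", "created": None},
        {"urn": "urn:li:dataProcessInstance:c", "created": {"time": 1710000000000}},
    ]

    # Same guard as the patched sort: fall back to 0 when the timestamp is absent.
    dpis.sort(
        key=lambda x: x["created"]["time"] if x["created"] and x["created"]["time"] else 0,
        reverse=True,
    )
    print([d["urn"] for d in dpis])  # newest first; the record without a timestamp sorts last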
@@ -104,7 +104,7 @@ class SoftDeletedEntitiesCleanup:
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
-        entity_urn = Urn.create_from_string(urn)
+        entity_urn = Urn.from_string(urn)
         self.report.num_soft_deleted_entity_removed += 1
         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
             self.report.num_soft_deleted_entity_removed_by_type.get(
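Both cleanup sources and the dbt urn collapsing above move from the older `create_from_string` name to `from_string`. A quick usage sketch with a placeholder urn (import paths assumed from the urns package touched in this release):

    from datahub.utilities.urns.dataset_urn import DatasetUrn
    from datahub.utilities.urns.urn import Urn

    # Placeholder urn purely for illustration.
    urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"

    entity_urn = Urn.from_string(urn)          # generic parse, replaces Urn.create_from_string
    dataset_urn = DatasetUrn.from_string(urn)  # typed parse, as used in collapse_urn
    print(entity_urn.entity_type, dataset_urn.get_dataset_name())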
@@ -57,7 +57,11 @@ from datahub.ingestion.source.profiling.common import (
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -361,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str
 
+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []
@@ -374,6 +380,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 
         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
@@ -430,6 +437,21 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             self.dataset, column
         )
 
+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str
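In effect, when a column's profiler type resolves to UNKNOWN, the profiler retries with the raw SQL type string captured earlier plus the active dialect name, and treats the column as numeric only if the resolved DataHub type is a NumberType. A hedged sketch of that fallback in isolation (the type string and dialect are example inputs, and the result may be None):

    from datahub.ingestion.source.sql.sql_types import resolve_sql_type
    from datahub.metadata.com.linkedin.pegasus2avro.schema import NumberType

    native_type = "decimal(38,9)"   # example native column type string
    dialect = "athena"              # example SQLAlchemy dialect name

    resolved = resolve_sql_type(native_type, dialect)
    if resolved is not None and isinstance(resolved, NumberType):
        print("profile this column as NUMERIC")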
File without changes
@@ -0,0 +1,331 @@
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import pandas as pd
+from neo4j import GraphDatabase
+from pydantic.fields import Field
+
+from datahub.configuration.source_common import EnvConfigMixin
+from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    BooleanTypeClass,
+    DatasetPropertiesClass,
+    DateTypeClass,
+    NullTypeClass,
+    NumberTypeClass,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+    StringTypeClass,
+    SubTypesClass,
+    UnionTypeClass,
+)
+
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+_type_mapping: Dict[Union[Type, str], Type] = {
+    "list": UnionTypeClass,
+    "boolean": BooleanTypeClass,
+    "integer": NumberTypeClass,
+    "local_date_time": DateTypeClass,
+    "float": NumberTypeClass,
+    "string": StringTypeClass,
+    "date": DateTypeClass,
+    "node": StringTypeClass,
+    "relationship": StringTypeClass,
+}
+
+
+class Neo4jConfig(EnvConfigMixin):
+    username: str = Field(description="Neo4j Username")
+    password: str = Field(description="Neo4j Password")
+    uri: str = Field(description="The URI for the Neo4j server")
+    env: str = Field(description="Neo4j env")
+
+
+@dataclass
+class Neo4jSourceReport(SourceReport):
+    obj_failures: int = 0
+    obj_created: int = 0
+
+
+@platform_name("Neo4j", id="neo4j")
+@config_class(Neo4jConfig)
+@support_status(SupportStatus.CERTIFIED)
+class Neo4jSource(Source):
+    NODE = "node"
+    RELATIONSHIP = "relationship"
+    PLATFORM = "neo4j"
+
+    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = Neo4jSourceReport()
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = Neo4jConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
+        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
+        return SchemaFieldDataType(type=type_class())
+
+    def get_schema_field_class(
+        self, col_name: str, col_type: str, **kwargs: Any
+    ) -> SchemaFieldClass:
+        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
+            col_type = self.NODE
+        else:
+            col_type = col_type
+        return SchemaFieldClass(
+            fieldPath=col_name,
+            type=self.get_field_type(col_type),
+            nativeDataType=col_type,
+            description=col_type.upper()
+            if col_type in (self.NODE, self.RELATIONSHIP)
+            else col_type,
+            lastModified=AuditStampClass(
+                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
+            ),
+        )
+
+    def add_properties(
+        self,
+        dataset: str,
+        description: Optional[str] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> MetadataChangeProposalWrapper:
+        dataset_properties = DatasetPropertiesClass(
+            description=description,
+            customProperties=custom_properties,
+        )
+        return MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(
+                platform=self.PLATFORM, name=dataset, env=self.config.env
+            ),
+            aspect=dataset_properties,
+        )
+
+    def generate_neo4j_object(
+        self, dataset: str, columns: list, obj_type: Optional[str] = None
+    ) -> MetadataChangeProposalWrapper:
+        try:
+            fields = [
+                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+                for d in columns
+                for key, value in d.items()
+            ]
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn(
+                    platform=self.PLATFORM, name=dataset, env=self.config.env
+                ),
+                aspect=SchemaMetadataClass(
+                    schemaName=dataset,
+                    platform=make_data_platform_urn(self.PLATFORM),
+                    version=0,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                    lastModified=AuditStampClass(
+                        time=round(time.time() * 1000),
+                        actor="urn:li:corpuser:ingestion",
+                    ),
+                    fields=fields,
+                ),
+            )
+            self.report.obj_created += 1
+        except Exception as e:
+            log.error(e)
+            self.report.obj_failures += 1
+        return mcp
+
+    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+        driver = GraphDatabase.driver(
+            self.config.uri, auth=(self.config.username, self.config.password)
+        )
+        """
+        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
+        with two columns: key and value. The key represents the Neo4j object, while the value contains the
+        corresponding metadata.
+
+        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
+        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
+        relationships.
+
+        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
+        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+
+        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
+        single dataframe, which will be used to create the DataHub objects.
+
+        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+        """
+        try:
+            log.info(f"{query}")
+            with driver.session() as session:
+                result = session.run(query)
+                data = [record for record in result]
+                log.info("Closing Neo4j driver")
+                driver.close()
+
+                node_df = self.process_nodes(data)
+                rel_df = self.process_relationships(data, node_df)
+
+                union_cols = ["key", "obj_type", "property_data_types", "description"]
+                df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+        except Exception as e:
+            self.report.failure(
+                message="Failed to get neo4j metadata",
+                exc=e,
+            )
+
+        return df
+
+    def process_nodes(self, data: list) -> pd.DataFrame:
+        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        node_df = pd.DataFrame(
+            nodes,
+            columns=["key", "value"],
+        )
+        node_df["obj_type"] = node_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        node_df["relationships"] = node_df["value"].apply(
+            lambda record: self.get_relationships(record)
+        )
+        node_df["properties"] = node_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        node_df["property_data_types"] = node_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        node_df["description"] = node_df.apply(
+            lambda record: self.get_node_description(record, node_df), axis=1
+        )
+        return node_df
+
+    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
+        rels = [
+            record for record in data if record["value"]["type"] == self.RELATIONSHIP
+        ]
+        rel_df = pd.DataFrame(rels, columns=["key", "value"])
+        rel_df["obj_type"] = rel_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        rel_df["properties"] = rel_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        rel_df["property_data_types"] = rel_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        rel_df["description"] = rel_df.apply(
+            lambda record: self.get_rel_descriptions(record, node_df), axis=1
+        )
+        return rel_df
+
+    def get_obj_type(self, record: dict) -> str:
+        return record["type"]
+
+    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            relationships = row.get("relationships", {})
+            for relationship, props in relationships.items():
+                if record["key"] == relationship:
+                    if props["direction"] == "in":
+                        for prop in props["labels"]:
+                            descriptions.append(
+                                f"({row['key']})-[{record['key']}]->({prop})"
+                            )
+        return "\n".join(descriptions)
+
+    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            if record["key"] == row["key"]:
+                for relationship, props in row["relationships"].items():
+                    direction = props["direction"]
+                    for node in set(props["labels"]):
+                        if direction == "in":
+                            descriptions.append(
+                                f"({row['key']})<-[{relationship}]-({node})"
+                            )
+                        elif direction == "out":
+                            descriptions.append(
+                                f"({row['key']})-[{relationship}]->({node})"
+                            )
+
+        return "\n".join(descriptions)
+
+    def get_property_data_types(self, record: dict) -> List[dict]:
+        return [{k: v["type"]} for k, v in record.items()]
+
+    def get_properties(self, record: dict) -> str:
+        return record["properties"]
+
+    def get_relationships(self, record: dict) -> dict:
+        return record.get("relationships", None)
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        df = self.get_neo4j_metadata(
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+        )
+        for index, row in df.iterrows():
+            try:
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.generate_neo4j_object(
+                        columns=row["property_data_types"],
+                        dataset=row["key"],
+                    ),
+                    is_primary_source=True,
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=make_dataset_urn(
+                            platform=self.PLATFORM,
+                            name=row["key"],
+                            env=self.config.env,
+                        ),
+                        aspect=SubTypesClass(
+                            typeNames=[
+                                DatasetSubTypes.NEO4J_NODE
+                                if row["obj_type"] == self.NODE
+                                else DatasetSubTypes.NEO4J_RELATIONSHIP
+                            ]
+                        ),
+                    ),
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.add_properties(
+                        dataset=row["key"],
+                        custom_properties=None,
+                        description=row["description"],
+                    ),
+                )
+
+            except Exception as e:
+                raise e
+
+    def get_report(self):
+        return self.report
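The new connector reads `CALL apoc.meta.schema()` output, so the target Neo4j instance needs the APOC plugin. A minimal sketch of instantiating the source directly with placeholder connection values (module path taken from the files-changed list above):

    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.source.neo4j.neo4j_source import Neo4jSource

    # Placeholder connection details; point these at a real Neo4j instance.
    config_dict = {
        "uri": "bolt://localhost:7687",
        "username": "neo4j",
        "password": "example-password",
        "env": "PROD",
    }

    source = Neo4jSource.create(config_dict, PipelineContext(run_id="neo4j-example"))
    for wu in source.get_workunits_internal():
        print(wu.id)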
@@ -15,6 +15,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeType,
 )
 
+# TODO: Replace with standardized types in sql_types.py
 FIELD_TYPE_MAPPING: Dict[
     str,
     Type[
@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
    ```
    """
 
+    # TODO: Replace with standardized types in sql_types.py
     REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
         str,
         Type[
@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,