acryl-datahub 0.15.0rc5__py3-none-any.whl → 0.15.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/METADATA +2456 -2426
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/RECORD +43 -41
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +11 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +6 -1
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +1 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +3 -47
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc5.dist-info → acryl_datahub-0.15.0rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py
@@ -774,3 +774,14 @@ class DremioAPIOperations:
                 containers.extend(future.result())
 
         return containers
+
+    def get_context_for_vds(self, resource_id: str) -> str:
+        context_array = self.get(
+            url=f"/catalog/{resource_id}",
+        ).get("sqlContext")
+        if context_array:
+            return ".".join(
+                f'"{part}"' if "." in part else f"{part}" for part in context_array
+            )
+        else:
+            return ""

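For illustration, the new helper double-quotes any context element that itself contains a dot, so the joined string can be used directly as a default schema for SQL parsing. A minimal sketch of the same join expression, with a hypothetical sqlContext value (not taken from this diff):

    # Hypothetical sqlContext returned by Dremio's /catalog/{id} endpoint
    context_array = ["my_space", "my.folder"]
    default_schema = ".".join(
        f'"{part}"' if "." in part else f"{part}" for part in context_array
    )
    # -> 'my_space."my.folder"'
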
datahub/ingestion/source/dremio/dremio_aspects.py
@@ -142,6 +142,7 @@ class DremioAspects:
         platform: str,
         ui_url: str,
         env: str,
+        ingest_owner: bool,
         domain: Optional[str] = None,
         platform_instance: Optional[str] = None,
     ):
@@ -150,6 +151,7 @@ class DremioAspects:
         self.env = env
         self.domain = domain
         self.ui_url = ui_url
+        self.ingest_owner = ingest_owner
 
     def get_container_key(
         self, name: Optional[str], path: Optional[List[str]]
@@ -426,21 +428,23 @@ class DremioAspects:
         return f'{self.ui_url}/{container_type}/{dataset_url_path}"{dataset.resource_name}"'
 
     def _create_ownership(self, dataset: DremioDataset) -> Optional[OwnershipClass]:
-        if [truncated in this view]
[... 14 more removed lines of the previous implementation, truncated in this view ...]
+        if self.ingest_owner and dataset.owner:
+            owner_urn = (
+                make_user_urn(dataset.owner)
+                if dataset.owner_type == "USER"
+                else make_group_urn(dataset.owner)
+            )
+            ownership: OwnershipClass = OwnershipClass(
+                owners=[
+                    OwnerClass(
+                        owner=owner_urn,
+                        type=OwnershipTypeClass.TECHNICAL_OWNER,
+                    )
+                ]
+            )
+            return ownership
+
+        return None
 
     def _create_glossary_terms(self, entity: DremioDataset) -> GlossaryTermsClass:
         return GlossaryTermsClass(

datahub/ingestion/source/dremio/dremio_config.py
@@ -174,3 +174,8 @@ class DremioSourceConfig(
         default=False,
         description="Whether to include query-based lineage information.",
     )
+
+    ingest_owner: bool = Field(
+        default=True,
+        description="Ingest Owner from source. This will override Owner info entered from UI",
+    )

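The new flag can be set from a recipe; a minimal sketch of the source section expressed as a Python dict (only ingest_owner comes from this diff, the rest is illustrative):

    # Hypothetical Dremio source config; set ingest_owner to False to keep
    # owners entered in the DataHub UI instead of Dremio-side owners.
    dremio_source = {
        "type": "dremio",
        "config": {
            "ingest_owner": False,
        },
    }
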
datahub/ingestion/source/dremio/dremio_entities.py
@@ -200,6 +200,7 @@ class DremioDataset:
     columns: List[DremioDatasetColumn]
     sql_definition: Optional[str]
     dataset_type: DremioDatasetType
+    default_schema: Optional[str]
     owner: Optional[str]
     owner_type: Optional[str]
     created: str
@@ -235,6 +236,9 @@ class DremioDataset:
 
         if self.sql_definition:
             self.dataset_type = DremioDatasetType.VIEW
+            self.default_schema = api_operations.get_context_for_vds(
+                resource_id=self.resource_id
+            )
         else:
             self.dataset_type = DremioDatasetType.TABLE
 

datahub/ingestion/source/dremio/dremio_source.py
@@ -97,6 +97,7 @@ class DremioSource(StatefulIngestionSourceBase):
     - Ownership and Glossary Terms:
        - Metadata related to ownership of datasets, extracted from Dremio’s ownership model.
        - Glossary terms and business metadata associated with datasets, providing additional context to the data.
+       - Note: Ownership information will only be available for the Cloud and Enterprise editions, it will not be available for the Community edition.
 
     - Optional SQL Profiling (if enabled):
        - Table, row, and column statistics can be profiled and ingested via optional SQL queries.
@@ -123,6 +124,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.dremio_aspects = DremioAspects(
             platform=self.get_platform(),
             domain=self.config.domain,
+            ingest_owner=self.config.ingest_owner,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
             ui_url=dremio_api.ui_url,
@@ -394,10 +396,12 @@ class DremioSource(StatefulIngestionSourceBase):
             ):
                 yield dremio_mcp
                 # Check if the emitted aspect is SchemaMetadataClass
-                if isinstance( [truncated in this view]
+                if isinstance(
+                    dremio_mcp.metadata, MetadataChangeProposalWrapper
+                ) and isinstance(dremio_mcp.metadata.aspect, SchemaMetadataClass):
                     self.sql_parsing_aggregator.register_schema(
                         urn=dataset_urn,
-                        schema=dremio_mcp.metadata,
+                        schema=dremio_mcp.metadata.aspect,
                     )
 
             if dataset_info.dataset_type == DremioDatasetType.VIEW:
@@ -415,6 +419,7 @@ class DremioSource(StatefulIngestionSourceBase):
                     view_urn=dataset_urn,
                     view_definition=dataset_info.sql_definition,
                     default_db=self.default_db,
+                    default_schema=dataset_info.default_schema,
                 )
 
             elif dataset_info.dataset_type == DremioDatasetType.TABLE:

datahub/ingestion/source/dbt/dbt_common.py
@@ -227,7 +227,7 @@ def collapse_name(name: str, collapse_urns: CollapseUrns) -> str:
 def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     if len(collapse_urns.urns_suffix_regex) == 0:
         return urn
-    urn_obj = DatasetUrn. [truncated in this view]
+    urn_obj = DatasetUrn.from_string(urn)
     name = collapse_name(name=urn_obj.get_dataset_name(), collapse_urns=collapse_urns)
     data_platform_urn = urn_obj.get_data_platform_urn()
     return str(

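For reference, the urn-parsing calls used above can be exercised directly; a minimal sketch with an illustrative urn (the urn value is a placeholder, not taken from this diff):

    from datahub.utilities.urns.dataset_urn import DatasetUrn

    urn_obj = DatasetUrn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:dbt,db.schema.table_v1,PROD)"
    )
    print(urn_obj.get_dataset_name())       # db.schema.table_v1
    print(urn_obj.get_data_platform_urn())  # urn:li:dataPlatform:dbt
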
datahub/ingestion/source/gc/dataprocess_cleanup.py
@@ -277,7 +277,12 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
-        dpis.sort( [truncated in this view]
+        dpis.sort(
+            key=lambda x: x["created"]["time"]
+            if x["created"] and x["created"]["time"]
+            else 0,
+            reverse=True,
+        )
 
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             if self.config.keep_last_n:

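The new key function guards against run instances that have no created timestamp: such entries evaluate to 0 and therefore sort last when reverse=True. A minimal sketch with made-up run instances:

    dpis = [
        {"urn": "a", "created": {"time": 1700000000000}},
        {"urn": "b", "created": None},  # missing audit stamp
        {"urn": "c", "created": {"time": 1800000000000}},
    ]
    dpis.sort(
        key=lambda x: x["created"]["time"] if x["created"] and x["created"]["time"] else 0,
        reverse=True,
    )
    # resulting order: c (newest), a, b (missing timestamp last)
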
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -104,7 +104,7 @@ class SoftDeletedEntitiesCleanup:
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
-        entity_urn = Urn. [truncated in this view]
+        entity_urn = Urn.from_string(urn)
         self.report.num_soft_deleted_entity_removed += 1
         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
             self.report.num_soft_deleted_entity_removed_by_type.get(

datahub/ingestion/source/ge_data_profiler.py
@@ -57,7 +57,11 @@ from datahub.ingestion.source.profiling.common import (
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub. [truncated in this view]
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -361,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str
 
+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []
@@ -374,6 +380,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 
         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
@@ -430,6 +437,21 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             self.dataset, column
         )
 
+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str

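The resolve_sql_type fallback above maps a raw column-type string for a given dialect to a DataHub field-type instance; columns whose type cannot be resolved are skipped, and numeric ones are promoted to NUMERIC profiling. A minimal sketch of the same check in isolation (the type string and dialect below are illustrative, not taken from this diff):

    from datahub.ingestion.source.sql.sql_types import resolve_sql_type
    from datahub.metadata.com.linkedin.pegasus2avro.schema import NumberType

    resolved = resolve_sql_type("numeric", "postgres")  # illustrative inputs
    if resolved is not None and isinstance(resolved, NumberType):
        print("column will be profiled as NUMERIC")
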
datahub/ingestion/source/neo4j/__init__.py
File without changes

datahub/ingestion/source/neo4j/neo4j_source.py (new file)
@@ -0,0 +1,331 @@
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import pandas as pd
+from neo4j import GraphDatabase
+from pydantic.fields import Field
+
+from datahub.configuration.source_common import EnvConfigMixin
+from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    BooleanTypeClass,
+    DatasetPropertiesClass,
+    DateTypeClass,
+    NullTypeClass,
+    NumberTypeClass,
+    OtherSchemaClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+    StringTypeClass,
+    SubTypesClass,
+    UnionTypeClass,
+)
+
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+_type_mapping: Dict[Union[Type, str], Type] = {
+    "list": UnionTypeClass,
+    "boolean": BooleanTypeClass,
+    "integer": NumberTypeClass,
+    "local_date_time": DateTypeClass,
+    "float": NumberTypeClass,
+    "string": StringTypeClass,
+    "date": DateTypeClass,
+    "node": StringTypeClass,
+    "relationship": StringTypeClass,
+}
+
+
+class Neo4jConfig(EnvConfigMixin):
+    username: str = Field(description="Neo4j Username")
+    password: str = Field(description="Neo4j Password")
+    uri: str = Field(description="The URI for the Neo4j server")
+    env: str = Field(description="Neo4j env")
+
+
+@dataclass
+class Neo4jSourceReport(SourceReport):
+    obj_failures: int = 0
+    obj_created: int = 0
+
+
+@platform_name("Neo4j", id="neo4j")
+@config_class(Neo4jConfig)
+@support_status(SupportStatus.CERTIFIED)
+class Neo4jSource(Source):
+    NODE = "node"
+    RELATIONSHIP = "relationship"
+    PLATFORM = "neo4j"
+
+    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = Neo4jSourceReport()
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = Neo4jConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
+        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
+        return SchemaFieldDataType(type=type_class())
+
+    def get_schema_field_class(
+        self, col_name: str, col_type: str, **kwargs: Any
+    ) -> SchemaFieldClass:
+        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
+            col_type = self.NODE
+        else:
+            col_type = col_type
+        return SchemaFieldClass(
+            fieldPath=col_name,
+            type=self.get_field_type(col_type),
+            nativeDataType=col_type,
+            description=col_type.upper()
+            if col_type in (self.NODE, self.RELATIONSHIP)
+            else col_type,
+            lastModified=AuditStampClass(
+                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
+            ),
+        )
+
+    def add_properties(
+        self,
+        dataset: str,
+        description: Optional[str] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> MetadataChangeProposalWrapper:
+        dataset_properties = DatasetPropertiesClass(
+            description=description,
+            customProperties=custom_properties,
+        )
+        return MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(
+                platform=self.PLATFORM, name=dataset, env=self.config.env
+            ),
+            aspect=dataset_properties,
+        )
+
+    def generate_neo4j_object(
+        self, dataset: str, columns: list, obj_type: Optional[str] = None
+    ) -> MetadataChangeProposalWrapper:
+        try:
+            fields = [
+                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+                for d in columns
+                for key, value in d.items()
+            ]
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn(
+                    platform=self.PLATFORM, name=dataset, env=self.config.env
+                ),
+                aspect=SchemaMetadataClass(
+                    schemaName=dataset,
+                    platform=make_data_platform_urn(self.PLATFORM),
+                    version=0,
+                    hash="",
+                    platformSchema=OtherSchemaClass(rawSchema=""),
+                    lastModified=AuditStampClass(
+                        time=round(time.time() * 1000),
+                        actor="urn:li:corpuser:ingestion",
+                    ),
+                    fields=fields,
+                ),
+            )
+            self.report.obj_created += 1
+        except Exception as e:
+            log.error(e)
+            self.report.obj_failures += 1
+        return mcp
+
+    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+        driver = GraphDatabase.driver(
+            self.config.uri, auth=(self.config.username, self.config.password)
+        )
+        """
+        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
+        with two columns: key and value. The key represents the Neo4j object, while the value contains the
+        corresponding metadata.
+
+        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
+        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
+        relationships.
+
+        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
+        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+
+        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
+        single dataframe, which will be used to create the DataHub objects.
+
+        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+        """
+        try:
+            log.info(f"{query}")
+            with driver.session() as session:
+                result = session.run(query)
+                data = [record for record in result]
+                log.info("Closing Neo4j driver")
+                driver.close()
+
+                node_df = self.process_nodes(data)
+                rel_df = self.process_relationships(data, node_df)
+
+                union_cols = ["key", "obj_type", "property_data_types", "description"]
+                df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+        except Exception as e:
+            self.report.failure(
+                message="Failed to get neo4j metadata",
+                exc=e,
+            )
+
+        return df
+
+    def process_nodes(self, data: list) -> pd.DataFrame:
+        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        node_df = pd.DataFrame(
+            nodes,
+            columns=["key", "value"],
+        )
+        node_df["obj_type"] = node_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        node_df["relationships"] = node_df["value"].apply(
+            lambda record: self.get_relationships(record)
+        )
+        node_df["properties"] = node_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        node_df["property_data_types"] = node_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        node_df["description"] = node_df.apply(
+            lambda record: self.get_node_description(record, node_df), axis=1
+        )
+        return node_df
+
+    def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
+        rels = [
+            record for record in data if record["value"]["type"] == self.RELATIONSHIP
+        ]
+        rel_df = pd.DataFrame(rels, columns=["key", "value"])
+        rel_df["obj_type"] = rel_df["value"].apply(
+            lambda record: self.get_obj_type(record)
+        )
+        rel_df["properties"] = rel_df["value"].apply(
+            lambda record: self.get_properties(record)
+        )
+        rel_df["property_data_types"] = rel_df["properties"].apply(
+            lambda record: self.get_property_data_types(record)
+        )
+        rel_df["description"] = rel_df.apply(
+            lambda record: self.get_rel_descriptions(record, node_df), axis=1
+        )
+        return rel_df
+
+    def get_obj_type(self, record: dict) -> str:
+        return record["type"]
+
+    def get_rel_descriptions(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            relationships = row.get("relationships", {})
+            for relationship, props in relationships.items():
+                if record["key"] == relationship:
+                    if props["direction"] == "in":
+                        for prop in props["labels"]:
+                            descriptions.append(
+                                f"({row['key']})-[{record['key']}]->({prop})"
+                            )
+        return "\n".join(descriptions)
+
+    def get_node_description(self, record: dict, df: pd.DataFrame) -> str:
+        descriptions = []
+        for _, row in df.iterrows():
+            if record["key"] == row["key"]:
+                for relationship, props in row["relationships"].items():
+                    direction = props["direction"]
+                    for node in set(props["labels"]):
+                        if direction == "in":
+                            descriptions.append(
+                                f"({row['key']})<-[{relationship}]-({node})"
+                            )
+                        elif direction == "out":
+                            descriptions.append(
+                                f"({row['key']})-[{relationship}]->({node})"
+                            )
+
+        return "\n".join(descriptions)
+
+    def get_property_data_types(self, record: dict) -> List[dict]:
+        return [{k: v["type"]} for k, v in record.items()]
+
+    def get_properties(self, record: dict) -> str:
+        return record["properties"]
+
+    def get_relationships(self, record: dict) -> dict:
+        return record.get("relationships", None)
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        df = self.get_neo4j_metadata(
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+        )
+        for index, row in df.iterrows():
+            try:
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.generate_neo4j_object(
+                        columns=row["property_data_types"],
+                        dataset=row["key"],
+                    ),
+                    is_primary_source=True,
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=make_dataset_urn(
+                            platform=self.PLATFORM,
+                            name=row["key"],
+                            env=self.config.env,
+                        ),
+                        aspect=SubTypesClass(
+                            typeNames=[
+                                DatasetSubTypes.NEO4J_NODE
+                                if row["obj_type"] == self.NODE
+                                else DatasetSubTypes.NEO4J_RELATIONSHIP
+                            ]
+                        ),
+                    ),
+                )
+
+                yield MetadataWorkUnit(
+                    id=row["key"],
+                    mcp=self.add_properties(
+                        dataset=row["key"],
+                        custom_properties=None,
+                        description=row["description"],
+                    ),
+                )
+
+            except Exception as e:
+                raise e
+
+    def get_report(self):
+        return self.report

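A minimal sketch of running the new source programmatically (the source type name "neo4j" and the server/credential values below are assumptions for illustration, not taken from this diff):

    from datahub.ingestion.run.pipeline import Pipeline

    # Hypothetical end-to-end run against a local Neo4j and DataHub instance
    pipeline = Pipeline.create(
        {
            "source": {
                "type": "neo4j",
                "config": {
                    "uri": "bolt://localhost:7687",
                    "username": "neo4j",
                    "password": "password",
                    "env": "PROD",
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()
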
datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,