acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2524 -2471
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +87 -87
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +16 -7
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -2
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/ge_data_profiler.py +2 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
- datahub/metadata/_urns/urn_defs.py +1786 -1786
- datahub/metadata/schema.avsc +17364 -16988
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/main_client.py +2 -2
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +378 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
 import pydantic
@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
     pass


+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)

@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_query_template_render: int = 0
     num_query_template_render_failures: int = 0
     num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
         # Datasets
         datasets = []
         for imported_dataset_name in report_info.get("imported_datasets", {}):
-            …
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error
+
             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 self.platform,
                 str(mode_dataset.get("id")),
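The hunk above wraps the per-dataset report lookup so that a missing referenced report is reported and skipped instead of aborting the run. A minimal standalone sketch of the same skip-on-404 pattern, using plain requests and a hypothetical helper name (not code from this package):

from typing import Optional

import requests


def fetch_report_json(session: requests.Session, url: str) -> Optional[dict]:
    # Hypothetical helper: treat 404 as "skip this item",
    # re-raise every other HTTP error so real failures still surface.
    response = session.get(url, timeout=40)
    try:
        response.raise_for_status()
    except requests.HTTPError as http_error:
        if http_error.response is not None and http_error.response.status_code == 404:
            return None  # caller reports a warning and continues
        raise
    return response.json()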
@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            …
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return charts

+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
                 multiplier=self.config.api_options.retry_backoff_multiplier,
                 max=self.config.api_options.max_retry_interval,
             ),
-            retry=retry_if_exception_type(
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
             stop=stop_after_attempt(self.config.api_options.max_attempts),
         )
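The new _get_paged_request_json helper walks Mode's paginated listing endpoints: each page is fetched with per_page/page query parameters, results are read from the "_embedded" envelope, and iteration stops at the first empty page. A self-contained sketch of the same generator pattern, with fetch_json standing in for _get_request_json (an illustrative stand-in, not the package's API):

from typing import Callable, Dict, Iterator, List


def paged_json(
    fetch_json: Callable[[str], Dict], url: str, key: str, per_page: int = 30
) -> Iterator[List[Dict]]:
    # Keep requesting pages until the API returns an empty page.
    page = 1
    while True:
        response = fetch_json(f"{url}&per_page={per_page}&page={page}")
        data: List[Dict] = response.get("_embedded", {}).get(key, [])
        if not data:
            break
        yield data
        page += 1

Note that the "&" separator assumes the base URL already carries a query string (for example "?filter=all"), which matches how the spaces endpoint is called above.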
@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
         except HTTPError as http_error:
             error_response = http_error.response
             if error_response.status_code == 429:
+                self.report.num_requests_exceeding_rate_limit += 1
                 # respect Retry-After
                 sleep_time = error_response.headers.get("retry-after")
                 if sleep_time is not None:
                     time.sleep(float(sleep_time))
                 raise HTTPError429 from None
+            elif error_response.status_code == 504:
+                self.report.num_requests_retried_on_timeout += 1
+                time.sleep(0.1)
+                raise HTTPError504 from None

             logger.debug(
                 f"Error response ({error_response.status_code}): {error_response.text}"
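Together with the new HTTPError504 marker class, the retry policy now treats 429, 504, and connection errors as transient. A rough sketch of the same tenacity configuration in isolation; the exception class, endpoint, and backoff values are illustrative, not taken from mode.py:

import requests
import tenacity
from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential


class TransientAPIError(Exception):
    # Stand-in for the HTTPError429 / HTTPError504 marker classes above.
    pass


def get_json_with_retries(url: str, max_attempts: int = 5) -> dict:
    retryer = tenacity.Retrying(
        wait=wait_exponential(multiplier=2, max=30),
        retry=retry_if_exception_type((TransientAPIError, ConnectionError)),
        stop=stop_after_attempt(max_attempts),
    )

    def _call() -> dict:
        response = requests.get(url, timeout=40)
        if response.status_code in (429, 504):
            # Raising the marker type is what makes tenacity retry this call.
            raise TransientAPIError(f"transient status {response.status_code}")
        response.raise_for_status()
        return response.json()

    return retryer(_call)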
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED

@@ -5,27 +5,35 @@ from typing import Any, Dict, Iterable, List, Optional, Type, Union

 import pandas as pd
 from neo4j import GraphDatabase
-from pydantic
+from pydantic import Field

 from datahub.configuration.source_common import (
     EnvConfigMixin,
+    PlatformInstanceConfigMixin,
+)
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataset_urn_with_platform_instance,
 )
-from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
+    SourceCapability,
 )
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -64,12 +72,16 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }


-class Neo4jConfig(
+class Neo4jConfig(
+    StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
+):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
     env: str = Field(description="Neo4j env")

+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+

 @dataclass
 class Neo4jSourceReport(StatefulIngestionReport):
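With the extra mixins, Neo4jConfig now accepts platform_instance and a stateful_ingestion block in addition to the connection settings. A hedged sketch of building the updated config programmatically; the connection values are placeholders:

config = Neo4jConfig.parse_obj(
    {
        "username": "neo4j",
        "password": "example-password",
        "uri": "bolt://localhost:7687",
        "env": "PROD",
        # New options enabled by this change:
        "platform_instance": "graph-prod-1",
        "stateful_ingestion": {"enabled": True},
    }
)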
@@ -79,21 +91,27 @@ class Neo4jSourceReport(StatefulIngestionReport):

 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE, "Supported via the `platform_instance` config"
+)
 @support_status(SupportStatus.CERTIFIED)
 class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
-
+    config: Neo4jConfig
+    report: Neo4jSourceReport

-    def __init__(self,
+    def __init__(self, config: Neo4jConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
+        self.platform = "neo4j"
         self.report: Neo4jSourceReport = Neo4jSourceReport()

     @classmethod
-    def create(cls, config_dict, ctx):
+    def create(cls, config_dict: Dict, ctx: PipelineContext) -> "Neo4jSource":
         config = Neo4jConfig.parse_obj(config_dict)
-        return cls(
+        return cls(config, ctx)

     def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
         type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
@@ -123,34 +141,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
         dataset: str,
         description: Optional[str] = None,
         custom_properties: Optional[Dict[str, str]] = None,
-    ) ->
+    ) -> Iterable[MetadataWorkUnit]:
         dataset_properties = DatasetPropertiesClass(
             description=description,
             customProperties=custom_properties,
         )
-
-            entityUrn=
-                platform=self.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn_with_platform_instance(
+                platform=self.platform,
+                name=dataset,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
             ),
             aspect=dataset_properties,
-        )
+        ).as_workunit()

     def generate_neo4j_object(
         self, dataset: str, columns: list, obj_type: Optional[str] = None
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         try:
             fields = [
                 self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
                 for d in columns
                 for key, value in d.items()
             ]
-
-                entityUrn=
-                    platform=self.
+            return MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn_with_platform_instance(
+                    platform=self.platform,
+                    name=dataset,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
                 ),
                 aspect=SchemaMetadataClass(
                     schemaName=dataset,
-                    platform=make_data_platform_urn(self.
+                    platform=make_data_platform_urn(self.platform),
                     version=0,
                     hash="",
                     platformSchema=OtherSchemaClass(rawSchema=""),
@@ -161,13 +185,16 @@ class Neo4jSource(StatefulIngestionSourceBase):
                     fields=fields,
                 ),
             )
-            self.report.obj_created += 1
         except Exception as e:
             log.error(e)
-            self.report.
-
+            self.report.report_failure(
+                message="Failed to process dataset",
+                context=dataset,
+                exc=e,
+            )
+            return None

-    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+    def get_neo4j_metadata(self, query: str) -> Optional[pd.DataFrame]:
         driver = GraphDatabase.driver(
             self.config.uri, auth=(self.config.username, self.config.password)
         )
@@ -201,13 +228,14 @@ class Neo4jSource(StatefulIngestionSourceBase):

             union_cols = ["key", "obj_type", "property_data_types", "description"]
             df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+            return df
         except Exception as e:
             self.report.failure(
                 message="Failed to get neo4j metadata",
                 exc=e,
             )

-        return
+        return None

     def process_nodes(self, data: list) -> pd.DataFrame:
         nodes = [record for record in data if record["value"]["type"] == self.NODE]
@@ -306,46 +334,48 @@ class Neo4jSource(StatefulIngestionSourceBase):
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
         )
+        if df is None:
+            log.warning("No metadata retrieved from Neo4j")
+            return
+
         for _, row in df.iterrows():
             try:
-                …
-                    columns=row["property_data_types"],
-                    dataset=row["key"],
-                ),
-                is_primary_source=True,
+                neo4j_obj = self.generate_neo4j_object(
+                    columns=row["property_data_types"],
+                    dataset=row["key"],
                 )
-                …
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.NEO4J_NODE
-                        if row["obj_type"] == self.NODE
-                        else DatasetSubTypes.NEO4J_RELATIONSHIP
-                    ]
-                ),
+                if neo4j_obj:
+                    yield from auto_workunit([neo4j_obj])
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=make_dataset_urn_with_platform_instance(
+                        platform=self.platform,
+                        name=row["key"],
+                        platform_instance=self.config.platform_instance,
+                        env=self.config.env,
                     ),
-                …
-                custom_properties=None,
-                description=row["description"],
+                    aspect=SubTypesClass(
+                        typeNames=[
+                            DatasetSubTypes.NEO4J_NODE
+                            if row["obj_type"] == self.NODE
+                            else DatasetSubTypes.NEO4J_RELATIONSHIP
+                        ]
                     ),
+                ).as_workunit()
+
+                yield from self.add_properties(
+                    dataset=row["key"],
+                    custom_properties=None,
+                    description=row["description"],
                 )

             except Exception as e:
-                …
+                log.error(f"Failed to process row {row['key']}: {str(e)}")
+                self.report.report_failure(
+                    message="Error processing Neo4j metadata",
+                    context=row["key"],
+                    exc=e,
+                )

-    def get_report(self):
+    def get_report(self) -> "Neo4jSourceReport":
         return self.report
datahub/ingestion/source/powerbi/config.py
CHANGED

@@ -513,7 +513,7 @@ class PowerBiDashboardSourceConfig(
     include_workspace_name_in_dataset_urn: bool = pydantic.Field(
         default=False,
         description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
-        "Read section #11560 at https://
+        "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
         "To maintain backward compatibility, this is set to False.",
     )

datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
CHANGED

@@ -63,10 +63,10 @@ class SessionWithTimeout(requests.Session):
         super().__init__(*args, **kwargs)
         self.timeout = timeout

-    def request(self, method, url, **kwargs):
+    def request(self, method, url, *args, **kwargs):
         # Set the default timeout if none is provided
         kwargs.setdefault("timeout", self.timeout)
-        return super().request(method, url, **kwargs)
+        return super().request(method, url, *args, **kwargs)


 class DataResolverBase(ABC):
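The one-line fix above matters because requests.Session.request can receive arguments positionally; without forwarding *args, such a call would raise a TypeError before reaching the underlying session. A standalone sketch of the default-timeout session pattern (class and values are illustrative):

import requests


class DefaultTimeoutSession(requests.Session):
    def __init__(self, timeout: float = 30.0):
        super().__init__()
        self.timeout = timeout

    def request(self, method, url, *args, **kwargs):
        # Apply the default timeout only when the caller did not set one,
        # and pass positional arguments straight through to Session.request.
        kwargs.setdefault("timeout", self.timeout)
        return super().request(method, url, *args, **kwargs)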
datahub/ingestion/source/redshift/usage.py
CHANGED

@@ -182,15 +182,16 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0

         if self.config.include_operational_stats:
-            with self.report.new_stage(
-            …
+            with self.report.new_stage(
+                USAGE_EXTRACTION_OPERATIONAL_STATS
+            ), PerfTimer() as timer:
+                # Generate operation aspect workunits
+                yield from self._gen_operation_aspect_workunits(
+                    self.connection, all_tables
+                )
+                self.report.operational_metadata_extraction_sec[
+                    self.config.database
+                ] = timer.elapsed_seconds(digits=2)

         # Generate aggregate events
         with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
datahub/ingestion/source/sql/clickhouse.py
CHANGED

@@ -145,7 +145,11 @@ class ClickHouseConfig(
     )
     include_materialized_views: Optional[bool] = Field(default=True, description="")

-    def get_sql_alchemy_url(
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
         url = make_url(
             super().get_sql_alchemy_url(uri_opts=self.uri_opts, current_db=current_db)
         )
datahub/ingestion/source/sql/druid.py
CHANGED

@@ -1,4 +1,6 @@
 # This import verifies that the dependencies are available.
+from typing import Any, Dict, Optional
+
 import pydruid  # noqa: F401
 from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect

@@ -38,8 +40,11 @@ class DruidConfig(BasicSQLAlchemyConfig):
         description="regex patterns for schemas to filter in ingestion.",
     )

-    def get_sql_alchemy_url(
-        …
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        base_url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+        return f"{base_url}/druid/v2/sql/"

     """
     The pydruid library already formats the table name correctly, so we do not
datahub/ingestion/source/sql/oracle.py
CHANGED

@@ -127,11 +127,15 @@ class OracleConfig(BasicSQLAlchemyConfig):
         )
         return v

-    def get_sql_alchemy_url(
-        …
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+
         if self.service_name:
             assert not self.database
             url = f"{url}/?service_name={self.service_name}"
+
         return url

     def get_identifier(self, schema: str, table: str) -> str:
datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py
CHANGED

@@ -10,7 +10,8 @@ from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import (
     IngestionCheckpointingProviderConfig,
     JobId,
 )
-from datahub.ingestion.graph.client import
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass

 logger = logging.getLogger(__name__)
datahub/ingestion/source/tableau/tableau_validation.py
CHANGED

@@ -24,7 +24,7 @@ def check_user_role(
     mitigation_message_prefix: str = (
         "Assign `Site Administrator Explorer` role to the user"
     )
-    mitigation_message_suffix: str = "Refer to the setup guide: https://
+    mitigation_message_suffix: str = "Refer to the setup guide: https://docs.datahub.com/docs/quick-ingestion-guides/tableau/setup"

     try:
         # TODO: Add check for `Enable Derived Permissions`
datahub/ingestion/source/usage/clickhouse_usage.py
CHANGED

@@ -2,7 +2,7 @@ import collections
 import dataclasses
 import logging
 from datetime import datetime
-from typing import Dict, Iterable, List
+from typing import Any, Dict, Iterable, List, Optional

 from dateutil import parser
 from pydantic.fields import Field

@@ -74,8 +74,12 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
     options: dict = Field(default={}, description="")
     query_log_table: str = Field(default="system.query_log", exclude=True)

-    def get_sql_alchemy_url(
-        …
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
+        return super().get_sql_alchemy_url(uri_opts=uri_opts, current_db=current_db)


 @platform_name("ClickHouse")
datahub/ingestion/source/usage/starburst_trino_usage.py
CHANGED

@@ -4,7 +4,7 @@ import json
 import logging
 from datetime import datetime
 from email.utils import parseaddr
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional

 from dateutil import parser
 from pydantic.fields import Field

@@ -98,8 +98,10 @@ class TrinoUsageConfig(TrinoConfig, BaseUsageConfig, EnvBasedSourceBaseConfig):
     options: dict = Field(default={}, description="")
     database: str = Field(description="The name of the catalog from getting the usage")

-    def get_sql_alchemy_url(
-        …
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        return super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)


 @dataclasses.dataclass
datahub/integrations/assertion/common.py
CHANGED

@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, TypedDict

 from datahub.api.entities.assertion.assertion import BaseEntityAssertion
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
 from datahub.utilities.urns.urn import Urn

@@ -15,7 +16,7 @@ class ColumnDict(TypedDict):

 @lru_cache
 def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
         if props is not None:
             return props.qualifiedName

@@ -24,7 +25,7 @@ def get_qualified_name_from_datahub(urn: str) -> Optional[str]:

 @lru_cache
 def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.INGESTION) as graph:
         schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
         if schema is not None:
             return [