acryl-datahub 0.15.0rc20__py3-none-any.whl → 0.15.0rc21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic by the registry.
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc21.dist-info}/METADATA +2481 -2481
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc21.dist-info}/RECORD +17 -17
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +56 -68
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/sink/datahub_rest.py +12 -1
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +23 -0
- datahub/metadata/_schema_classes.py +400 -400
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/schema.avsc +17221 -17574
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc21.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc21.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc21.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc21.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=caUPlyD6P05EsMKzRYtlTS611d82sT4szr8_WAu_rJ4,575
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
 datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
 datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/api/entities/structuredproperties/structuredproperties.py,sha256=
+datahub/api/entities/structuredproperties/structuredproperties.py,sha256=YO4mdn6BziOzvzoFe-g2KfZlOZy8gqwMyyzj_7vF4BY,8845
 datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
 datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
 datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
 datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
 datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
 datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
-datahub/emitter/rest_emitter.py,sha256=
+datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
 datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
 datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
 datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -180,7 +180,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
 datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
 datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
 datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
-datahub/ingestion/sink/datahub_rest.py,sha256=
+datahub/ingestion/sink/datahub_rest.py,sha256=ME8OygJgd7AowrokJLmdjYHxIQEy5jXWS0yKwOLR934,12592
 datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
 datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
 datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -321,7 +321,7 @@ datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR
 datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
 datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/kafka/kafka.py,sha256=9SR7bqp9J0rPYde5IClhnAuVNy9ItsB8-ZeXtTc_mEY,26442
-datahub/ingestion/source/kafka/kafka_connect.py,sha256=
+datahub/ingestion/source/kafka/kafka_connect.py,sha256=Jm1MYky_OPIwvVHuEjgOjK0e6-jA-dYnsLZ7r-Y_9mA,56208
 datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
 datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
@@ -427,13 +427,13 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=EnTJoRIQKcZOIYfb_NUff_YA8IdIroaFD1JHUn-M6ok,23346
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
 datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
-datahub/ingestion/source/snowflake/snowflake_query.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
 datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
-datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=
-datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
+datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
 datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
@@ -559,12 +559,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
 datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
 datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
 datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
-datahub/metadata/_schema_classes.py,sha256=
-datahub/metadata/schema.avsc,sha256=
+datahub/metadata/_schema_classes.py,sha256=FTLom36n7gr6zxYfPWWoy9AmdnB4KOIXYRoVZbS9kog,955042
+datahub/metadata/schema.avsc,sha256=D-rNu2SC2tyvqju8pQwGNGGT9zy1_fzxzoigH5YmUvo,722242
 datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
 datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
 datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
-datahub/metadata/_urns/urn_defs.py,sha256=
+datahub/metadata/_urns/urn_defs.py,sha256=LFHZGzHlDA0KJes1Xg7-lWetXusi7bubA7Q5hu4ER88,107119
 datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
 datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
 datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
+acryl_datahub-0.15.0rc21.dist-info/METADATA,sha256=e3Tw7Cix7Z1uR8zyUtppjUv0ztJa2Kga0yl7nwPMbF8,173559
+acryl_datahub-0.15.0rc21.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0rc21.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+acryl_datahub-0.15.0rc21.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0rc21.dist-info/RECORD,,
datahub/__init__.py
CHANGED

datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED

@@ -1,8 +1,7 @@
 import logging
-from contextlib import contextmanager
 from enum import Enum
 from pathlib import Path
-from typing import
+from typing import List, Optional
 
 import yaml
 from pydantic import validator
@@ -10,6 +9,7 @@ from ruamel.yaml import YAML
 
 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.global_context import get_graph_context, set_graph_context
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.metadata.schema_classes import (
     PropertyValueClass,
@@ -24,23 +24,10 @@ logger = logging.getLogger(__name__)
 class StructuredPropertiesConfig:
     """Configuration class to hold the graph client"""
 
-    _graph: Optional[DataHubGraph] = None
-
-    @classmethod
-    @contextmanager
-    def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
-        """Context manager to temporarily set a custom graph"""
-        previous_graph = cls._graph
-        cls._graph = graph
-        try:
-            yield
-        finally:
-            cls._graph = previous_graph
-
     @classmethod
-    def
+    def get_graph_required(cls) -> DataHubGraph:
         """Get the current graph, falling back to default if none set"""
-        return
+        return get_graph_context() or get_default_graph()
 
 
 class AllowedTypes(Enum):
@@ -79,7 +66,7 @@ class TypeQualifierAllowedTypes(ConfigModel):
     @validator("allowed_types", each_item=True)
     def validate_allowed_types(cls, v):
         if v:
-            graph = StructuredPropertiesConfig.
+            graph = StructuredPropertiesConfig.get_graph_required()
             validated_urn = Urn.make_entity_type_urn(v)
             if not graph.exists(validated_urn):
                 raise ValueError(
@@ -106,7 +93,7 @@ class StructuredProperties(ConfigModel):
     @validator("entity_types", each_item=True)
     def validate_entity_types(cls, v):
         if v:
-            graph = StructuredPropertiesConfig.
+            graph = StructuredPropertiesConfig.get_graph_required()
            validated_urn = Urn.make_entity_type_urn(v)
             if not graph.exists(validated_urn):
                 raise ValueError(
@@ -136,63 +123,64 @@ class StructuredProperties(ConfigModel):
 
     @staticmethod
     def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
-
-
-
+        with set_graph_context(graph):
+            graph = StructuredPropertiesConfig.get_graph_required()
+
             with open(file) as fp:
                 structuredproperties: List[dict] = yaml.safe_load(fp)
-
-
-
+                for structuredproperty_raw in structuredproperties:
+                    structuredproperty = StructuredProperties.parse_obj(
+                        structuredproperty_raw
+                    )
+
+                    if not structuredproperty.type.islower():
+                        structuredproperty.type = structuredproperty.type.lower()
+                        logger.warning(
+                            f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
                         )
-
-
-
-                f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
-            )
-        if not AllowedTypes.check_allowed_type(structuredproperty.type):
-            raise ValueError(
-                f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
-            )
-        mcp = MetadataChangeProposalWrapper(
-            entityUrn=structuredproperty.urn,
-            aspect=StructuredPropertyDefinitionClass(
-                qualifiedName=structuredproperty.fqn,
-                valueType=Urn.make_data_type_urn(structuredproperty.type),
-                displayName=structuredproperty.display_name,
-                description=structuredproperty.description,
-                entityTypes=[
-                    Urn.make_entity_type_urn(entity_type)
-                    for entity_type in structuredproperty.entity_types or []
-                ],
-                cardinality=structuredproperty.cardinality,
-                immutable=structuredproperty.immutable,
-                allowedValues=(
-                    [
-                        PropertyValueClass(
-                            value=v.value, description=v.description
-                        )
-                        for v in structuredproperty.allowed_values
-                    ]
-                    if structuredproperty.allowed_values
-                    else None
-                ),
-                typeQualifier=(
-                    {
-                        "allowedTypes": structuredproperty.type_qualifier.allowed_types
-                    }
-                    if structuredproperty.type_qualifier
-                    else None
-                ),
-            ),
+                    if not AllowedTypes.check_allowed_type(structuredproperty.type):
+                        raise ValueError(
+                            f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
                         )
-
+                    mcp = MetadataChangeProposalWrapper(
+                        entityUrn=structuredproperty.urn,
+                        aspect=StructuredPropertyDefinitionClass(
+                            qualifiedName=structuredproperty.fqn,
+                            valueType=Urn.make_data_type_urn(structuredproperty.type),
+                            displayName=structuredproperty.display_name,
+                            description=structuredproperty.description,
+                            entityTypes=[
+                                Urn.make_entity_type_urn(entity_type)
+                                for entity_type in structuredproperty.entity_types or []
+                            ],
+                            cardinality=structuredproperty.cardinality,
+                            immutable=structuredproperty.immutable,
+                            allowedValues=(
+                                [
+                                    PropertyValueClass(
+                                        value=v.value, description=v.description
+                                    )
+                                    for v in structuredproperty.allowed_values
+                                ]
+                                if structuredproperty.allowed_values
+                                else None
+                            ),
+                            typeQualifier=(
+                                {
+                                    "allowedTypes": structuredproperty.type_qualifier.allowed_types
+                                }
+                                if structuredproperty.type_qualifier
+                                else None
+                            ),
+                        ),
+                    )
+                    graph.emit_mcp(mcp)
 
-
+                    logger.info(f"Created structured property {structuredproperty.urn}")
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-        with
+        with set_graph_context(graph):
             structured_property: Optional[
                 StructuredPropertyDefinitionClass
             ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
datahub/emitter/rest_emitter.py
CHANGED

@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
-# The limit is 16mb. We will use a max of 15mb to have some space
-
+# The limit is 16mb. We will use a max of 15mb to have some space
+# for overhead like request headers.
+# This applies to pretty much all calls to GMS.
+INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
+
+# This limit is somewhat arbitrary. All GMS endpoints will timeout
+# and return a 500 if processing takes too long. To avoid sending
+# too much to the backend and hitting a timeout, we try to limit
+# the number of MCPs we send in a batch.
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
+    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
+)
 
 
 class DataHubRestEmitter(Closeable, Emitter):
@@ -290,11 +300,14 @@
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
         mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size =
+        current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
 
-            if
+            if (
+                mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
+                or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+            ):
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
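
The chunking change above bounds each batch by two limits at once: total serialized bytes (INGEST_MAX_PAYLOAD_BYTES) and MCP count (BATCH_INGEST_MAX_PAYLOAD_LENGTH). Initializing current_chunk_size to the byte limit forces the very first object to open a fresh chunk, which also keeps the mcp_obj_chunks[-1] length check from ever running against an empty list. A self-contained sketch of the same rule (the function name and the final size-accumulation line are illustrative; the diff does not show that line):

import json
from typing import Any, Dict, List

INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
BATCH_INGEST_MAX_PAYLOAD_LENGTH = 200

def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = []
    # Start "full" so the first object always opens a new chunk before
    # chunks[-1] is ever touched.
    current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
    for mcp_obj in mcp_objs:
        mcp_obj_size = len(json.dumps(mcp_obj))
        if (
            mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
            or len(chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
        ):
            chunks.append([])
            current_chunk_size = 0
        chunks[-1].append(mcp_obj)
        current_chunk_size += mcp_obj_size
    return chunks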
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -18,7 +18,10 @@ from datahub.configuration.common import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
-from datahub.emitter.rest_emitter import
+from datahub.emitter.rest_emitter import (
+    BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+    DataHubRestEmitter,
+)
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
     NoopWriteCallback,
@@ -71,6 +74,14 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     # Only applies in async batch mode.
     max_per_batch: pydantic.PositiveInt = 100
 
+    @pydantic.validator("max_per_batch", always=True)
+    def validate_max_per_batch(cls, v):
+        if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
+            raise ValueError(
+                f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
+            )
+        return v
+
 
 @dataclasses.dataclass
 class DataHubRestSinkReport(SinkReport):
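
Together with the emitter constants, the new validator means an over-sized max_per_batch now fails at configuration-parsing time rather than mid-ingestion. A hedged illustration of the behaviour (a pared-down config model, not the real DatahubRestSinkConfig):

import pydantic

BATCH_INGEST_MAX_PAYLOAD_LENGTH = 200  # mirrors the emitter default above

class RestSinkConfigSketch(pydantic.BaseModel):
    max_per_batch: pydantic.PositiveInt = 100

    # pydantic v1-style validator, same API as in the diff.
    @pydantic.validator("max_per_batch", always=True)
    def validate_max_per_batch(cls, v: int) -> int:
        if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
            raise ValueError(
                f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
            )
        return v

RestSinkConfigSketch(max_per_batch=100)  # accepted
RestSinkConfigSketch(max_per_batch=500)  # raises pydantic.ValidationError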
datahub/ingestion/source/kafka/kafka_connect.py
CHANGED

@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
     query: str
     transforms: list
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                     source_table = f"{table_name_tuple[-2]}.{source_table}"
             else:
                 include_source_dataset = False
-                self.
-
-                f"
+                self.report.warning(
+                    "Could not find schema for table"
+                    f"{self.connector_manifest.name} : {source_table}",
                 )
             dataset_name: str = get_dataset_name(database_name, source_table)
             lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                 target_platform=KAFKA,
             )
             lineages.append(lineage)
-            self.
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
                 self.connector_manifest.name,
-                "could not find input dataset, the connector has query configuration set",
             )
             self.connector_manifest.lineages = lineages
             return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                         include_source_dataset=False,
                     )
                 )
-                self.
-
-                f"
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
                 )
                 self.connector_manifest.lineages = lineages
                 return
             else:
                 include_source_dataset = True
                 if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.
-
-                    f"
+                    self.report.warning(
+                        "Could not find input dataset, connector has unknown transform",
+                        f"{self.connector_manifest.name} : {transforms[0]['type']}",
                     )
                     include_source_dataset = False
                 if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.
+                    self.report.warning(
+                        "Could not find input dataset, connector has one or more unknown transforms",
                         self.connector_manifest.name,
-                        "could not find input dataset, connector has one or more unknown transforms",
                     )
                     include_source_dataset = False
                 lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
             lineages.append(lineage)
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.
-
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )
 
         return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
     defaultDataset: Optional[str] = None
     version: str = "v1"
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
         transformed_topic = self.apply_transformations(topic, transforms)
         dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
         if dataset_table is None:
-            self.
-
-            f"
+            self.report.warning(
+                "Could not find target dataset for topic, please check your connector configuration"
+                f"{self.connector_manifest.name} : {transformed_topic} ",
             )
             continue
         target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
     schema_name: str
     topics_to_tables: Dict[str, str]
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
         This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
             )
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.
-
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )
 
         return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         )
         self.session.auth = (self.config.username, self.config.password)
 
-        test_response = self.session.get(f"{self.config.connect_uri}")
+        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
         test_response.raise_for_status()
         logger.info(f"Connection to {self.config.connect_uri} is ok")
         if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         payload = connector_response.json()
 
-        for
-        connector_url = f"{self.config.connect_uri}/connectors/{
-
-
-
-        if
-
+        for connector_name in payload:
+            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+            connector_manifest = self._get_connector_manifest(
+                connector_name, connector_url
+            )
+            if (
+                connector_manifest is None
+                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            ):
+                self.report.report_dropped(connector_name)
                 continue
 
             if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.lineages = list()
             connector_manifest.url = connector_url
 
-
-                f"{self.config.connect_uri}/connectors/{c}/topics",
-            ).json()
-
-            connector_manifest.topic_names = topics[c]["topics"]
+            connector_manifest.topic_names = self._get_connector_topics(connector_name)
 
             # Populate Source Connector metadata
             if connector_manifest.type == SOURCE:
-                tasks = self.
-                    f"{self.config.connect_uri}/connectors/{c}/tasks",
-                ).json()
-
-                connector_manifest.tasks = tasks
+                connector_manifest.tasks = self._get_connector_tasks(connector_name)
 
                 # JDBC source connector lineages
                 if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                 )
                 continue
 
-            for topic in
+            for topic in connector_manifest.topic_names:
                 lineage = KafkaConnectLineage(
                     source_dataset=target_connector.source_dataset,
                     source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         return connectors_manifest
 
+    def _get_connector_manifest(
+        self, connector_name: str, connector_url: str
+    ) -> Optional[ConnectorManifest]:
+        try:
+            connector_response = self.session.get(connector_url)
+            connector_response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Failed to get connector details", connector_name, exc=e
+            )
+            return None
+        manifest = connector_response.json()
+        connector_manifest = ConnectorManifest(**manifest)
+        return connector_manifest
+
+    def _get_connector_tasks(self, connector_name: str) -> dict:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector tasks", context=connector_name, exc=e
+            )
+            return {}
+
+        return response.json()
+
+    def _get_connector_topics(self, connector_name: str) -> List[str]:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector topics", context=connector_name, exc=e
+            )
+            return []
+
+        return response.json()[connector_name]["topics"]
+
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
         connector_type = connector.type
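
The kafka_connect changes replace the copy-pasted per-connector-class report_warning helpers with the structured report.warning(message, context, exc=...) API, and pull each Connect REST call (/connectors/{name}, .../tasks, .../topics) into a helper that catches failures, reports them, and returns an empty value, so one broken connector no longer aborts the whole listing loop. A compact sketch of that degrade-gracefully pattern (Report here is a stand-in for DataHub's structured reporter, not the real class):

from typing import List, Optional

import requests

class Report:
    # Stand-in with the same call shape as the structured reporter used above.
    def warning(self, message: str, context: Optional[str] = None,
                exc: Optional[Exception] = None) -> None:
        print(f"WARNING {message} (context={context}): {exc}")

def get_connector_topics(
    session: requests.Session, connect_uri: str, report: Report, connector_name: str
) -> List[str]:
    try:
        response = session.get(f"{connect_uri}/connectors/{connector_name}/topics")
        response.raise_for_status()
    except Exception as e:
        report.warning("Error getting connector topics", context=connector_name, exc=e)
        return []  # skip this connector but keep ingesting the rest
    # The Connect REST API returns {"<connector>": {"topics": [...]}}.
    return response.json()[connector_name]["topics"]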
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
CHANGED

@@ -413,9 +413,10 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-                context=db_row.get(
+                context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
                 exc=e,
             )
             return None
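
This hunk only enriches the failure report: the warning context now names both sides of the edge that failed to parse. A quick illustration of the resulting context string (the sample row values are made up):

db_row = {"UPSTREAM_TABLES": '["db1.s1.t1"]', "DOWNSTREAM_TABLE_NAME": "db1.s1.t2"}
upstream_tables = db_row.get("UPSTREAM_TABLES")
context = f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}"
# -> Upstreams: ["db1.s1.t1"] Downstreams: db1.s1.t2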
datahub/ingestion/source/snowflake/snowflake_query.py
CHANGED

@@ -237,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """
 
+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+        SELECT
+            TABLE_CATALOG as "TABLE_CATALOG",
+            TABLE_SCHEMA as "TABLE_SCHEMA",
+            TABLE_NAME as "TABLE_NAME",
+            VIEW_DEFINITION as "VIEW_DEFINITION"
+        FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+        WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,
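
The new query reads secure view definitions from SNOWFLAKE.ACCOUNT_USAGE.VIEWS, which can expose VIEW_DEFINITION where the SHOW VIEWS path leaves the text empty for secure views; reading ACCOUNT_USAGE requires the appropriate grants, and the DELETED IS NULL filter skips dropped views still present in that latency-delayed view. A hypothetical standalone check of the query with snowflake-connector-python's DictCursor (connection parameters are placeholders):

import snowflake.connector

conn = snowflake.connector.connect(
    account="my_account", user="my_user", password="my_password"  # placeholders
)
with conn.cursor(snowflake.connector.DictCursor) as cur:
    cur.execute(
        """
        SELECT
            TABLE_CATALOG as "TABLE_CATALOG",
            TABLE_SCHEMA as "TABLE_SCHEMA",
            TABLE_NAME as "TABLE_NAME",
            VIEW_DEFINITION as "VIEW_DEFINITION"
        FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
        WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
        """
    )
    for row in cur:
        print(row["TABLE_CATALOG"], row["TABLE_SCHEMA"], row["TABLE_NAME"])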
datahub/ingestion/source/snowflake/snowflake_schema.py
CHANGED

@@ -266,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
             snowflake_schemas.append(snowflake_schema)
         return snowflake_schemas
 
+    @serialized_lru_cache(maxsize=1)
+    def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+            lambda: defaultdict(lambda: defaultdict())
+        )
+        cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+        for view in cur:
+            db_name = view["TABLE_CATALOG"]
+            schema_name = view["TABLE_SCHEMA"]
+            view_name = view["TABLE_NAME"]
+            secure_view_definitions[db_name][schema_name][view_name] = view[
+                "VIEW_DEFINITION"
+            ]
+
+        return secure_view_definitions
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str