acryl-datahub 0.15.0rc20__py3-none-any.whl → 0.15.0rc21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=fYgu28dsndrekGv9Pq_ENw7G6Erm7qtsY5H6W3cKFDU,575
+ datahub/__init__.py,sha256=caUPlyD6P05EsMKzRYtlTS611d82sT4szr8_WAu_rJ4,575
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
  datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
  datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/api/entities/structuredproperties/structuredproperties.py,sha256=PcTX5gI7pg_Aq9JeIvUNZ5JYrQ2XS1uUEJZ73ORgYgA,9434
+ datahub/api/entities/structuredproperties/structuredproperties.py,sha256=YO4mdn6BziOzvzoFe-g2KfZlOZy8gqwMyyzj_7vF4BY,8845
  datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
  datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
  datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
  datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
  datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
  datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
- datahub/emitter/rest_emitter.py,sha256=rIWqEJjcSIM16_8DXqNqZ_h5s_nj46DTiyRKA5EQHXQ,15021
+ datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -180,7 +180,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
  datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
  datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
  datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
- datahub/ingestion/sink/datahub_rest.py,sha256=pU9z-vR-R7kGogqxkC7-9AZNctR9oUfAmfhhoD0-hwQ,12245
+ datahub/ingestion/sink/datahub_rest.py,sha256=ME8OygJgd7AowrokJLmdjYHxIQEy5jXWS0yKwOLR934,12592
  datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
  datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
  datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -321,7 +321,7 @@ datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR
  datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
  datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/kafka/kafka.py,sha256=9SR7bqp9J0rPYde5IClhnAuVNy9ItsB8-ZeXtTc_mEY,26442
- datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
+ datahub/ingestion/source/kafka/kafka_connect.py,sha256=Jm1MYky_OPIwvVHuEjgOjK0e6-jA-dYnsLZ7r-Y_9mA,56208
  datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
@@ -427,13 +427,13 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
  datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=w2CPm5XEU-KMUSIpb58aKOaxTDHfM5NvghutCVRicy4,23247
+ datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=EnTJoRIQKcZOIYfb_NUff_YA8IdIroaFD1JHUn-M6ok,23346
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
  datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
- datahub/ingestion/source/snowflake/snowflake_query.py,sha256=PuqoseJbqkQEIYkmlLvPJxcVOGG7HVs4U-WWFQgQEWs,38211
+ datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
- datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=K31vJ19ZCIqtJkszsJWF1eppu8U23gkZYfb5jw231dc,20997
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=st4qoOdMGuo6fJQh-cJf_2hnczIuv6VRXGO4x3p1MgQ,39416
+ datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
+ datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
  datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
@@ -559,12 +559,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
  datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
- datahub/metadata/_schema_classes.py,sha256=iPeBXGvbNEm0vw5pYwunnvx7bTtBdmIQVtzMOlS6bSI,955042
- datahub/metadata/schema.avsc,sha256=Xx93OdPzQfBb2CtntIYE-HAeKNg-JZcCtRU95v7ZZCs,677728
+ datahub/metadata/_schema_classes.py,sha256=FTLom36n7gr6zxYfPWWoy9AmdnB4KOIXYRoVZbS9kog,955042
+ datahub/metadata/schema.avsc,sha256=D-rNu2SC2tyvqju8pQwGNGGT9zy1_fzxzoigH5YmUvo,722242
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
- datahub/metadata/_urns/urn_defs.py,sha256=WBHf7Ze2qBvR-uWpcdMqEy-T2AIBzf8ioS-wJMMXXOo,107119
+ datahub/metadata/_urns/urn_defs.py,sha256=LFHZGzHlDA0KJes1Xg7-lWetXusi7bubA7Q5hu4ER88,107119
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0rc20.dist-info/METADATA,sha256=KuTZA5lnEW-UAvSPqqkBsDFKkwlJF8WzYbcphVMW_aE,173559
- acryl_datahub-0.15.0rc20.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0rc20.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
- acryl_datahub-0.15.0rc20.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0rc20.dist-info/RECORD,,
+ acryl_datahub-0.15.0rc21.dist-info/METADATA,sha256=e3Tw7Cix7Z1uR8zyUtppjUv0ztJa2Kga0yl7nwPMbF8,173559
+ acryl_datahub-0.15.0rc21.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0rc21.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+ acryl_datahub-0.15.0rc21.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0rc21.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
  
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0rc20"
+ __version__ = "0.15.0rc21"
  
  
  def is_dev_mode() -> bool:
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,8 +1,7 @@
  import logging
- from contextlib import contextmanager
  from enum import Enum
  from pathlib import Path
- from typing import Generator, List, Optional
+ from typing import List, Optional
  
  import yaml
  from pydantic import validator
@@ -10,6 +9,7 @@ from ruamel.yaml import YAML
  
  from datahub.configuration.common import ConfigModel
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.global_context import get_graph_context, set_graph_context
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.metadata.schema_classes import (
      PropertyValueClass,
@@ -24,23 +24,10 @@ logger = logging.getLogger(__name__)
  class StructuredPropertiesConfig:
      """Configuration class to hold the graph client"""
  
-     _graph: Optional[DataHubGraph] = None
- 
-     @classmethod
-     @contextmanager
-     def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
-         """Context manager to temporarily set a custom graph"""
-         previous_graph = cls._graph
-         cls._graph = graph
-         try:
-             yield
-         finally:
-             cls._graph = previous_graph
- 
      @classmethod
-     def get_graph(cls) -> DataHubGraph:
+     def get_graph_required(cls) -> DataHubGraph:
          """Get the current graph, falling back to default if none set"""
-         return cls._graph if cls._graph is not None else get_default_graph()
+         return get_graph_context() or get_default_graph()
  
  
  class AllowedTypes(Enum):
@@ -79,7 +66,7 @@ class TypeQualifierAllowedTypes(ConfigModel):
      @validator("allowed_types", each_item=True)
      def validate_allowed_types(cls, v):
          if v:
-             graph = StructuredPropertiesConfig.get_graph()
+             graph = StructuredPropertiesConfig.get_graph_required()
              validated_urn = Urn.make_entity_type_urn(v)
              if not graph.exists(validated_urn):
                  raise ValueError(
@@ -106,7 +93,7 @@ class StructuredProperties(ConfigModel):
      @validator("entity_types", each_item=True)
      def validate_entity_types(cls, v):
          if v:
-             graph = StructuredPropertiesConfig.get_graph()
+             graph = StructuredPropertiesConfig.get_graph_required()
              validated_urn = Urn.make_entity_type_urn(v)
              if not graph.exists(validated_urn):
                  raise ValueError(
@@ -136,63 +123,64 @@ class StructuredProperties(ConfigModel):
  
      @staticmethod
      def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
-         emitter: DataHubGraph = graph if graph else get_default_graph()
-         with StructuredPropertiesConfig.use_graph(emitter):
-             print("Using graph")
+         with set_graph_context(graph):
+             graph = StructuredPropertiesConfig.get_graph_required()
+ 
              with open(file) as fp:
                  structuredproperties: List[dict] = yaml.safe_load(fp)
-                 for structuredproperty_raw in structuredproperties:
-                     structuredproperty = StructuredProperties.parse_obj(
-                         structuredproperty_raw
+             for structuredproperty_raw in structuredproperties:
+                 structuredproperty = StructuredProperties.parse_obj(
+                     structuredproperty_raw
+                 )
+ 
+                 if not structuredproperty.type.islower():
+                     structuredproperty.type = structuredproperty.type.lower()
+                     logger.warning(
+                         f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
                      )
-                     if not structuredproperty.type.islower():
-                         structuredproperty.type = structuredproperty.type.lower()
-                         logger.warn(
-                             f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
-                         )
-                     if not AllowedTypes.check_allowed_type(structuredproperty.type):
-                         raise ValueError(
-                             f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
-                         )
-                     mcp = MetadataChangeProposalWrapper(
-                         entityUrn=structuredproperty.urn,
-                         aspect=StructuredPropertyDefinitionClass(
-                             qualifiedName=structuredproperty.fqn,
-                             valueType=Urn.make_data_type_urn(structuredproperty.type),
-                             displayName=structuredproperty.display_name,
-                             description=structuredproperty.description,
-                             entityTypes=[
-                                 Urn.make_entity_type_urn(entity_type)
-                                 for entity_type in structuredproperty.entity_types or []
-                             ],
-                             cardinality=structuredproperty.cardinality,
-                             immutable=structuredproperty.immutable,
-                             allowedValues=(
-                                 [
-                                     PropertyValueClass(
-                                         value=v.value, description=v.description
-                                     )
-                                     for v in structuredproperty.allowed_values
-                                 ]
-                                 if structuredproperty.allowed_values
-                                 else None
-                             ),
-                             typeQualifier=(
-                                 {
-                                     "allowedTypes": structuredproperty.type_qualifier.allowed_types
-                                 }
-                                 if structuredproperty.type_qualifier
-                                 else None
-                             ),
-                         ),
+                 if not AllowedTypes.check_allowed_type(structuredproperty.type):
+                     raise ValueError(
+                         f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
                      )
-                     emitter.emit_mcp(mcp)
+                 mcp = MetadataChangeProposalWrapper(
+                     entityUrn=structuredproperty.urn,
+                     aspect=StructuredPropertyDefinitionClass(
+                         qualifiedName=structuredproperty.fqn,
+                         valueType=Urn.make_data_type_urn(structuredproperty.type),
+                         displayName=structuredproperty.display_name,
+                         description=structuredproperty.description,
+                         entityTypes=[
+                             Urn.make_entity_type_urn(entity_type)
+                             for entity_type in structuredproperty.entity_types or []
+                         ],
+                         cardinality=structuredproperty.cardinality,
+                         immutable=structuredproperty.immutable,
+                         allowedValues=(
+                             [
+                                 PropertyValueClass(
+                                     value=v.value, description=v.description
+                                 )
+                                 for v in structuredproperty.allowed_values
+                             ]
+                             if structuredproperty.allowed_values
+                             else None
+                         ),
+                         typeQualifier=(
+                             {
+                                 "allowedTypes": structuredproperty.type_qualifier.allowed_types
+                             }
+                             if structuredproperty.type_qualifier
+                             else None
+                         ),
+                     ),
+                 )
+                 graph.emit_mcp(mcp)
  
-                     logger.info(f"Created structured property {structuredproperty.urn}")
+                 logger.info(f"Created structured property {structuredproperty.urn}")
  
      @classmethod
      def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-         with StructuredPropertiesConfig.use_graph(graph):
+         with set_graph_context(graph):
              structured_property: Optional[
                  StructuredPropertyDefinitionClass
              ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
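The class-level `use_graph` context manager is gone; `StructuredPropertiesConfig` now reads the active graph from the new `datahub.ingestion.api.global_context` helpers, whose implementation is not included in this diff. A minimal sketch of how such helpers could be built on `contextvars` (illustrative only, not the shipped module):

```python
# Illustrative sketch only -- the real datahub.ingestion.api.global_context
# module is not part of this diff and may differ.
import contextlib
from contextvars import ContextVar
from typing import Iterator, Optional

from datahub.ingestion.graph.client import DataHubGraph

# Holds the graph client for the current context, if one has been set.
_graph_context: ContextVar[Optional[DataHubGraph]] = ContextVar(
    "graph_context", default=None
)


def get_graph_context() -> Optional[DataHubGraph]:
    """Return the graph set by the innermost set_graph_context(), or None."""
    return _graph_context.get()


@contextlib.contextmanager
def set_graph_context(graph: Optional[DataHubGraph]) -> Iterator[None]:
    """Temporarily set the current graph; restore the previous value on exit."""
    token = _graph_context.set(graph)
    try:
        yield
    finally:
        _graph_context.reset(token)
```

With helpers shaped like this, `get_graph_required()` can fall back to `get_default_graph()` whenever no graph is bound, which is how the rewritten `create()` and `from_datahub()` paths use it.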
datahub/emitter/rest_emitter.py CHANGED
@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
      os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
  )
  
- # The limit is 16mb. We will use a max of 15mb to have some space for overhead.
- _MAX_BATCH_INGEST_PAYLOAD_SIZE = 15 * 1024 * 1024
+ # The limit is 16mb. We will use a max of 15mb to have some space
+ # for overhead like request headers.
+ # This applies to pretty much all calls to GMS.
+ INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
+ 
+ # This limit is somewhat arbitrary. All GMS endpoints will timeout
+ # and return a 500 if processing takes too long. To avoid sending
+ # too much to the backend and hitting a timeout, we try to limit
+ # the number of MCPs we send in a batch.
+ BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
+     os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
+ )
  
  
  class DataHubRestEmitter(Closeable, Emitter):
@@ -290,11 +300,14 @@ class DataHubRestEmitter(Closeable, Emitter):
          # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
          # If we will exceed the limit, we need to break it up into chunks.
          mcp_obj_chunks: List[List[str]] = []
-         current_chunk_size = _MAX_BATCH_INGEST_PAYLOAD_SIZE
+         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
          for mcp_obj in mcp_objs:
              mcp_obj_size = len(json.dumps(mcp_obj))
  
-             if mcp_obj_size + current_chunk_size > _MAX_BATCH_INGEST_PAYLOAD_SIZE:
+             if (
+                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
+                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+             ):
                  mcp_obj_chunks.append([])
                  current_chunk_size = 0
              mcp_obj_chunks[-1].append(mcp_obj)
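The chunking loop above now splits batches on two limits instead of one: total serialized size (`INGEST_MAX_PAYLOAD_BYTES`) and item count (`BATCH_INGEST_MAX_PAYLOAD_LENGTH`). A standalone sketch of the same policy, with the constants copied from this diff and a hypothetical helper name:

```python
import json
from typing import Any, Dict, List

# Defaults mirroring this diff: ~15 MiB per request and at most 200 MCPs per
# batch (overridable via DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH).
INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
BATCH_INGEST_MAX_PAYLOAD_LENGTH = 200


def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """Split serialized MCPs into chunks bounded by both byte size and count."""
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_chunk_size = 0
    for mcp_obj in mcp_objs:
        mcp_obj_size = len(json.dumps(mcp_obj))
        if (
            current_chunk_size + mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES
            or len(chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
        ):
            # Start a new chunk once either limit would be exceeded.
            chunks.append([])
            current_chunk_size = 0
        chunks[-1].append(mcp_obj)
        current_chunk_size += mcp_obj_size
    return chunks
```

Capping the count as well as the size keeps each request under both the ~16 MB GMS payload limit and the endpoint timeout described in the new comments.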
datahub/ingestion/sink/datahub_rest.py CHANGED
@@ -18,7 +18,10 @@ from datahub.configuration.common import (
  )
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.emitter.mcp_builder import mcps_from_mce
- from datahub.emitter.rest_emitter import DataHubRestEmitter
+ from datahub.emitter.rest_emitter import (
+     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+     DataHubRestEmitter,
+ )
  from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
  from datahub.ingestion.api.sink import (
      NoopWriteCallback,
@@ -71,6 +74,14 @@ class DatahubRestSinkConfig(DatahubClientConfig):
      # Only applies in async batch mode.
      max_per_batch: pydantic.PositiveInt = 100
  
+     @pydantic.validator("max_per_batch", always=True)
+     def validate_max_per_batch(cls, v):
+         if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
+             raise ValueError(
+                 f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
+             )
+         return v
+ 
  
  @dataclasses.dataclass
  class DataHubRestSinkReport(SinkReport):
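The sink's `max_per_batch` option is now validated against the emitter-level `BATCH_INGEST_MAX_PAYLOAD_LENGTH` cap at config-parse time. A hedged usage sketch, assuming the config can be constructed directly with just a `server` URL (pydantic's `ValidationError` is a `ValueError` subclass):

```python
from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

# Within the cap (200 by default): accepted.
ok = DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=150)

# Above the cap: rejected by the new validator before the sink is built.
try:
    DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=5000)
except ValueError as e:
    print(f"rejected: {e}")
```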
datahub/ingestion/source/kafka/kafka_connect.py CHANGED
@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
          query: str
          transforms: list
  
-     def report_warning(self, key: str, reason: str) -> None:
-         logger.warning(f"{key}: {reason}")
-         self.report.report_warning(key, reason)
- 
      def get_parser(
          self,
          connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                      source_table = f"{table_name_tuple[-2]}.{source_table}"
              else:
                  include_source_dataset = False
-                 self.report_warning(
-                     self.connector_manifest.name,
-                     f"could not find schema for table {source_table}",
+                 self.report.warning(
+                     "Could not find schema for table"
+                     f"{self.connector_manifest.name} : {source_table}",
                  )
              dataset_name: str = get_dataset_name(database_name, source_table)
              lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                      target_platform=KAFKA,
                  )
                  lineages.append(lineage)
-             self.report_warning(
+             self.report.warning(
+                 "Could not find input dataset, the connector has query configuration set",
                  self.connector_manifest.name,
-                 "could not find input dataset, the connector has query configuration set",
              )
              self.connector_manifest.lineages = lineages
              return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                          include_source_dataset=False,
                      )
                  )
-             self.report_warning(
-                 self.connector_manifest.name,
-                 f"could not find input dataset, for connector topics {topic_names}",
+             self.report.warning(
+                 "Could not find input dataset for connector topics",
+                 f"{self.connector_manifest.name} : {topic_names}",
              )
              self.connector_manifest.lineages = lineages
              return
          else:
              include_source_dataset = True
              if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                 self.report_warning(
-                     self.connector_manifest.name,
-                     f"could not find input dataset, connector has unknown transform - {transforms[0]['type']}",
+                 self.report.warning(
+                     "Could not find input dataset, connector has unknown transform",
+                     f"{self.connector_manifest.name} : {transforms[0]['type']}",
                  )
                  include_source_dataset = False
              if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                 self.report_warning(
+                 self.report.warning(
+                     "Could not find input dataset, connector has one or more unknown transforms",
                      self.connector_manifest.name,
-                     "could not find input dataset, connector has one or more unknown transforms",
                  )
                  include_source_dataset = False
              lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
                  lineages.append(lineage)
              self.connector_manifest.lineages = lineages
          except Exception as e:
-             self.report.report_warning(
-                 self.connector_manifest.name, f"Error resolving lineage: {e}"
+             self.report.warning(
+                 "Error resolving lineage for connector",
+                 self.connector_manifest.name,
+                 exc=e,
              )
  
          return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
          defaultDataset: Optional[str] = None
          version: str = "v1"
  
-     def report_warning(self, key: str, reason: str) -> None:
-         logger.warning(f"{key}: {reason}")
-         self.report.report_warning(key, reason)
- 
      def get_parser(
          self,
          connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
              transformed_topic = self.apply_transformations(topic, transforms)
              dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
              if dataset_table is None:
-                 self.report_warning(
-                     self.connector_manifest.name,
-                     f"could not find target dataset for topic {transformed_topic}, please check your connector configuration",
+                 self.report.warning(
+                     "Could not find target dataset for topic, please check your connector configuration"
+                     f"{self.connector_manifest.name} : {transformed_topic} ",
                  )
                  continue
              target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
          schema_name: str
          topics_to_tables: Dict[str, str]
  
-     def report_warning(self, key: str, reason: str) -> None:
-         logger.warning(f"{key}: {reason}")
-         self.report.report_warning(key, reason)
- 
      def get_table_name_from_topic_name(self, topic_name: str) -> str:
          """
          This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
              )
              self.connector_manifest.lineages = lineages
          except Exception as e:
-             self.report.report_warning(
-                 self.connector_manifest.name, f"Error resolving lineage: {e}"
+             self.report.warning(
+                 "Error resolving lineage for connector",
+                 self.connector_manifest.name,
+                 exc=e,
              )
  
          return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
              )
              self.session.auth = (self.config.username, self.config.password)
  
-         test_response = self.session.get(f"{self.config.connect_uri}")
+         test_response = self.session.get(f"{self.config.connect_uri}/connectors")
          test_response.raise_for_status()
          logger.info(f"Connection to {self.config.connect_uri} is ok")
          if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
  
          payload = connector_response.json()
  
-         for c in payload:
-             connector_url = f"{self.config.connect_uri}/connectors/{c}"
-             connector_response = self.session.get(connector_url)
-             manifest = connector_response.json()
-             connector_manifest = ConnectorManifest(**manifest)
-             if not self.config.connector_patterns.allowed(connector_manifest.name):
-                 self.report.report_dropped(connector_manifest.name)
+         for connector_name in payload:
+             connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+             connector_manifest = self._get_connector_manifest(
+                 connector_name, connector_url
+             )
+             if (
+                 connector_manifest is None
+                 or not self.config.connector_patterns.allowed(connector_manifest.name)
+             ):
+                 self.report.report_dropped(connector_name)
                  continue
  
              if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
              connector_manifest.lineages = list()
              connector_manifest.url = connector_url
  
-             topics = self.session.get(
-                 f"{self.config.connect_uri}/connectors/{c}/topics",
-             ).json()
- 
-             connector_manifest.topic_names = topics[c]["topics"]
+             connector_manifest.topic_names = self._get_connector_topics(connector_name)
  
              # Populate Source Connector metadata
              if connector_manifest.type == SOURCE:
-                 tasks = self.session.get(
-                     f"{self.config.connect_uri}/connectors/{c}/tasks",
-                 ).json()
- 
-                 connector_manifest.tasks = tasks
+                 connector_manifest.tasks = self._get_connector_tasks(connector_name)
  
                  # JDBC source connector lineages
                  if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                  )
                  continue
  
-             for topic in topics:
+             for topic in connector_manifest.topic_names:
                  lineage = KafkaConnectLineage(
                      source_dataset=target_connector.source_dataset,
                      source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
  
          return connectors_manifest
  
+     def _get_connector_manifest(
+         self, connector_name: str, connector_url: str
+     ) -> Optional[ConnectorManifest]:
+         try:
+             connector_response = self.session.get(connector_url)
+             connector_response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Failed to get connector details", connector_name, exc=e
+             )
+             return None
+         manifest = connector_response.json()
+         connector_manifest = ConnectorManifest(**manifest)
+         return connector_manifest
+ 
+     def _get_connector_tasks(self, connector_name: str) -> dict:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector tasks", context=connector_name, exc=e
+             )
+             return {}
+ 
+         return response.json()
+ 
+     def _get_connector_topics(self, connector_name: str) -> List[str]:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector topics", context=connector_name, exc=e
+             )
+             return []
+ 
+         return response.json()[connector_name]["topics"]
+ 
      def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
          connector_name = connector.name
          connector_type = connector.type
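The connector discovery loop now delegates the per-connector REST calls to `_get_connector_manifest`, `_get_connector_tasks`, and `_get_connector_topics`, each of which converts an HTTP failure into a structured warning plus an empty result instead of aborting the whole run. A standalone sketch of that pattern with plain `requests` (function and parameter names here are illustrative, not part of the source):

```python
import logging
from typing import List

import requests

logger = logging.getLogger(__name__)


def get_connector_topics(
    session: requests.Session, connect_uri: str, connector_name: str
) -> List[str]:
    """Fetch a connector's topic list, degrading to [] if the REST call fails."""
    try:
        response = session.get(f"{connect_uri}/connectors/{connector_name}/topics")
        response.raise_for_status()
    except Exception:
        # The source records a structured report warning here; a log line
        # stands in for that in this standalone sketch.
        logger.warning("Error getting connector topics: %s", connector_name, exc_info=True)
        return []
    return response.json()[connector_name]["topics"]
```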
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py CHANGED
@@ -413,9 +413,10 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
              return UpstreamLineageEdge.parse_obj(db_row)
          except Exception as e:
              self.report.num_upstream_lineage_edge_parsing_failed += 1
+             upstream_tables = db_row.get("UPSTREAM_TABLES")
              self.structured_reporter.warning(
                  "Failed to parse lineage edge",
-                 context=db_row.get("DOWNSTREAM_TABLE_NAME") or None,
+                 context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
                  exc=e,
              )
              return None
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -237,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
  LIMIT {limit} {from_clause};
  """
  
+     @staticmethod
+     def get_secure_view_definitions() -> str:
+         # https://docs.snowflake.com/en/sql-reference/account-usage/views
+         return """
+         SELECT
+             TABLE_CATALOG as "TABLE_CATALOG",
+             TABLE_SCHEMA as "TABLE_SCHEMA",
+             TABLE_NAME as "TABLE_NAME",
+             VIEW_DEFINITION as "VIEW_DEFINITION"
+         FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+         WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+         """
+ 
      @staticmethod
      def columns_for_schema(
          schema_name: str,
datahub/ingestion/source/snowflake/snowflake_schema.py CHANGED
@@ -266,6 +266,22 @@ class SnowflakeDataDictionary(SupportsAsObj):
              snowflake_schemas.append(snowflake_schema)
          return snowflake_schemas
  
+     @serialized_lru_cache(maxsize=1)
+     def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+         secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+             lambda: defaultdict(lambda: defaultdict())
+         )
+         cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+         for view in cur:
+             db_name = view["TABLE_CATALOG"]
+             schema_name = view["TABLE_SCHEMA"]
+             view_name = view["TABLE_NAME"]
+             secure_view_definitions[db_name][schema_name][view_name] = view[
+                 "VIEW_DEFINITION"
+             ]
+ 
+         return secure_view_definitions
+ 
      @serialized_lru_cache(maxsize=1)
      def get_tables_for_database(
          self, db_name: str
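`get_secure_view_definitions()` returns a triply nested mapping keyed by database, schema, and view name, so a caller can fetch a secure view's DDL with plain dictionary lookups. A minimal consumption sketch with illustrative names (the actual call sites are not shown in this diff):

```python
from typing import Dict, Optional


def lookup_secure_view_definition(
    definitions: Dict[str, Dict[str, Dict[str, str]]],
    db_name: str,
    schema_name: str,
    view_name: str,
) -> Optional[str]:
    """Return the secure view's definition text, or None if it was not captured."""
    return definitions.get(db_name, {}).get(schema_name, {}).get(view_name)
```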