acryl-datahub 1.0.0.4rc6__py3-none-any.whl → 1.0.0.4rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
- acryl_datahub-1.0.0.4rc6.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.0.0.4rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=lFUyplnN9ea9C0jyvH1T6H0kVRxk5DIQVEdHH_34ut4,323
4
+ datahub/_version.py,sha256=8uGokiOv-wIz3FMuxJJSoDgRcEr4zbywxDQ8l-_DLVs,323
5
5
  datahub/entrypoints.py,sha256=AQN5MzCe6q3LKI4SS6WmwN56kgjF6AC1ld7yELWVP2w,8953
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -266,8 +266,8 @@ datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=_5cAXVU8b8T_nAP
266
266
  datahub/ingestion/source/bigquery_v2/usage.py,sha256=A9c-ofclaRk0NSnc4IRaqJYqMPv6ecCld_TPy3V2qFs,40748
267
267
  datahub/ingestion/source/cassandra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
268
268
  datahub/ingestion/source/cassandra/cassandra.py,sha256=lKvPP0Uahi9xw_yh9cArPPtwvAauXolaEk-6f-jhpz4,14558
269
- datahub/ingestion/source/cassandra/cassandra_api.py,sha256=UVGQTsk6O57Q6wrWo54bQPLtStTWhw_Fq6fgW3Bjgk8,12515
270
- datahub/ingestion/source/cassandra/cassandra_config.py,sha256=vIMUOzazWTGi03B51vI0-YMxaMJHUGmCxJJgd8pKhC8,3791
269
+ datahub/ingestion/source/cassandra/cassandra_api.py,sha256=b7MApc3_tEfHoj-6ub6snkcv_DweL1wi_TGJjAA1-yU,13516
270
+ datahub/ingestion/source/cassandra/cassandra_config.py,sha256=Ga9915cDZukR5-u2tMNx5Jkf8eza2oAE5YS_sQIVEVQ,4222
271
271
  datahub/ingestion/source/cassandra/cassandra_profiling.py,sha256=ZqsAY8NFsrrLqOduV7Aem2eJLtc2_OU9tW4tc_dh0V8,10984
272
272
  datahub/ingestion/source/cassandra/cassandra_utils.py,sha256=j-LidYkaCTmGnpUVNLsax_c3z32PsQbsbHeYojygd1s,5105
273
273
  datahub/ingestion/source/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -482,7 +482,7 @@ datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfv
482
482
  datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
483
483
  datahub/ingestion/source/sql/oracle.py,sha256=ftnrk3iiEelyv9PBHPYbairuP1WgxZbi1gu6YdqY69E,29965
484
484
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
485
- datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
485
+ datahub/ingestion/source/sql/presto.py,sha256=tATa0M2q0PjUC_E9W_jSUsmKTP7cVJayLgrFMzG_eao,4223
486
486
  datahub/ingestion/source/sql/sql_common.py,sha256=jsweel_-vesNtcPonnfS11OUrlcZnS3wGt5r0dYTPnM,48637
487
487
  datahub/ingestion/source/sql/sql_config.py,sha256=u3nGZYYl1WtaxfNsDU5bglgZ5Jq3Fxk9xei_CUIAXB0,8222
488
488
  datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
@@ -494,7 +494,7 @@ datahub/ingestion/source/sql/sqlalchemy_data_reader.py,sha256=FvHZ4JEK3aR2DYOBZi
494
494
  datahub/ingestion/source/sql/sqlalchemy_uri.py,sha256=u0ZvgdJjXZdo_vl7YIQfYuuWbGwpnH6OSozI2e8ZV4I,858
495
495
  datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py,sha256=KOpbmDIE2h1hyYEsbVHJi2B7FlsyUMTXZx4diyzltQg,1826
496
496
  datahub/ingestion/source/sql/teradata.py,sha256=9WdrxDy02lRJi9IZgsAATFsmxcQnIw5Gr6yCqHJQy5k,33507
497
- datahub/ingestion/source/sql/trino.py,sha256=gSLDyETKavSVR8l9wdebrfoc41cqAWz6ApqIicW0BF8,17892
497
+ datahub/ingestion/source/sql/trino.py,sha256=zIfQ6GvW8Sbw4sxqsTcnibT51STka_nzNYvmld6HfHw,18947
498
498
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=AB3Gtx4omAy_08zadHQpmUGmIGufkZ6o_ihWNnfvzYc,5783
499
499
  datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
500
500
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
@@ -1054,8 +1054,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1054
1054
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1055
1055
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1056
1056
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1057
- acryl_datahub-1.0.0.4rc6.dist-info/METADATA,sha256=Ry6qydIyyjAptNjSQr46P_yzOTtYyW2WaIzk8SQI6Rw,179949
1058
- acryl_datahub-1.0.0.4rc6.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
1059
- acryl_datahub-1.0.0.4rc6.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
1060
- acryl_datahub-1.0.0.4rc6.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1061
- acryl_datahub-1.0.0.4rc6.dist-info/RECORD,,
1057
+ acryl_datahub-1.0.0.4rc7.dist-info/METADATA,sha256=GIO2L9JPpY18rRVypRWukqtZ0AEjbEN6whMxK5MChZY,179949
1058
+ acryl_datahub-1.0.0.4rc7.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
1059
+ acryl_datahub-1.0.0.4rc7.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
1060
+ acryl_datahub-1.0.0.4rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1061
+ acryl_datahub-1.0.0.4rc7.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.0.0.4rc6"
3
+ __version__ = "1.0.0.4rc7"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -1,3 +1,4 @@
1
+ import ssl
1
2
  from dataclasses import dataclass, field
2
3
  from typing import Any, Dict, List, Optional
3
4
 
@@ -128,6 +129,23 @@ class CassandraAPI:
128
129
 
129
130
  self._cassandra_session = cluster.connect()
130
131
  return True
132
+
133
+ ssl_context = None
134
+ if self.config.ssl_ca_certs:
135
+ ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
136
+ ssl_context.load_verify_locations(self.config.ssl_ca_certs)
137
+ if self.config.ssl_certfile and self.config.ssl_keyfile:
138
+ ssl_context.load_cert_chain(
139
+ certfile=self.config.ssl_certfile,
140
+ keyfile=self.config.ssl_keyfile,
141
+ )
142
+ elif self.config.ssl_certfile or self.config.ssl_keyfile:
143
+ # If one is provided, the other must be too.
144
+ # This is a simplification; real-world scenarios might allow one without the other depending on setup.
145
+ raise ValueError(
146
+ "Both ssl_certfile and ssl_keyfile must be provided if one is specified."
147
+ )
148
+
131
149
  if self.config.username and self.config.password:
132
150
  auth_provider = PlainTextAuthProvider(
133
151
  username=self.config.username, password=self.config.password
@@ -136,12 +154,14 @@ class CassandraAPI:
136
154
  [self.config.contact_point],
137
155
  port=self.config.port,
138
156
  auth_provider=auth_provider,
157
+ ssl_context=ssl_context,
139
158
  load_balancing_policy=None,
140
159
  )
141
160
  else:
142
161
  cluster = Cluster(
143
162
  [self.config.contact_point],
144
163
  port=self.config.port,
164
+ ssl_context=ssl_context,
145
165
  load_balancing_policy=None,
146
166
  )
147
167
 
@@ -79,6 +79,21 @@ class CassandraSourceConfig(
79
79
  description="Configuration for cloud-based Cassandra, such as DataStax Astra DB.",
80
80
  )
81
81
 
82
+ ssl_ca_certs: Optional[str] = Field(
83
+ default=None,
84
+ description="Path to the CA certificate file for SSL connections.",
85
+ )
86
+
87
+ ssl_certfile: Optional[str] = Field(
88
+ default=None,
89
+ description="Path to the SSL certificate file for SSL connections.",
90
+ )
91
+
92
+ ssl_keyfile: Optional[str] = Field(
93
+ default=None,
94
+ description="Path to the SSL key file for SSL connections.",
95
+ )
96
+
82
97
  keyspace_pattern: AllowDenyPattern = Field(
83
98
  default=AllowDenyPattern.allow_all(),
84
99
  description="Regex patterns to filter keyspaces for ingestion.",
@@ -1,10 +1,12 @@
1
+ import functools
1
2
  from textwrap import dedent
2
- from typing import Optional
3
+ from typing import Dict, Optional
3
4
 
4
5
  from pydantic.fields import Field
5
6
  from pyhive.sqlalchemy_presto import PrestoDialect
6
7
  from sqlalchemy import exc, sql
7
8
  from sqlalchemy.engine import reflection
9
+ from sqlalchemy.engine.base import Engine
8
10
 
9
11
  from datahub.ingestion.api.common import PipelineContext
10
12
  from datahub.ingestion.api.decorators import (
@@ -114,3 +116,18 @@ class PrestoSource(TrinoSource):
114
116
  def create(cls, config_dict, ctx):
115
117
  config = PrestoConfig.parse_obj(config_dict)
116
118
  return cls(config, ctx)
119
+
120
+
121
+ # Unfortunately, the Presto dialect provides catalog_name as a column
122
+ # therefore we need some workaround to not fail.
123
+ # This workaround causes us to only get the table comment as a property, which is still better than failing.
124
+ @functools.lru_cache
125
+ def gen_catalog_connector_dict(engine: Engine) -> Dict[str, str]:
126
+ query = dedent(
127
+ """
128
+ SELECT *
129
+ FROM "system"."metadata"."catalogs"
130
+ """
131
+ ).strip()
132
+ res = engine.execute(sql.text(query))
133
+ return {row.catalog_name: "" for row in res}
@@ -134,19 +134,41 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
134
134
  ):
135
135
  properties_table = self._get_full_table(f"{table_name}$properties", schema)
136
136
  query = f"SELECT * FROM {properties_table}"
137
- row = connection.execute(sql.text(query)).fetchone()
137
+ rows = connection.execute(sql.text(query)).fetchall()
138
138
 
139
139
  # Generate properties dictionary.
140
140
  properties = {}
141
- if row:
141
+
142
+ if len(rows) == 0:
143
+ # No properties found, return empty dictionary
144
+ return {}
145
+
146
+ # Check if using the old format (key, value columns)
147
+ if (
148
+ connector_name == "iceberg"
149
+ and len(rows[0]) == 2
150
+ and "key" in rows[0]
151
+ and "value" in rows[0]
152
+ ):
153
+ # https://trino.io/docs/current/connector/iceberg.html#properties-table
154
+ for row in rows:
155
+ if row["value"] is not None:
156
+ properties[row["key"]] = row["value"]
157
+ return {"text": properties.get("comment"), "properties": properties}
158
+ elif connector_name == "hive" and len(rows[0]) > 1 and len(rows) == 1:
159
+ # https://trino.io/docs/current/connector/hive.html#properties-table
160
+ row = rows[0]
142
161
  for col_name, col_value in row.items():
143
162
  if col_value is not None:
144
163
  properties[col_name] = col_value
164
+ return {"text": properties.get("comment"), "properties": properties}
145
165
 
146
- return {"text": properties.get("comment"), "properties": properties}
147
- else:
148
- return self.get_table_comment_default(connection, table_name, schema)
149
- except Exception:
166
+ # If we can't get the properties we still fallback to the default
167
+ return self.get_table_comment_default(connection, table_name, schema)
168
+ except Exception as e:
169
+ logging.warning(
170
+ f"Failed to get table comment for {table_name} in {schema}: {e}"
171
+ )
150
172
  return {}
151
173
 
152
174