acryl-datahub 1.1.1rc2__py3-none-any.whl → 1.1.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/METADATA +2612 -2610
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/RECORD +35 -33
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/metadata/_internal_schema_classes.py +515 -515
- datahub/metadata/_urns/urn_defs.py +1785 -1785
- datahub/metadata/schema.avsc +17269 -17639
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +4 -0
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +2 -1
- datahub/sdk/dataflow.py +302 -0
- datahub/sdk/datajob.py +335 -0
- datahub/sdk/entity_client.py +8 -0
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -1,10 +1,10 @@
-import contextlib
 import json
 import logging
+import time
 from datetime import datetime
 from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar

-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text

 from datahub.emitter.aspect import ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)

 # Should work for at least mysql, mariadb, postgres
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+DATE_FORMAT = "%Y-%m-%d"

 ROW = TypeVar("ROW", bound=Dict[str, Any])

@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
             **connection_config.options,
         )

+        # Cache for available dates to avoid redundant queries
+        self.available_dates_cache: Optional[List[datetime]] = None
+
     @property
     def soft_deleted_urns_query(self) -> str:
         return f"""
@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
             ORDER BY mav.urn
         """

-
-
-
-
+    def query(self, set_structured_properties_filter: bool) -> str:
+        """
+        Main query that gets data for specified date range with appropriate filters.
+        """
+        structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"

-        # Ensures stable order, chronological per (urn, aspect)
-        # Relies on createdon order to reflect version order
-        # Ordering of entries with the same createdon is handled by VersionOrderer
         return f"""
             SELECT *
             FROM (
@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
                 {"" if self.config.include_all_versions else "AND mav.version = 0"}
                 {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
                 AND mav.createdon >= %(since_createdon)s
+                AND mav.createdon < %(end_createdon)s
             ORDER BY
                 createdon,
                 urn,
@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
                 version
             ) as t
             WHERE 1=1
-                {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+                {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+                {structured_prop_filter}
             ORDER BY
                 createdon,
                 urn,
                 aspect,
                 version
+            LIMIT %(limit)s
+            OFFSET %(offset)s
         """

+    def execute_with_params(
+        self, query: str, params: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
+        """Execute query with proper parameter binding that works with your database"""
+        with self.engine.connect() as conn:
+            result = conn.execute(query, params or {})
+            return [dict(row) for row in result.fetchall()]
+
     def execute_server_cursor(
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
+        """Execute a query with server-side cursor"""
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with (
                     conn.begin()
                 ):  # Transaction required for PostgreSQL server-side cursor
-                    #
-
+                    # Set query timeout at the connection level
+                    if self.config.query_timeout:
+                        if self.engine.dialect.name == "postgresql":
+                            conn.execute(
+                                text(
+                                    f"SET statement_timeout = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+                        elif self.engine.dialect.name in ["mysql", "mariadb"]:
+                            conn.execute(
+                                text(
+                                    f"SET max_execution_time = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+
+                    # Stream results with batch size
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
                     )
+
+                    # Execute query - using native parameterization without text()
+                    # to maintain compatibility with your original code
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
+
+                    return  # Success, exit the retry loop
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")

     def _get_rows(
-        self,
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        set_structured_properties_filter: bool,
+        limit: int,
     ) -> Iterable[Dict[str, Any]]:
-
-
-            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-        }
-        yield from self.execute_server_cursor(self.query, params)
+        """
+        Retrieves data rows within a specified date range using pagination.

-
+        Implements a hybrid pagination strategy that switches between time-based and
+        offset-based approaches depending on the returned data. Uses server-side
+        cursors for efficient memory usage.
+
+        Note: May return duplicate rows across batch boundaries when multiple rows
+        share the same 'createdon' timestamp. This is expected behavior when
+        transitioning between pagination methods.
+
+        Args:
+            start_date: Beginning of date range (inclusive)
+            end_date: End of date range (exclusive)
+            set_structured_properties_filter: Whether to apply structured filtering
+            limit: Maximum rows to fetch per query
+
+        Returns:
+            An iterable of database rows as dictionaries
+        """
+        offset = 0
+        last_createdon = None
+        first_iteration = True
+
+        while True:
+            try:
+                # Set up query and parameters - using named parameters
+                query = self.query(set_structured_properties_filter)
+                params: Dict[str, Any] = {
+                    "since_createdon": start_date.strftime(DATETIME_FORMAT),
+                    "end_createdon": end_date.strftime(DATETIME_FORMAT),
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+                # Add exclude_aspects if needed
+                if (
+                    hasattr(self.config, "exclude_aspects")
+                    and self.config.exclude_aspects
+                ):
+                    params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+                logger.info(
+                    f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+                    f"with limit {limit} and offset {offset} (inclusive range)"
+                )
+
+                # Execute query with server-side cursor
+                rows = self.execute_server_cursor(query, params)
+                # Process and yield rows
+                rows_processed = 0
+                for row in rows:
+                    if first_iteration:
+                        start_date = row.get("createdon", start_date)
+                        first_iteration = False
+
+                    last_createdon = row.get("createdon")
+                    rows_processed += 1
+                    yield row
+
+                # If we processed fewer than the limit or no last_createdon, we're done
+                if rows_processed < limit or not last_createdon:
+                    break
+
+                # Update parameters for next iteration
+                if start_date != last_createdon:
+                    start_date = last_createdon
+                    offset = 0
+                else:
+                    offset += limit
+
+                logger.info(
+                    f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error processing date range {start_date} to {end_date}: {str(e)}"
+                )
+                # Re-raise the exception after logging
+                raise
+
+    def get_all_aspects(
         self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+        logger.info("Fetching Structured properties aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=True,
+        )
+
+        logger.info(
+            f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+        )
+
+        time.sleep(
+            self.config.structured_properties_template_cache_invalidation_interval
+        )
+
+        logger.info("Fetching aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=False,
+        )
+
+    def get_aspects(
+        self,
+        from_createdon: datetime,
+        stop_time: datetime,
+        set_structured_properties_filter: bool = False,
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
         orderer = VersionOrderer[Dict[str, Any]](
             enabled=self.config.include_all_versions
         )
-        rows = self._get_rows(
+        rows = self._get_rows(
+            start_date=from_createdon,
+            end_date=stop_time,
+            set_structured_properties_filter=set_structured_properties_filter,
+            limit=self.config.database_query_batch_size,
+        )
         for row in orderer(rows):
             mcp = self._parse_row(row)
             if mcp:
@@ -190,23 +337,29 @@ class DataHubDatabaseReader:

     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
-        Fetches all soft-deleted entities from the database.
+        Fetches all soft-deleted entities from the database using pagination.

         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-
-
-
-        logger.debug("
-
-
-
-
-
-
-
-
+        try:
+            params: Dict = {}
+
+            logger.debug("Fetching soft-deleted URNs")
+
+            # Use server-side cursor implementation
+            rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+            processed_rows = 0
+            # Process and yield rows
+            for row in rows:
+                processed_rows += 1
+                yield row
+
+            logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+        except Exception:
+            logger.exception("Error fetching soft-deleted row", exc_info=True)
+            raise

     def _parse_row(
         self, row: Dict[str, Any]
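
The `_get_rows` docstring above describes a hybrid pagination strategy: keep advancing the `since_createdon` lower bound while timestamps move forward, and fall back to an OFFSET only when an entire batch shares one timestamp. The sketch below is a minimal, self-contained illustration of that loop, assuming a hypothetical `fetch_page` callable in place of the real server-side-cursor query; it is illustrative, not the packaged implementation.

```python
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, Iterable, List

Row = Dict[str, Any]


def paginate_hybrid(
    fetch_page: Callable[[datetime, datetime, int, int], List[Row]],
    start_date: datetime,
    end_date: datetime,
    limit: int,
) -> Iterable[Row]:
    """Yield rows in [start_date, end_date), advancing by 'createdon' when it
    moves forward and by OFFSET when a whole page shares one timestamp."""
    offset = 0
    while True:
        rows = fetch_page(start_date, end_date, limit, offset)
        last_createdon = None
        for row in rows:
            last_createdon = row["createdon"]
            yield row
        if len(rows) < limit or last_createdon is None:
            break  # final (possibly empty) page
        if last_createdon != start_date:
            start_date = last_createdon  # time-based advance; reset the offset
            offset = 0
        else:
            offset += limit  # every row shared one timestamp; page by offset


if __name__ == "__main__":
    # 25 stored rows, 11 of which share the earliest timestamp, read in pages of 10.
    base = datetime(2024, 1, 1)
    data = sorted(
        [{"createdon": base + timedelta(seconds=i), "urn": f"urn:{i}"} for i in range(15)]
        + [{"createdon": base, "urn": f"urn:dup{i}"} for i in range(10)],
        key=lambda r: r["createdon"],
    )

    def fetch_page(since: datetime, until: datetime, limit: int, offset: int) -> List[Row]:
        window = [r for r in data if since <= r["createdon"] < until]
        return window[offset : offset + limit]

    rows = list(paginate_hybrid(fetch_page, base, base + timedelta(days=1), limit=10))
    print(len(rows))  # 26: one boundary row repeats, as the docstring above warns
```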

datahub/ingestion/source/datahub/datahub_source.py

@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
         progress = ProgressTimer(report_every=timedelta(seconds=60))
-        mcps = reader.
+        mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
            if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                continue

datahub/ingestion/source/dbt/dbt_common.py

@@ -132,6 +132,12 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_column_errors: int = 0
     sql_parser_successes: int = 0

+    # Details on where column info comes from.
+    nodes_with_catalog_columns: int = 0
+    nodes_with_inferred_columns: int = 0
+    nodes_with_graph_columns: int = 0
+    nodes_with_no_columns: int = 0
+
     sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
     sql_parser_detach_ctes_failures_list: LossyList[str] = field(
         default_factory=LossyList
@@ -619,14 +625,8 @@ class DBTNode:
     def exists_in_target_platform(self):
         return not (self.is_ephemeral_model() or self.node_type == "test")

-    def
-        """
-        Update the column list if they are not already set.
-        """
-
-        if self.columns:
-            # If we already have columns, don't overwrite them.
-            return
+    def set_columns(self, schema_fields: List[SchemaField]) -> None:
+        """Update the column list."""

         self.columns = [
             DBTColumn(
@@ -1248,9 +1248,28 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 target_node_urn, self._to_schema_info(inferred_schema_fields)
             )

-        #
-
-
+        # When updating the node's columns, our order of preference is:
+        # 1. Schema from the dbt catalog
+        # 2. Inferred schema
+        # 3. Schema fetched from the graph
+        if node.columns:
+            self.report.nodes_with_catalog_columns += 1
+            pass  # we already have columns from the dbt catalog
+        elif inferred_schema_fields:
+            logger.debug(
+                f"Using {len(inferred_schema_fields)} inferred columns for {node.dbt_name}"
+            )
+            self.report.nodes_with_inferred_columns += 1
+            node.set_columns(inferred_schema_fields)
+        elif schema_fields:
+            logger.debug(
+                f"Using {len(schema_fields)} graph columns for {node.dbt_name}"
+            )
+            self.report.nodes_with_graph_columns += 1
+            node.set_columns(schema_fields)
+        else:
+            logger.debug(f"No columns found for {node.dbt_name}")
+            self.report.nodes_with_no_columns += 1

     def _parse_cll(
         self,
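
The added block above chooses a node's column source in a fixed order: columns already present from the dbt catalog win, then the inferred schema, then the schema fetched from the graph, and one of the new report counters is bumped for each outcome. A minimal standalone sketch of that decision, using plain lists and a counter dict in place of the real DBTNode and report objects (all names here are illustrative):

```python
from typing import Dict, List, Optional


def choose_columns(
    catalog_columns: List[str],
    inferred_columns: Optional[List[str]],
    graph_columns: Optional[List[str]],
    report: Dict[str, int],
) -> List[str]:
    """Pick columns by preference (catalog > inferred > graph) and record the winner."""
    if catalog_columns:
        report["nodes_with_catalog_columns"] = report.get("nodes_with_catalog_columns", 0) + 1
        return catalog_columns
    if inferred_columns:
        report["nodes_with_inferred_columns"] = report.get("nodes_with_inferred_columns", 0) + 1
        return inferred_columns
    if graph_columns:
        report["nodes_with_graph_columns"] = report.get("nodes_with_graph_columns", 0) + 1
        return graph_columns
    report["nodes_with_no_columns"] = report.get("nodes_with_no_columns", 0) + 1
    return []


report: Dict[str, int] = {}
print(choose_columns([], ["id", "name"], ["id"], report))  # ['id', 'name']
print(report)  # {'nodes_with_inferred_columns': 1}
```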

datahub/ingestion/source/hex/query_fetcher.py

@@ -18,8 +18,12 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)

 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-#
-
+# Context values:
+# - SCHEDULED_RUN: The query was executed during a scheduled run of a published Hex app.
+# - LOGIC_VIEW: The query was executed from the Hex project's notebook view. This happens when a user is actively editing a Hex notebook: When they first open and run it or when they rerun without cached results.
+# - APP_VIEW: The query was executed during a published app session. This happens when a user opens up a published app or reruns the app without cached results.
+# Only match metadata with "context": "SCHEDULED_RUN|APP_VIEW" to filter out those from notebook, which may bring more noise from development than value
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


 @dataclass
@@ -197,13 +201,15 @@ class HexQueryFetcher:
         Example:
             -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

-
+        TODO: Consider supporting multiline metadata format in the future:
         # -- Hex query metadata: {
         # -- "categories": ["Scratchpad"],
         # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
         # -- ...
         # -- }

+        TODO: Extract based on pattern matching is strict on the order of the keys in the metadata. Consider using a more flexible approach like JSON parsing.
+
         Returns:
             A tuple of (project_id, workspace_name) if both are successfully extracted
             None if extraction fails for any reason
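
The new `HEX_METADATA_PATTERN` only matches metadata whose `context` is `SCHEDULED_RUN` or `APP_VIEW`, capturing the project id and the workspace segment of the project URL. Below is a quick check against the example comment from the docstring above; the pattern and sample string are taken from this diff, only the surrounding script is illustrative.

```python
import re

HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

sql_comment = (
    '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
    '"connection": "Long Tail Companions", "context": "SCHEDULED_RUN", '
    '"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/'
    'd73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}'
)

match = re.search(HEX_METADATA_PATTERN, sql_comment)
if match:
    project_id, workspace_name = match.groups()
    print(project_id)      # d73da67d-c87b-4dd8-9e7f-b79cb7f822cf
    print(workspace_name)  # acryl-partnership

# Queries run from the notebook view carry "context": "LOGIC_VIEW" and are deliberately not matched.
print(re.search(HEX_METADATA_PATTERN, sql_comment.replace("SCHEDULED_RUN", "LOGIC_VIEW")))  # None
```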

datahub/ingestion/source/openapi.py

@@ -82,6 +82,9 @@ class OpenApiConfig(ConfigModel):
     get_token: dict = Field(
         default={}, description="Retrieving a token from the endpoint."
     )
+    verify_ssl: bool = Field(
+        default=True, description="Enable SSL certificate verification"
+    )

     @validator("bearer_token", always=True)
     def ensure_only_one_token(
@@ -129,12 +132,14 @@ class OpenApiConfig(ConfigModel):
                 tok_url=url4req,
                 method=self.get_token["request_type"],
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )
             sw_dict = get_swag_json(
                 self.url,
                 token=self.token,
                 swagger_file=self.swagger_file,
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )  # load the swagger file

         else:  # using basic auth for accessing endpoints
@@ -144,6 +149,7 @@ class OpenApiConfig(ConfigModel):
                 password=self.password,
                 swagger_file=self.swagger_file,
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )
         return sw_dict

@@ -343,6 +349,7 @@ class APISource(Source, ABC):
                     tot_url,
                     token=config.token,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             else:
                 response = request_call(
@@ -350,6 +357,7 @@ class APISource(Source, ABC):
                     username=config.username,
                     password=config.password,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             if response.status_code == 200:
                 fields2add, root_dataset_samples[dataset_name] = extract_fields(
@@ -380,6 +388,7 @@ class APISource(Source, ABC):
                     tot_url,
                     token=config.token,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             else:
                 response = request_call(
@@ -387,6 +396,7 @@ class APISource(Source, ABC):
                     username=config.username,
                     password=config.password,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             if response.status_code == 200:
                 fields2add, _ = extract_fields(response, dataset_name)
@@ -415,6 +425,7 @@ class APISource(Source, ABC):
                     tot_url,
                     token=config.token,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             else:
                 response = request_call(
@@ -422,6 +433,7 @@ class APISource(Source, ABC):
                     username=config.username,
                     password=config.password,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             if response.status_code == 200:
                 fields2add, _ = extract_fields(response, dataset_name)
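
Every outbound call in this source now forwards a single `verify_ssl` flag from `OpenApiConfig` down to `requests`. The sketch below shows the same pattern with a hypothetical helper rather than the package's own `request_call`; disabling verification is generally only appropriate for endpoints that use self-signed certificates.

```python
from typing import Optional

import requests


def fetch_json(url: str, token: Optional[str] = None, verify_ssl: bool = True) -> dict:
    """GET a JSON document, optionally skipping TLS certificate verification."""
    headers = {"accept": "application/json"}
    if token is not None:
        headers["Authorization"] = token
    response = requests.get(url, headers=headers, verify=verify_ssl)
    response.raise_for_status()
    return response.json()


# With verify_ssl=False requests skips certificate validation (and emits a warning),
# which is what setting `verify_ssl: false` in the source config opts into.
# spec = fetch_json("https://self-signed.example.internal/openapi.json", verify_ssl=False)
```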

datahub/ingestion/source/openapi_parser.py

@@ -59,17 +59,21 @@ def request_call(
     username: Optional[str] = None,
     password: Optional[str] = None,
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> requests.Response:
     headers = {"accept": "application/json"}
     if username is not None and password is not None:
         return requests.get(
-            url,
+            url,
+            headers=headers,
+            auth=HTTPBasicAuth(username, password),
+            verify=verify_ssl,
         )
     elif token is not None:
         headers["Authorization"] = f"{token}"
-        return requests.get(url, proxies=proxies, headers=headers)
+        return requests.get(url, proxies=proxies, headers=headers, verify=verify_ssl)
     else:
-        return requests.get(url, headers=headers)
+        return requests.get(url, headers=headers, verify=verify_ssl)


 def get_swag_json(
@@ -79,10 +83,16 @@ def get_swag_json(
     password: Optional[str] = None,
     swagger_file: str = "",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> Dict:
     tot_url = url + swagger_file
     response = request_call(
-        url=tot_url,
+        url=tot_url,
+        token=token,
+        username=username,
+        password=password,
+        proxies=proxies,
+        verify_ssl=verify_ssl,
     )

     if response.status_code != 200:
@@ -127,37 +137,45 @@ def get_endpoints(sw_dict: dict) -> dict:
     check_sw_version(sw_dict)

     for p_k, p_o in sw_dict["paths"].items():
-        method
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        url_details[p_k]
-
-
-
-
+        for method, method_spec in p_o.items():
+            # skip non-method keys like "parameters"
+            if method.lower() not in [
+                "get",
+                "post",
+                "put",
+                "delete",
+                "patch",
+                "options",
+                "head",
+            ]:
+                continue
+
+            responses = method_spec.get("responses", {})
+            base_res = responses.get("200") or responses.get(200)
+            if not base_res:
+                # if there is no 200 response, we skip this method
+                continue
+
+            # if the description is not present, we will use the summary
+            # if both are not present, we will use an empty string
+            desc = method_spec.get("description") or method_spec.get("summary", "")
+
+            # if the tags are not present, we will use an empty list
+            tags = method_spec.get("tags", [])
+
+            url_details[p_k] = {
+                "description": desc,
+                "tags": tags,
+                "method": method.upper(),
+            }
+
+            example_data = check_for_api_example_data(base_res, p_k)
+            if example_data:
+                url_details[p_k]["data"] = example_data
+
+            # checking whether there are defined parameters to execute the call...
+            if "parameters" in p_o[method]:
+                url_details[p_k]["parameters"] = p_o[method]["parameters"]

     return dict(sorted(url_details.items()))

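
The rewritten `get_endpoints` loop iterates over each path's operations, skips non-method keys such as a shared `parameters` entry, and keeps only operations that document a 200 response. A small self-contained sketch of that filtering over a toy `paths` mapping (the simplified function below is illustrative, not the packaged `get_endpoints`):

```python
from typing import Dict

HTTP_METHODS = {"get", "post", "put", "delete", "patch", "options", "head"}


def summarize_paths(paths: Dict[str, dict]) -> Dict[str, dict]:
    """Collect description/tags/method per path, mirroring the filtering rules above."""
    url_details: Dict[str, dict] = {}
    for path, operations in paths.items():
        for method, spec in operations.items():
            if method.lower() not in HTTP_METHODS:  # e.g. a shared "parameters" key
                continue
            responses = spec.get("responses", {})
            if not (responses.get("200") or responses.get(200)):
                continue  # no 200 response documented; skip
            url_details[path] = {
                "description": spec.get("description") or spec.get("summary", ""),
                "tags": spec.get("tags", []),
                "method": method.upper(),
            }
    return dict(sorted(url_details.items()))


sample = {
    "/pets": {
        "parameters": [{"name": "limit", "in": "query"}],  # skipped: not a method
        "get": {"summary": "List pets", "responses": {"200": {"description": "ok"}}},
    },
    "/health": {"get": {"responses": {"204": {"description": "no body"}}}},  # skipped: no 200
}
print(summarize_paths(sample))  # {'/pets': {'description': 'List pets', 'tags': [], 'method': 'GET'}}
```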
datahub/ingestion/source/openapi_parser.py (continued)

@@ -358,6 +376,7 @@ def get_tok(
     tok_url: str = "",
     method: str = "post",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> str:
     """
     Trying to post username/password to get auth.
@@ -368,7 +387,7 @@ def get_tok(
         # this will make a POST call with username and password
         data = {"username": username, "password": password, "maxDuration": True}
         # url2post = url + "api/authenticate/"
-        response = requests.post(url4req, proxies=proxies, json=data)
+        response = requests.post(url4req, proxies=proxies, json=data, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             if "token" in cont:  # other authentication scheme
@@ -377,7 +396,7 @@ def get_tok(
                 token = f"Bearer {cont['tokens']['access']}"
     elif method == "get":
         # this will make a GET call with username and password
-        response = requests.get(url4req)
+        response = requests.get(url4req, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             token = cont["token"]

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -22,6 +22,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
 from datahub.ingestion.source.snowflake.snowflake_connection import (
     SnowflakeConnectionConfig,
 )
@@ -326,6 +327,18 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

+    known_snowflake_edition: Optional[SnowflakeEdition] = Field(
+        default=None,
+        description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
+    )
+
+    # Allows empty containers to be ingested before datasets are added, avoiding permission errors
+    warn_no_datasets: bool = Field(
+        hidden_from_docs=True,
+        default=False,
+        description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
+    )
+
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"