acryl-datahub 1.1.0rc3__py3-none-any.whl → 1.1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (87)
  1. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2532 -2530
  2. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
  3. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +9 -8
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/restricted_text.py +247 -0
  10. datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
  11. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  12. datahub/cli/delete_cli.py +4 -4
  13. datahub/cli/ingest_cli.py +9 -1
  14. datahub/emitter/mce_builder.py +3 -1
  15. datahub/emitter/response_helper.py +86 -1
  16. datahub/emitter/rest_emitter.py +1 -1
  17. datahub/ingestion/graph/client.py +3 -3
  18. datahub/ingestion/source/apply/datahub_apply.py +4 -4
  19. datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
  20. datahub/ingestion/source/data_lake_common/object_store.py +644 -0
  21. datahub/ingestion/source/datahub/config.py +11 -0
  22. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  23. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  24. datahub/ingestion/source/dbt/dbt_common.py +30 -11
  25. datahub/ingestion/source/gcs/gcs_source.py +22 -7
  26. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  27. datahub/ingestion/source/hex/query_fetcher.py +9 -3
  28. datahub/ingestion/source/openapi.py +12 -0
  29. datahub/ingestion/source/openapi_parser.py +56 -37
  30. datahub/ingestion/source/s3/source.py +65 -6
  31. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
  33. datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  35. datahub/ingestion/source/sql/athena.py +1 -0
  36. datahub/ingestion/source/sql/hive.py +2 -3
  37. datahub/ingestion/source/sql/sql_common.py +98 -34
  38. datahub/ingestion/source/sql/sql_types.py +5 -2
  39. datahub/ingestion/source/unity/config.py +5 -0
  40. datahub/ingestion/source/unity/proxy.py +117 -0
  41. datahub/ingestion/source/unity/source.py +167 -15
  42. datahub/ingestion/source/unity/tag_entities.py +295 -0
  43. datahub/metadata/_internal_schema_classes.py +667 -522
  44. datahub/metadata/_urns/urn_defs.py +1804 -1748
  45. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  46. datahub/metadata/schema.avsc +17358 -17584
  47. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  48. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  49. datahub/metadata/schemas/Applications.avsc +38 -0
  50. datahub/metadata/schemas/ChartKey.avsc +1 -0
  51. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  52. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  53. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  54. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  55. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  56. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  57. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  58. datahub/metadata/schemas/DatasetKey.avsc +1 -0
  59. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  60. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  61. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  62. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  63. datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
  64. datahub/metadata/schemas/MLModelKey.avsc +1 -0
  65. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  66. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  67. datahub/metadata/schemas/__init__.py +3 -3
  68. datahub/sdk/__init__.py +6 -0
  69. datahub/sdk/_all_entities.py +11 -0
  70. datahub/sdk/_shared.py +118 -1
  71. datahub/sdk/chart.py +315 -0
  72. datahub/sdk/container.py +7 -0
  73. datahub/sdk/dashboard.py +432 -0
  74. datahub/sdk/dataflow.py +309 -0
  75. datahub/sdk/datajob.py +342 -0
  76. datahub/sdk/dataset.py +8 -2
  77. datahub/sdk/entity_client.py +90 -2
  78. datahub/sdk/lineage_client.py +681 -82
  79. datahub/sdk/main_client.py +27 -8
  80. datahub/sdk/mlmodel.py +101 -38
  81. datahub/sdk/mlmodelgroup.py +7 -0
  82. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  83. datahub/testing/mce_helpers.py +421 -0
  84. datahub/testing/sdk_v2_helpers.py +18 -0
  85. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -1,10 +1,10 @@
- import contextlib
  import json
  import logging
+ import time
  from datetime import datetime
  from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar

- from sqlalchemy import create_engine
+ from sqlalchemy import create_engine, text

  from datahub.emitter.aspect import ASPECT_MAP
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)

  # Should work for at least mysql, mariadb, postgres
  DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+ DATE_FORMAT = "%Y-%m-%d"

  ROW = TypeVar("ROW", bound=Dict[str, Any])

@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
  **connection_config.options,
  )

+ # Cache for available dates to avoid redundant queries
+ self.available_dates_cache: Optional[List[datetime]] = None
+
  @property
  def soft_deleted_urns_query(self) -> str:
  return f"""
@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
  ORDER BY mav.urn
  """

- @property
- def query(self) -> str:
- # May repeat rows for the same date
- # Offset is generally 0, unless we repeat the same createdon twice
+ def query(self, set_structured_properties_filter: bool) -> str:
+ """
+ Main query that gets data for specified date range with appropriate filters.
+ """
+ structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"

- # Ensures stable order, chronological per (urn, aspect)
- # Relies on createdon order to reflect version order
- # Ordering of entries with the same createdon is handled by VersionOrderer
  return f"""
  SELECT *
  FROM (
@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
  {"" if self.config.include_all_versions else "AND mav.version = 0"}
  {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
  AND mav.createdon >= %(since_createdon)s
+ AND mav.createdon < %(end_createdon)s
  ORDER BY
  createdon,
  urn,
@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
  version
  ) as t
  WHERE 1=1
- {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+ {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+ {structured_prop_filter}
  ORDER BY
  createdon,
  urn,
  aspect,
  version
+ LIMIT %(limit)s
+ OFFSET %(offset)s
  """

+ def execute_with_params(
+ self, query: str, params: Dict[str, Any]
+ ) -> List[Dict[str, Any]]:
+ """Execute query with proper parameter binding that works with your database"""
+ with self.engine.connect() as conn:
+ result = conn.execute(query, params or {})
+ return [dict(row) for row in result.fetchall()]
+
  def execute_server_cursor(
  self, query: str, params: Dict[str, Any]
  ) -> Iterable[Dict[str, Any]]:
+ """Execute a query with server-side cursor"""
  with self.engine.connect() as conn:
  if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
  with (
  conn.begin()
  ): # Transaction required for PostgreSQL server-side cursor
- # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
- # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
+ # Set query timeout at the connection level
+ if self.config.query_timeout:
+ if self.engine.dialect.name == "postgresql":
+ conn.execute(
+ text(
+ f"SET statement_timeout = {self.config.query_timeout * 1000}"
+ )
+ ) # milliseconds
+ elif self.engine.dialect.name in ["mysql", "mariadb"]:
+ conn.execute(
+ text(
+ f"SET max_execution_time = {self.config.query_timeout * 1000}"
+ )
+ ) # milliseconds
+
+ # Stream results with batch size
  conn = conn.execution_options(
  stream_results=True,
  yield_per=self.config.database_query_batch_size,
  )
+
+ # Execute query - using native parameterization without text()
+ # to maintain compatibility with your original code
  result = conn.execute(query, params)
  for row in result:
  yield dict(row)
+
+ return # Success, exit the retry loop
  else:
  raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")

  def _get_rows(
- self, from_createdon: datetime, stop_time: datetime
+ self,
+ start_date: datetime,
+ end_date: datetime,
+ set_structured_properties_filter: bool,
+ limit: int,
  ) -> Iterable[Dict[str, Any]]:
- params = {
- "exclude_aspects": list(self.config.exclude_aspects),
- "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
- }
- yield from self.execute_server_cursor(self.query, params)
+ """
+ Retrieves data rows within a specified date range using pagination.

- def get_aspects(
+ Implements a hybrid pagination strategy that switches between time-based and
+ offset-based approaches depending on the returned data. Uses server-side
+ cursors for efficient memory usage.
+
+ Note: May return duplicate rows across batch boundaries when multiple rows
+ share the same 'createdon' timestamp. This is expected behavior when
+ transitioning between pagination methods.
+
+ Args:
+ start_date: Beginning of date range (inclusive)
+ end_date: End of date range (exclusive)
+ set_structured_properties_filter: Whether to apply structured filtering
+ limit: Maximum rows to fetch per query
+
+ Returns:
+ An iterable of database rows as dictionaries
+ """
+ offset = 0
+ last_createdon = None
+ first_iteration = True
+
+ while True:
+ try:
+ # Set up query and parameters - using named parameters
+ query = self.query(set_structured_properties_filter)
+ params: Dict[str, Any] = {
+ "since_createdon": start_date.strftime(DATETIME_FORMAT),
+ "end_createdon": end_date.strftime(DATETIME_FORMAT),
+ "limit": limit,
+ "offset": offset,
+ }
+
+ # Add exclude_aspects if needed
+ if (
+ hasattr(self.config, "exclude_aspects")
+ and self.config.exclude_aspects
+ ):
+ params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+ logger.info(
+ f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+ f"with limit {limit} and offset {offset} (inclusive range)"
+ )
+
+ # Execute query with server-side cursor
+ rows = self.execute_server_cursor(query, params)
+ # Process and yield rows
+ rows_processed = 0
+ for row in rows:
+ if first_iteration:
+ start_date = row.get("createdon", start_date)
+ first_iteration = False
+
+ last_createdon = row.get("createdon")
+ rows_processed += 1
+ yield row
+
+ # If we processed fewer than the limit or no last_createdon, we're done
+ if rows_processed < limit or not last_createdon:
+ break
+
+ # Update parameters for next iteration
+ if start_date != last_createdon:
+ start_date = last_createdon
+ offset = 0
+ else:
+ offset += limit
+
+ logger.info(
+ f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+ )
+
+ except Exception as e:
+ logger.error(
+ f"Error processing date range {start_date} to {end_date}: {str(e)}"
+ )
+ # Re-raise the exception after logging
+ raise
+
+ def get_all_aspects(
  self, from_createdon: datetime, stop_time: datetime
+ ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+ logger.info("Fetching Structured properties aspects")
+ yield from self.get_aspects(
+ from_createdon=from_createdon,
+ stop_time=stop_time,
+ set_structured_properties_filter=True,
+ )
+
+ logger.info(
+ f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+ )
+
+ time.sleep(
+ self.config.structured_properties_template_cache_invalidation_interval
+ )
+
+ logger.info("Fetching aspects")
+ yield from self.get_aspects(
+ from_createdon=from_createdon,
+ stop_time=stop_time,
+ set_structured_properties_filter=False,
+ )
+
+ def get_aspects(
+ self,
+ from_createdon: datetime,
+ stop_time: datetime,
+ set_structured_properties_filter: bool = False,
  ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
  orderer = VersionOrderer[Dict[str, Any]](
  enabled=self.config.include_all_versions
  )
- rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time)
+ rows = self._get_rows(
+ start_date=from_createdon,
+ end_date=stop_time,
+ set_structured_properties_filter=set_structured_properties_filter,
+ limit=self.config.database_query_batch_size,
+ )
  for row in orderer(rows):
  mcp = self._parse_row(row)
  if mcp:
@@ -190,23 +337,29 @@ class DataHubDatabaseReader:

  def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
  """
- Fetches all soft-deleted entities from the database.
+ Fetches all soft-deleted entities from the database using pagination.

  Yields:
  Row objects containing URNs of soft-deleted entities
  """
- with self.engine.connect() as conn, contextlib.closing(
- conn.connection.cursor()
- ) as cursor:
- logger.debug("Polling soft-deleted urns from database")
- cursor.execute(self.soft_deleted_urns_query)
- columns = [desc[0] for desc in cursor.description]
- while True:
- rows = cursor.fetchmany(self.config.database_query_batch_size)
- if not rows:
- return
- for row in rows:
- yield dict(zip(columns, row))
+ try:
+ params: Dict = {}
+
+ logger.debug("Fetching soft-deleted URNs")
+
+ # Use server-side cursor implementation
+ rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+ processed_rows = 0
+ # Process and yield rows
+ for row in rows:
+ processed_rows += 1
+ yield row
+
+ logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+ except Exception:
+ logger.exception("Error fetching soft-deleted row", exc_info=True)
+ raise

  def _parse_row(
  self, row: Dict[str, Any]
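
The `_get_rows` docstring above describes a hybrid time/offset pagination strategy. The control flow is easier to see in isolation; the sketch below reproduces the same loop against an in-memory list of rows, where `fetch_page` is an illustrative stand-in for the paginated SQL query and is not part of the package.

```python
from datetime import datetime, timedelta
from typing import Any, Dict, Iterable, List


def fetch_page(
    rows: List[Dict[str, Any]], since: datetime, limit: int, offset: int
) -> List[Dict[str, Any]]:
    """Stand-in for the SQL query: createdon >= since, already ordered, LIMIT/OFFSET."""
    eligible = [r for r in rows if r["createdon"] >= since]
    return eligible[offset : offset + limit]


def paginate(
    rows: List[Dict[str, Any]], start: datetime, limit: int
) -> Iterable[Dict[str, Any]]:
    since, offset = start, 0
    while True:
        batch = fetch_page(rows, since, limit, offset)
        yield from batch
        if len(batch) < limit:
            return
        last_createdon = batch[-1]["createdon"]
        if last_createdon != since:
            # Reached a newer timestamp: restart the window there with offset 0.
            # Rows sharing that timestamp can be re-yielded, which matches the
            # "may return duplicate rows across batch boundaries" note above.
            since, offset = last_createdon, 0
        else:
            # The whole batch shared one timestamp: fall back to plain offsets.
            offset += limit


if __name__ == "__main__":
    t0 = datetime(2025, 1, 1)
    data = [{"createdon": t0 + timedelta(seconds=i // 3), "id": i} for i in range(10)]
    ids = [row["id"] for row in paginate(data, start=t0, limit=2)]
    print(ids)  # every id appears at least once; some repeat at batch boundaries
```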

datahub/ingestion/source/datahub/datahub_source.py
@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
  ) -> Iterable[MetadataWorkUnit]:
  logger.info(f"Fetching database aspects starting from {from_createdon}")
  progress = ProgressTimer(report_every=timedelta(seconds=60))
- mcps = reader.get_aspects(from_createdon, self.report.stop_time)
+ mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
  for i, (mcp, createdon) in enumerate(mcps):
  if not self.urn_pattern.allowed(str(mcp.entityUrn)):
  continue

datahub/ingestion/source/dbt/dbt_common.py
@@ -132,6 +132,12 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
  sql_parser_column_errors: int = 0
  sql_parser_successes: int = 0

+ # Details on where column info comes from.
+ nodes_with_catalog_columns: int = 0
+ nodes_with_inferred_columns: int = 0
+ nodes_with_graph_columns: int = 0
+ nodes_with_no_columns: int = 0
+
  sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
  sql_parser_detach_ctes_failures_list: LossyList[str] = field(
  default_factory=LossyList
@@ -619,14 +625,8 @@ class DBTNode:
  def exists_in_target_platform(self):
  return not (self.is_ephemeral_model() or self.node_type == "test")

- def columns_setdefault(self, schema_fields: List[SchemaField]) -> None:
- """
- Update the column list if they are not already set.
- """
-
- if self.columns:
- # If we already have columns, don't overwrite them.
- return
+ def set_columns(self, schema_fields: List[SchemaField]) -> None:
+ """Update the column list."""

  self.columns = [
  DBTColumn(
@@ -1248,9 +1248,28 @@ class DBTSourceBase(StatefulIngestionSourceBase):
  target_node_urn, self._to_schema_info(inferred_schema_fields)
  )

- # Save the inferred schema fields into the dbt node.
- if inferred_schema_fields:
- node.columns_setdefault(inferred_schema_fields)
+ # When updating the node's columns, our order of preference is:
+ # 1. Schema from the dbt catalog
+ # 2. Inferred schema
+ # 3. Schema fetched from the graph
+ if node.columns:
+ self.report.nodes_with_catalog_columns += 1
+ pass # we already have columns from the dbt catalog
+ elif inferred_schema_fields:
+ logger.debug(
+ f"Using {len(inferred_schema_fields)} inferred columns for {node.dbt_name}"
+ )
+ self.report.nodes_with_inferred_columns += 1
+ node.set_columns(inferred_schema_fields)
+ elif schema_fields:
+ logger.debug(
+ f"Using {len(schema_fields)} graph columns for {node.dbt_name}"
+ )
+ self.report.nodes_with_graph_columns += 1
+ node.set_columns(schema_fields)
+ else:
+ logger.debug(f"No columns found for {node.dbt_name}")
+ self.report.nodes_with_no_columns += 1

  def _parse_cll(
  self,
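
The precedence comment above (dbt catalog, then inferred schema, then graph schema) reduces to a simple fallback chain. A minimal illustration with hypothetical names, not code from the package:

```python
from typing import Optional, Sequence


def choose_columns(
    catalog_columns: Optional[Sequence[str]],
    inferred_columns: Optional[Sequence[str]],
    graph_columns: Optional[Sequence[str]],
) -> Sequence[str]:
    # Mirrors the if/elif chain above: prefer the dbt catalog schema, then the
    # schema inferred from SQL, then the schema fetched from the DataHub graph.
    return catalog_columns or inferred_columns or graph_columns or []


assert choose_columns(None, ["a", "b"], ["c"]) == ["a", "b"]
assert choose_columns(None, None, None) == []
```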

datahub/ingestion/source/gcs/gcs_source.py
@@ -1,6 +1,5 @@
  import logging
  from typing import Dict, Iterable, List, Optional
- from urllib.parse import unquote

  from pydantic import Field, SecretStr, validator

@@ -19,6 +18,9 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
  from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
  from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
+ from datahub.ingestion.source.data_lake_common.object_store import (
+ create_object_store_adapter,
+ )
  from datahub.ingestion.source.data_lake_common.path_spec import PathSpec, is_gcs_uri
  from datahub.ingestion.source.s3.config import DataLakeSourceConfig
  from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -136,16 +138,29 @@ class GCSSource(StatefulIngestionSourceBase):

  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
  config = self.create_equivalent_s3_config()
- return self.s3_source_overrides(S3Source(config, PipelineContext(ctx.run_id)))
+ s3_source = S3Source(config, PipelineContext(ctx.run_id))
+ return self.s3_source_overrides(s3_source)

  def s3_source_overrides(self, source: S3Source) -> S3Source:
- source.source_config.platform = PLATFORM_GCS
+ """
+ Override S3Source methods with GCS-specific implementations using the adapter pattern.
+
+ This method customizes the S3Source instance to behave like a GCS source by
+ applying the GCS-specific adapter that replaces the necessary functionality.

- source.is_s3_platform = lambda: True # type: ignore
- source.create_s3_path = lambda bucket_name, key: unquote( # type: ignore
- f"s3://{bucket_name}/{key}"
+ Args:
+ source: The S3Source instance to customize
+
+ Returns:
+ The modified S3Source instance with GCS behavior
+ """
+ # Create a GCS adapter with project ID and region from our config
+ adapter = create_object_store_adapter(
+ "gcs",
  )
- return source
+
+ # Apply all customizations to the source
+ return adapter.apply_customizations(source)

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
  return [
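
For context, the overrides that `s3_source_overrides` previously applied inline are what an adapter's `apply_customizations` now bundles. The new `object_store` module itself (644 added lines) is not shown in this diff, so the sketch below is only an illustration built from the removed code, not the actual `create_object_store_adapter` implementation.

```python
from typing import Any
from urllib.parse import unquote

PLATFORM_GCS = "gcs"


class GcsAdapterSketch:
    """Hypothetical adapter bundling the overrides removed above."""

    def apply_customizations(self, source: Any) -> Any:
        # Same three overrides the old s3_source_overrides applied directly.
        source.source_config.platform = PLATFORM_GCS
        source.is_s3_platform = lambda: True
        source.create_s3_path = lambda bucket_name, key: unquote(
            f"s3://{bucket_name}/{key}"
        )
        return source


def create_adapter_sketch(platform: str) -> GcsAdapterSketch:
    # Stand-in for create_object_store_adapter("gcs") as called in the diff.
    if platform != "gcs":
        raise ValueError(f"Unsupported platform: {platform}")
    return GcsAdapterSketch()
```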

datahub/ingestion/source/gcs/gcs_utils.py
@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"


  def is_gcs_uri(uri: str) -> bool:
+ """
+ Check if a URI is a GCS URI (starts with gs://).
+
+ For more general URI handling, consider using object_store.get_object_store_for_uri.
+ """
  return uri.startswith(GCS_PREFIX)


  def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
+ """
+ Get the GCS prefix (gs://) if the URI is a GCS URI.
+
+ For more general URI handling, consider using object_store.get_object_store_for_uri.
+ """
  if gcs_uri.startswith(GCS_PREFIX):
  return GCS_PREFIX
  return None


  def strip_gcs_prefix(gcs_uri: str) -> str:
- # remove GCS prefix (gs://)
+ """
+ Remove the GCS prefix (gs://) from a GCS URI.
+
+ For more general URI handling, consider using the object_store module.
+
+ Args:
+ gcs_uri: A GCS URI starting with gs://
+
+ Returns:
+ The URI without the gs:// prefix
+
+ Raises:
+ ValueError: If the URI doesn't start with gs://
+ """
  prefix = get_gcs_prefix(gcs_uri)
  if not prefix:
- raise ValueError(f"Not an GCS URI. Must start with prefix: {GCS_PREFIX}")
+ raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")

  return gcs_uri[len(GCS_PREFIX) :]


- def get_gcs_bucket_name(path):
- if not is_gcs_uri(path):
- raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
- return strip_gcs_prefix(path).split("/")[0]
-
-
  def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
+ """
+ Get the path relative to the bucket from a GCS URI.
+
+ For more general URI handling, consider using object_store.get_object_key.
+ """
  return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])


  def get_gcs_key_prefix(gcs_uri: str) -> str:
+ """
+ Get the key prefix (first path component after bucket) from a GCS URI.
+
+ For more general URI handling, consider using object_store.get_object_key.
+ """
  if not is_gcs_uri(gcs_uri):
- raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
+ raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
  return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]
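
A quick usage check for the helpers above (the bucket and object path are illustrative):

```python
from datahub.ingestion.source.gcs.gcs_utils import (
    get_gcs_bucket_relative_path,
    get_gcs_key_prefix,
    is_gcs_uri,
    strip_gcs_prefix,
)

uri = "gs://analytics-bucket/events/2024/data.parquet"

assert is_gcs_uri(uri)
assert strip_gcs_prefix(uri) == "analytics-bucket/events/2024/data.parquet"
assert get_gcs_bucket_relative_path(uri) == "events/2024/data.parquet"
# Per the implementation shown above, the key prefix is everything after the bucket:
assert get_gcs_key_prefix(uri) == "events/2024/data.parquet"
```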

datahub/ingestion/source/hex/query_fetcher.py
@@ -18,8 +18,12 @@ from datahub.utilities.time import datetime_to_ts_millis
  logger = logging.getLogger(__name__)

  # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
- # Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
- HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+ # Context values:
+ # - SCHEDULED_RUN: The query was executed during a scheduled run of a published Hex app.
+ # - LOGIC_VIEW: The query was executed from the Hex project's notebook view. This happens when a user is actively editing a Hex notebook: When they first open and run it or when they rerun without cached results.
+ # - APP_VIEW: The query was executed during a published app session. This happens when a user opens up a published app or reruns the app without cached results.
+ # Only match metadata with "context": "SCHEDULED_RUN|APP_VIEW" to filter out those from notebook, which may bring more noise from development than value
+ HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


  @dataclass
@@ -197,13 +201,15 @@ class HexQueryFetcher:
  Example:
  -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

- # TODO: Consider supporting multiline metadata format in the future:
+ TODO: Consider supporting multiline metadata format in the future:
  # -- Hex query metadata: {
  # -- "categories": ["Scratchpad"],
  # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
  # -- ...
  # -- }

+ TODO: Extract based on pattern matching is strict on the order of the keys in the metadata. Consider using a more flexible approach like JSON parsing.
+
  Returns:
  A tuple of (project_id, workspace_name) if both are successfully extracted
  None if extraction fails for any reason
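
To make the new filtering concrete, here is a small check of the pattern against a comment shaped like the docstring example above (the IDs and URL are the illustrative ones from that example):

```python
import re

# Copied verbatim from the diff above.
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

comment = (
    '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
    '"context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
)

match = re.search(HEX_METADATA_PATTERN, comment)
assert match is not None
project_id, workspace_name = match.group(1), match.group(2)
assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
assert workspace_name == "acryl-partnership"

# Queries run from the notebook view (LOGIC_VIEW) are intentionally not matched:
assert re.search(HEX_METADATA_PATTERN, comment.replace("SCHEDULED_RUN", "LOGIC_VIEW")) is None
```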

datahub/ingestion/source/openapi.py
@@ -82,6 +82,9 @@ class OpenApiConfig(ConfigModel):
  get_token: dict = Field(
  default={}, description="Retrieving a token from the endpoint."
  )
+ verify_ssl: bool = Field(
+ default=True, description="Enable SSL certificate verification"
+ )

  @validator("bearer_token", always=True)
  def ensure_only_one_token(
@@ -129,12 +132,14 @@ class OpenApiConfig(ConfigModel):
  tok_url=url4req,
  method=self.get_token["request_type"],
  proxies=self.proxies,
+ verify_ssl=self.verify_ssl,
  )
  sw_dict = get_swag_json(
  self.url,
  token=self.token,
  swagger_file=self.swagger_file,
  proxies=self.proxies,
+ verify_ssl=self.verify_ssl,
  ) # load the swagger file

  else: # using basic auth for accessing endpoints
@@ -144,6 +149,7 @@ class OpenApiConfig(ConfigModel):
  password=self.password,
  swagger_file=self.swagger_file,
  proxies=self.proxies,
+ verify_ssl=self.verify_ssl,
  )
  return sw_dict

@@ -343,6 +349,7 @@ class APISource(Source, ABC):
  tot_url,
  token=config.token,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  else:
  response = request_call(
@@ -350,6 +357,7 @@ class APISource(Source, ABC):
  username=config.username,
  password=config.password,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  if response.status_code == 200:
  fields2add, root_dataset_samples[dataset_name] = extract_fields(
@@ -380,6 +388,7 @@ class APISource(Source, ABC):
  tot_url,
  token=config.token,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  else:
  response = request_call(
@@ -387,6 +396,7 @@ class APISource(Source, ABC):
  username=config.username,
  password=config.password,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  if response.status_code == 200:
  fields2add, _ = extract_fields(response, dataset_name)
@@ -415,6 +425,7 @@ class APISource(Source, ABC):
  tot_url,
  token=config.token,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  else:
  response = request_call(
@@ -422,6 +433,7 @@ class APISource(Source, ABC):
  username=config.username,
  password=config.password,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  if response.status_code == 200:
  fields2add, _ = extract_fields(response, dataset_name)
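
The practical effect of the new `verify_ssl` option is that a recipe can now opt out of certificate verification, e.g. for self-signed certificates on internal endpoints. A sketch of the `config` block as a Python dict; every field other than `verify_ssl` follows the existing openapi source recipe format and is illustrative:

```python
# Python-dict equivalent of the YAML `config:` block for an openapi source recipe.
openapi_source_config = {
    "name": "internal_api",                      # illustrative
    "url": "https://api.internal.example.com/",  # illustrative
    "swagger_file": "openapi.json",              # illustrative
    "verify_ssl": False,  # new in this release; defaults to True
}
```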