acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)
  1. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/METADATA +2524 -2471
  2. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/RECORD +87 -87
  3. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datajob/dataflow.py +3 -3
  7. datahub/api/entities/forms/forms.py +34 -34
  8. datahub/api/graphql/assertion.py +1 -1
  9. datahub/api/graphql/operation.py +4 -4
  10. datahub/cli/check_cli.py +3 -2
  11. datahub/cli/config_utils.py +2 -2
  12. datahub/cli/delete_cli.py +6 -5
  13. datahub/cli/docker_cli.py +2 -2
  14. datahub/cli/exists_cli.py +2 -1
  15. datahub/cli/get_cli.py +2 -1
  16. datahub/cli/iceberg_cli.py +6 -5
  17. datahub/cli/ingest_cli.py +9 -6
  18. datahub/cli/migrate.py +4 -3
  19. datahub/cli/migration_utils.py +4 -3
  20. datahub/cli/put_cli.py +3 -2
  21. datahub/cli/specific/assertions_cli.py +2 -1
  22. datahub/cli/specific/datacontract_cli.py +3 -2
  23. datahub/cli/specific/dataproduct_cli.py +10 -9
  24. datahub/cli/specific/dataset_cli.py +4 -3
  25. datahub/cli/specific/forms_cli.py +2 -1
  26. datahub/cli/specific/group_cli.py +2 -1
  27. datahub/cli/specific/structuredproperties_cli.py +4 -3
  28. datahub/cli/specific/user_cli.py +2 -1
  29. datahub/cli/state_cli.py +2 -1
  30. datahub/cli/timeline_cli.py +2 -1
  31. datahub/configuration/source_common.py +1 -1
  32. datahub/emitter/request_helper.py +116 -3
  33. datahub/emitter/rest_emitter.py +163 -93
  34. datahub/entrypoints.py +2 -1
  35. datahub/ingestion/api/source.py +2 -5
  36. datahub/ingestion/glossary/classification_mixin.py +4 -2
  37. datahub/ingestion/graph/client.py +16 -7
  38. datahub/ingestion/graph/config.py +14 -0
  39. datahub/ingestion/graph/filters.py +1 -1
  40. datahub/ingestion/run/pipeline.py +3 -2
  41. datahub/ingestion/run/pipeline_config.py +1 -1
  42. datahub/ingestion/sink/datahub_rest.py +5 -6
  43. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  44. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  45. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  46. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  47. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  48. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  49. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  50. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  51. datahub/ingestion/source/feast.py +4 -4
  52. datahub/ingestion/source/ge_data_profiler.py +2 -1
  53. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  54. datahub/ingestion/source/ldap.py +1 -1
  55. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  56. datahub/ingestion/source/looker/lookml_source.py +7 -1
  57. datahub/ingestion/source/metadata/lineage.py +2 -1
  58. datahub/ingestion/source/mode.py +74 -28
  59. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  60. datahub/ingestion/source/powerbi/config.py +1 -1
  61. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  62. datahub/ingestion/source/redshift/usage.py +10 -9
  63. datahub/ingestion/source/sql/clickhouse.py +5 -1
  64. datahub/ingestion/source/sql/druid.py +7 -2
  65. datahub/ingestion/source/sql/oracle.py +6 -2
  66. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  67. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  68. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  69. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  70. datahub/integrations/assertion/common.py +3 -2
  71. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
  72. datahub/metadata/_urns/urn_defs.py +1786 -1786
  73. datahub/metadata/schema.avsc +17364 -16988
  74. datahub/metadata/schema_classes.py +3 -3
  75. datahub/metadata/schemas/__init__.py +3 -3
  76. datahub/sdk/main_client.py +2 -2
  77. datahub/secret/datahub_secret_store.py +2 -1
  78. datahub/telemetry/telemetry.py +2 -2
  79. datahub/testing/check_imports.py +1 -1
  80. datahub/upgrade/upgrade.py +10 -12
  81. datahub/utilities/logging_manager.py +8 -1
  82. datahub/utilities/server_config_util.py +378 -10
  83. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  84. datahub/utilities/urn_encoder.py +1 -1
  85. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc11.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/mode.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
 
 import dateutil.parser as dp
 import pydantic
@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
     pass
 
 
+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)
 
 
@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_query_template_render: int = 0
     num_query_template_render_failures: int = 0
     num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0
 
     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
         # Datasets
         datasets = []
         for imported_dataset_name in report_info.get("imported_datasets", {}):
-            mode_dataset = self._get_request_json(
-                f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
-            )
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error
+
             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 self.platform,
                 str(mode_dataset.get("id")),
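
The hunk above stops a single missing report from failing the whole run: a 404 on the imported-dataset lookup is downgraded to a warning and the dataset is skipped, while any other HTTP error is still re-raised. A minimal sketch of the same skip-on-404 pattern using plain `requests`; the `fetch_report` helper and URL shape are illustrative, not part of the Mode connector.

```python
from typing import Optional

import requests


def fetch_report(base_url: str, token: str) -> Optional[dict]:
    """Return the report JSON, or None if the referenced report no longer exists."""
    resp = requests.get(f"{base_url}/reports/{token}", timeout=30)
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        if resp.status_code == 404:
            # The referenced report was deleted or is not visible: warn and skip.
            print(f"warning: report {token} not found, skipping")
            return None
        raise  # any other HTTP status is still treated as fatal
    return resp.json()
```
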
@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            payload = self._get_request_json(f"{self.workspace_uri}/spaces?filter=all")
-            spaces = payload.get("_embedded", {}).get("spaces", {})
-            logger.debug(
-                f"Got {len(spaces)} spaces from workspace {self.workspace_uri}"
-            )
-            for s in spaces:
-                logger.debug(f"Space: {s.get('name')}")
-                space_name = s.get("name", "")
-                # Using both restricted and default_access_level because
-                # there is a current bug with restricted returning False everytime
-                # which has been reported to Mode team
-                if self.config.exclude_restricted and (
-                    s.get("restricted") or s.get("default_access_level") == "restricted"
-                ):
-                    logging.debug(
-                        f"Skipping space {space_name} due to exclude restricted"
-                    )
-                    continue
-                if not self.config.space_pattern.allowed(space_name):
-                    self.report.report_dropped_space(space_name)
-                    logging.debug(f"Skipping space {space_name} due to space pattern")
-                    continue
-                space_info[s.get("token", "")] = s.get("name", "")
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return charts
 
+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
                 multiplier=self.config.api_options.retry_backoff_multiplier,
                 max=self.config.api_options.max_retry_interval,
             ),
-            retry=retry_if_exception_type((HTTPError429, ConnectionError)),
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
             stop=stop_after_attempt(self.config.api_options.max_attempts),
         )
 
@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
             except HTTPError as http_error:
                 error_response = http_error.response
                 if error_response.status_code == 429:
+                    self.report.num_requests_exceeding_rate_limit += 1
                     # respect Retry-After
                     sleep_time = error_response.headers.get("retry-after")
                     if sleep_time is not None:
                         time.sleep(float(sleep_time))
                     raise HTTPError429 from None
+                elif error_response.status_code == 504:
+                    self.report.num_requests_retried_on_timeout += 1
+                    time.sleep(0.1)
+                    raise HTTPError504 from None
 
                 logger.debug(
                     f"Error response ({error_response.status_code}): {error_response.text}"

datahub/ingestion/source/neo4j/neo4j_source.py
@@ -5,27 +5,35 @@ from typing import Any, Dict, Iterable, List, Optional, Type, Union
 
 import pandas as pd
 from neo4j import GraphDatabase
-from pydantic.fields import Field
+from pydantic import Field
 
 from datahub.configuration.source_common import (
     EnvConfigMixin,
+    PlatformInstanceConfigMixin,
+)
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataset_urn_with_platform_instance,
 )
-from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
+    SourceCapability,
 )
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -64,12 +72,16 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }
 
 
-class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
+class Neo4jConfig(
+    StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
+):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
     env: str = Field(description="Neo4j env")
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class Neo4jSourceReport(StatefulIngestionReport):
@@ -79,21 +91,27 @@ class Neo4jSourceReport(StatefulIngestionReport):
 
 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE, "Supported via the `platform_instance` config"
+)
 @support_status(SupportStatus.CERTIFIED)
 class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
-    PLATFORM = "neo4j"
+    config: Neo4jConfig
+    report: Neo4jSourceReport
 
-    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+    def __init__(self, config: Neo4jConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
+        self.platform = "neo4j"
         self.report: Neo4jSourceReport = Neo4jSourceReport()
 
     @classmethod
-    def create(cls, config_dict, ctx):
+    def create(cls, config_dict: Dict, ctx: PipelineContext) -> "Neo4jSource":
        config = Neo4jConfig.parse_obj(config_dict)
-        return cls(ctx, config)
+        return cls(config, ctx)
 
     def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
@@ -123,34 +141,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
         dataset: str,
         description: Optional[str] = None,
         custom_properties: Optional[Dict[str, str]] = None,
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Iterable[MetadataWorkUnit]:
         dataset_properties = DatasetPropertiesClass(
             description=description,
             customProperties=custom_properties,
         )
-        return MetadataChangeProposalWrapper(
-            entityUrn=make_dataset_urn(
-                platform=self.PLATFORM, name=dataset, env=self.config.env
+        yield MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn_with_platform_instance(
+                platform=self.platform,
+                name=dataset,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
            ),
            aspect=dataset_properties,
-        )
+        ).as_workunit()
 
     def generate_neo4j_object(
         self, dataset: str, columns: list, obj_type: Optional[str] = None
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         try:
             fields = [
                 self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
                 for d in columns
                 for key, value in d.items()
             ]
-            mcp = MetadataChangeProposalWrapper(
-                entityUrn=make_dataset_urn(
-                    platform=self.PLATFORM, name=dataset, env=self.config.env
+            return MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn_with_platform_instance(
+                    platform=self.platform,
+                    name=dataset,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
                ),
                aspect=SchemaMetadataClass(
                    schemaName=dataset,
-                    platform=make_data_platform_urn(self.PLATFORM),
+                    platform=make_data_platform_urn(self.platform),
                    version=0,
                    hash="",
                    platformSchema=OtherSchemaClass(rawSchema=""),
@@ -161,13 +185,16 @@ class Neo4jSource(StatefulIngestionSourceBase):
                     fields=fields,
                 ),
             )
-            self.report.obj_created += 1
         except Exception as e:
             log.error(e)
-            self.report.obj_failures += 1
-        return mcp
+            self.report.report_failure(
+                message="Failed to process dataset",
+                context=dataset,
+                exc=e,
+            )
+            return None
 
-    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+    def get_neo4j_metadata(self, query: str) -> Optional[pd.DataFrame]:
         driver = GraphDatabase.driver(
             self.config.uri, auth=(self.config.username, self.config.password)
         )
@@ -201,13 +228,14 @@ class Neo4jSource(StatefulIngestionSourceBase):
 
             union_cols = ["key", "obj_type", "property_data_types", "description"]
             df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+            return df
         except Exception as e:
             self.report.failure(
                 message="Failed to get neo4j metadata",
                 exc=e,
             )
 
-        return df
+        return None
 
     def process_nodes(self, data: list) -> pd.DataFrame:
         nodes = [record for record in data if record["value"]["type"] == self.NODE]
@@ -306,46 +334,48 @@ class Neo4jSource(StatefulIngestionSourceBase):
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
         )
+        if df is None:
+            log.warning("No metadata retrieved from Neo4j")
+            return
+
         for _, row in df.iterrows():
             try:
-                yield MetadataWorkUnit(
-                    id=row["key"],
-                    mcp=self.generate_neo4j_object(
-                        columns=row["property_data_types"],
-                        dataset=row["key"],
-                    ),
-                    is_primary_source=True,
+                neo4j_obj = self.generate_neo4j_object(
+                    columns=row["property_data_types"],
+                    dataset=row["key"],
                 )
-
-                yield MetadataWorkUnit(
-                    id=row["key"],
-                    mcp=MetadataChangeProposalWrapper(
-                        entityUrn=make_dataset_urn(
-                            platform=self.PLATFORM,
-                            name=row["key"],
-                            env=self.config.env,
-                        ),
-                        aspect=SubTypesClass(
-                            typeNames=[
-                                DatasetSubTypes.NEO4J_NODE
-                                if row["obj_type"] == self.NODE
-                                else DatasetSubTypes.NEO4J_RELATIONSHIP
-                            ]
-                        ),
+                if neo4j_obj:
+                    yield from auto_workunit([neo4j_obj])
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=make_dataset_urn_with_platform_instance(
+                        platform=self.platform,
+                        name=row["key"],
+                        platform_instance=self.config.platform_instance,
+                        env=self.config.env,
                    ),
-                )
-
-                yield MetadataWorkUnit(
-                    id=row["key"],
-                    mcp=self.add_properties(
-                        dataset=row["key"],
-                        custom_properties=None,
-                        description=row["description"],
+                    aspect=SubTypesClass(
+                        typeNames=[
+                            DatasetSubTypes.NEO4J_NODE
+                            if row["obj_type"] == self.NODE
+                            else DatasetSubTypes.NEO4J_RELATIONSHIP
+                        ]
                    ),
+                ).as_workunit()
+
+                yield from self.add_properties(
+                    dataset=row["key"],
+                    custom_properties=None,
+                    description=row["description"],
                )
 
             except Exception as e:
-                raise e
+                log.error(f"Failed to process row {row['key']}: {str(e)}")
+                self.report.report_failure(
+                    message="Error processing Neo4j metadata",
+                    context=row["key"],
+                    exc=e,
+                )
 
-    def get_report(self):
+    def get_report(self) -> "Neo4jSourceReport":
         return self.report
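
The neo4j hunks above replace `make_dataset_urn` with `make_dataset_urn_with_platform_instance`, so the new `platform_instance` setting (advertised via the `@capability` decorator) becomes part of every dataset URN. A small sketch of the two builder calls using the keyword arguments shown in the diff; the instance and dataset names are illustrative:

```python
from datahub.emitter.mce_builder import (
    make_dataset_urn,
    make_dataset_urn_with_platform_instance,
)

# Old behaviour: no platform instance in the URN.
plain_urn = make_dataset_urn(platform="neo4j", name="Movie", env="PROD")

# New behaviour: two Neo4j clusters that both expose a "Movie" label no longer
# collapse onto the same URN, because the instance is baked into the dataset name.
scoped_urn = make_dataset_urn_with_platform_instance(
    platform="neo4j",
    name="Movie",
    platform_instance="prod-cluster-1",  # hypothetical instance name
    env="PROD",
)

print(plain_urn)
print(scoped_urn)
```
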

datahub/ingestion/source/powerbi/config.py
@@ -513,7 +513,7 @@ class PowerBiDashboardSourceConfig(
     include_workspace_name_in_dataset_urn: bool = pydantic.Field(
         default=False,
         description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
-        "Read section #11560 at https://datahubproject.io/docs/how/updating-datahub/ before enabling this option."
+        "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
         "To maintain backward compatibility, this is set to False.",
     )
 

datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
@@ -63,10 +63,10 @@ class SessionWithTimeout(requests.Session):
         super().__init__(*args, **kwargs)
         self.timeout = timeout
 
-    def request(self, method, url, **kwargs):
+    def request(self, method, url, *args, **kwargs):
         # Set the default timeout if none is provided
         kwargs.setdefault("timeout", self.timeout)
-        return super().request(method, url, **kwargs)
+        return super().request(method, url, *args, **kwargs)
 
 
 class DataResolverBase(ABC):
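
The `SessionWithTimeout` fix above forwards positional arguments through to `requests.Session.request`, so callers that pass parameters positionally are no longer dropped, while the default timeout still applies whenever the caller does not supply one. A generic sketch of the pattern:

```python
import requests


class TimeoutSession(requests.Session):
    """A requests Session that applies a default timeout unless the caller overrides it."""

    def __init__(self, timeout: float = 30.0):
        super().__init__()
        self.timeout = timeout

    def request(self, method, url, *args, **kwargs):
        # Only inject the timeout when the caller did not supply one,
        # and pass any positional arguments straight through to requests.
        kwargs.setdefault("timeout", self.timeout)
        return super().request(method, url, *args, **kwargs)


session = TimeoutSession(timeout=10)
# resp = session.get("https://example.com/api")  # inherits the 10-second default
```
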

datahub/ingestion/source/redshift/usage.py
@@ -182,15 +182,16 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
-                with PerfTimer() as timer:
-                    # Generate operation aspect workunits
-                    yield from self._gen_operation_aspect_workunits(
-                        self.connection, all_tables
-                    )
-                    self.report.operational_metadata_extraction_sec[
-                        self.config.database
-                    ] = timer.elapsed_seconds(digits=2)
+            with self.report.new_stage(
+                USAGE_EXTRACTION_OPERATIONAL_STATS
+            ), PerfTimer() as timer:
+                # Generate operation aspect workunits
+                yield from self._gen_operation_aspect_workunits(
+                    self.connection, all_tables
+                )
+                self.report.operational_metadata_extraction_sec[
+                    self.config.database
+                ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
         with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
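
The Redshift hunk only collapses two nested context managers into a single `with` statement carrying both, which drops one indentation level without changing when each context is entered or exited. A tiny equivalence sketch with stand-in context managers:

```python
from contextlib import contextmanager


@contextmanager
def stage(name: str):
    print(f"enter {name}")
    try:
        yield name
    finally:
        print(f"exit {name}")


# Nested form (before):
with stage("operational_stats"):
    with stage("perf_timer") as timer:
        pass

# Single-statement form (after): same enter/exit order, one less indent level.
with stage("operational_stats"), stage("perf_timer") as timer:
    pass
```
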

datahub/ingestion/source/sql/clickhouse.py
@@ -145,7 +145,11 @@ class ClickHouseConfig(
     )
     include_materialized_views: Optional[bool] = Field(default=True, description="")
 
-    def get_sql_alchemy_url(self, current_db=None):
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
         url = make_url(
             super().get_sql_alchemy_url(uri_opts=self.uri_opts, current_db=current_db)
         )

datahub/ingestion/source/sql/druid.py
@@ -1,4 +1,6 @@
 # This import verifies that the dependencies are available.
+from typing import Any, Dict, Optional
+
 import pydruid  # noqa: F401
 from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
@@ -38,8 +40,11 @@ class DruidConfig(BasicSQLAlchemyConfig):
         description="regex patterns for schemas to filter in ingestion.",
     )
 
-    def get_sql_alchemy_url(self):
-        return f"{super().get_sql_alchemy_url()}/druid/v2/sql/"
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        base_url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+        return f"{base_url}/druid/v2/sql/"
 
     """
     The pydruid library already formats the table name correctly, so we do not

datahub/ingestion/source/sql/oracle.py
@@ -127,11 +127,15 @@ class OracleConfig(BasicSQLAlchemyConfig):
             )
         return v
 
-    def get_sql_alchemy_url(self):
-        url = super().get_sql_alchemy_url()
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+
         if self.service_name:
             assert not self.database
             url = f"{url}/?service_name={self.service_name}"
+
         return url
 
     def get_identifier(self, schema: str, table: str) -> str:

datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py
@@ -10,7 +10,8 @@ from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import (
     IngestionCheckpointingProviderConfig,
     JobId,
 )
-from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass
 
 logger = logging.getLogger(__name__)

datahub/ingestion/source/tableau/tableau_validation.py
@@ -24,7 +24,7 @@ def check_user_role(
     mitigation_message_prefix: str = (
         "Assign `Site Administrator Explorer` role to the user"
     )
-    mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup"
+    mitigation_message_suffix: str = "Refer to the setup guide: https://docs.datahub.com/docs/quick-ingestion-guides/tableau/setup"
 
     try:
         # TODO: Add check for `Enable Derived Permissions`

datahub/ingestion/source/usage/clickhouse_usage.py
@@ -2,7 +2,7 @@ import collections
 import dataclasses
 import logging
 from datetime import datetime
-from typing import Dict, Iterable, List
+from typing import Any, Dict, Iterable, List, Optional
 
 from dateutil import parser
 from pydantic.fields import Field
@@ -74,8 +74,12 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
     options: dict = Field(default={}, description="")
     query_log_table: str = Field(default="system.query_log", exclude=True)
 
-    def get_sql_alchemy_url(self):
-        return super().get_sql_alchemy_url()
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
+        return super().get_sql_alchemy_url(uri_opts=uri_opts, current_db=current_db)
 
 
 @platform_name("ClickHouse")

datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -4,7 +4,7 @@ import json
 import logging
 from datetime import datetime
 from email.utils import parseaddr
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 from dateutil import parser
 from pydantic.fields import Field
@@ -98,8 +98,10 @@ class TrinoUsageConfig(TrinoConfig, BaseUsageConfig, EnvBasedSourceBaseConfig):
     options: dict = Field(default={}, description="")
     database: str = Field(description="The name of the catalog from getting the usage")
 
-    def get_sql_alchemy_url(self):
-        return super().get_sql_alchemy_url()
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        return super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
 
 
 @dataclasses.dataclass
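
The ClickHouse, Druid, Oracle, and Trino usage hunks all rework `get_sql_alchemy_url` overrides so their signatures match the parent class (`uri_opts` plus `database`/`current_db`, with a `str` return annotation) and forward those arguments to `super()`. A generic sketch of this override-alignment pattern; the class names and URL scheme here are illustrative, not DataHub's:

```python
from typing import Any, Dict, Optional


class BaseSQLConfig:
    def get_sql_alchemy_url(
        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
    ) -> str:
        return f"dummy+driver://host/{database or ''}"


class DerivedConfig(BaseSQLConfig):
    # The override keeps the parent's parameters and forwards them, so callers
    # that pass uri_opts or database keep working and type checkers stay happy.
    def get_sql_alchemy_url(
        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
    ) -> str:
        base_url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
        return f"{base_url}/extra/path/"


print(DerivedConfig().get_sql_alchemy_url(database="analytics"))
```
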

datahub/integrations/assertion/common.py
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, TypedDict
 
 from datahub.api.entities.assertion.assertion import BaseEntityAssertion
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
 from datahub.utilities.urns.urn import Urn
@@ -15,7 +16,7 @@ class ColumnDict(TypedDict):
 
 @lru_cache
 def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
         if props is not None:
             return props.qualifiedName
@@ -24,7 +25,7 @@ def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
 
 @lru_cache
 def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.INGESTION) as graph:
         schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
         if schema is not None:
             return [
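
The assertion helpers now pass an explicit `ClientMode` to `get_default_graph`, presumably so requests can be attributed to CLI versus ingestion traffic. A usage sketch that mirrors the calls shown above and assumes the signature exactly as it appears in this diff; the dataset URN is hypothetical:

```python
from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.graph.config import ClientMode
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties

# Hypothetical dataset URN, for illustration only.
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.orders,PROD)"

with get_default_graph(ClientMode.CLI) as graph:
    props = graph.get_aspect(dataset_urn, DatasetProperties)
    if props is not None:
        print(props.qualifiedName)
```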