acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/METADATA +2480 -2480
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/RECORD +54 -54
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/forms/forms.py +34 -35
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/delete_cli.py +1 -1
- datahub/cli/docker_cli.py +2 -2
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +44 -52
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +3 -1
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
- datahub/metadata/_urns/urn_defs.py +1786 -1786
- datahub/metadata/schema.avsc +17364 -16988
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/testing/check_imports.py +1 -1
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py
CHANGED

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json
-        "
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-
-        "
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json.
-        "specified, last-modified fields will not be populated.
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
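For context (not part of the published diff): with catalog_path now Optional, a dbt-core config parses without a catalog file. A minimal sketch, assuming only the fields shown in the hunk above plus DBTCommonConfig's required target_platform; the paths and platform values are illustrative.

from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig

config = DBTCoreConfig.parse_obj(
    {
        "manifest_path": "target/manifest.json",
        # "catalog_path" omitted: allowed after this change, though column-level
        # metadata will be incomplete or missing without a catalog file.
        "target_platform": "postgres",
    }
)
assert config.catalog_path is None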
@@ -161,7 +170,7 @@ def get_columns(

 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@ def extract_dbt_entities(
     ):
         name = manifest_node["alias"]

-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@ def extract_dbt_entities(
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]

-        catalog_node = all_catalog_entities.get(key)
-
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None
@@ -214,16 +215,23 @@ def extract_dbt_entities(
                 # Test and ephemeral nodes will never show up in the catalog.
                 missing_from_catalog = False
             else:
-                if not only_include_if_in_catalog:
+                if all_catalog_entities is not None and not only_include_if_in_catalog:
+                    # If the catalog file is missing, we have already generated a general message.
                     report.warning(
                         title="Node missing from catalog",
                         message="Found a node in the manifest file but not in the catalog. "
                         "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                        "Some metadata,
+                        "Some metadata, particularly schema information, will be impacted.",
                         context=key,
                     )
         else:
-            catalog_type =
+            catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]

         query_tag_props = manifest_node.get("query_tag", {})
@@ -231,12 +239,15 @@ def extract_dbt_entities(

         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}

         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})

         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
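Aside (not from the diff): the switch from .get("meta", {}) to (.get("meta") or {}) matters because a manifest can carry an explicit "meta": null, and a dict .get default only applies when the key is absent, not when its value is None. A minimal illustration:

node = {"config": {"meta": None}}
node.get("config", {}).get("meta", {})       # -> None: the {} default is not used
(node.get("config", {}).get("meta") or {})   # -> {}: safe to chain .get("owner")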
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx
+        return cls(config, ctx)
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )

+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")

-        catalog_schema = dbt_catalog_json["metadata"].get("dbt_schema_version")
-        catalog_version = dbt_catalog_json["metadata"].get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")

         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]

         all_manifest_entities = {**manifest_nodes, **manifest_sources}

-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]

-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
datahub/ingestion/source/feast.py
CHANGED

@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

-    - Entities as [`MLPrimaryKey`](https://
-    - Fields as [`MLFeature`](https://
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://
-    - Batch and stream source details as [`Dataset`](https://
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED

@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)

-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)


 class IcebergProfilingConfig(ConfigModel):
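For context (not part of the published diff): requests' HTTPAdapter.send takes several parameters after request (stream, timeout, verify, cert, proxies), so an override that only accepts **kwargs breaks if any of them are ever passed positionally; forwarding *args keeps the subclass compatible. A self-contained sketch of the same pattern, using a hypothetical DefaultTimeoutAdapter name and default value:

import requests
from requests.adapters import HTTPAdapter

class DefaultTimeoutAdapter(HTTPAdapter):
    """Applies a default timeout whenever the caller does not supply one."""

    def __init__(self, *args, timeout: float = 10.0, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, *args, **kwargs):
        # Fill in the timeout only when it was not explicitly provided.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, *args, **kwargs)

session = requests.Session()
session.mount("https://", DefaultTimeoutAdapter(timeout=30.0))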
datahub/ingestion/source/ldap.py
CHANGED

datahub/ingestion/source/looker/looker_lib_wrapper.py
CHANGED

@@ -113,7 +113,7 @@ class LookerAPI:
             )
         except SDKError as e:
             raise ConfigurationError(
-
+                "Failed to connect/authenticate with looker - check your configuration"
             ) from e

         self.client_stats = LookerAPIStats()
datahub/ingestion/source/looker/lookml_source.py
CHANGED

@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                     f"Failed to find a project name for model {model_name}"
                 )
             return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
datahub/ingestion/source/mode.py
CHANGED

@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
 import pydantic
@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
     pass


+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)
@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_query_template_render: int = 0
     num_query_template_render_failures: int = 0
     num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
         # Datasets
         datasets = []
         for imported_dataset_name in report_info.get("imported_datasets", {}):
-            mode_dataset = self._get_request_json(
-                f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
-            )
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error

             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 self.platform,
                 str(mode_dataset.get("id")),
@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            )
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return charts

+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
                 multiplier=self.config.api_options.retry_backoff_multiplier,
                 max=self.config.api_options.max_retry_interval,
             ),
-            retry=retry_if_exception_type(HTTPError429),
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
             stop=stop_after_attempt(self.config.api_options.max_attempts),
         )
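A usage sketch (illustrative, mirroring the spaces hunk above): the generator unwraps Mode's HAL-style "_embedded" envelope one page at a time and stops at the first empty page. Note that it appends &per_page=...&page=... directly, so it assumes the URL already carries a query string, as ?filter=all does above.

for page in self._get_paged_request_json(
    f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
):
    for space in page:
        logger.debug(f"Space token={space.get('token')} name={space.get('name')}")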
@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
         except HTTPError as http_error:
             error_response = http_error.response
             if error_response.status_code == 429:
+                self.report.num_requests_exceeding_rate_limit += 1
                 # respect Retry-After
                 sleep_time = error_response.headers.get("retry-after")
                 if sleep_time is not None:
                     time.sleep(float(sleep_time))
                 raise HTTPError429 from None
+            elif error_response.status_code == 504:
+                self.report.num_requests_retried_on_timeout += 1
+                time.sleep(0.1)
+                raise HTTPError504 from None

             logger.debug(
                 f"Error response ({error_response.status_code}): {error_response.text}"