acryl-datahub 1.0.0.3rc8__py3-none-any.whl → 1.0.0.3rc10__py3-none-any.whl

This diff shows the contents of two publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (60)
  1. {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/METADATA +2466 -2466
  2. {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/RECORD +60 -60
  3. {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datajob/dataflow.py +3 -3
  7. datahub/api/entities/dataset/dataset.py +9 -11
  8. datahub/api/entities/forms/forms.py +34 -35
  9. datahub/api/graphql/assertion.py +1 -1
  10. datahub/api/graphql/operation.py +4 -4
  11. datahub/cli/delete_cli.py +1 -1
  12. datahub/cli/docker_cli.py +2 -2
  13. datahub/configuration/common.py +5 -0
  14. datahub/configuration/source_common.py +1 -1
  15. datahub/emitter/request_helper.py +116 -3
  16. datahub/emitter/rest_emitter.py +44 -52
  17. datahub/ingestion/api/source.py +2 -5
  18. datahub/ingestion/api/source_helpers.py +1 -0
  19. datahub/ingestion/glossary/classification_mixin.py +4 -2
  20. datahub/ingestion/graph/client.py +3 -1
  21. datahub/ingestion/graph/config.py +1 -0
  22. datahub/ingestion/graph/filters.py +1 -1
  23. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  25. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  26. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  27. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  28. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  29. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  30. datahub/ingestion/source/feast.py +4 -4
  31. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  32. datahub/ingestion/source/ldap.py +1 -1
  33. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  34. datahub/ingestion/source/looker/lookml_source.py +7 -1
  35. datahub/ingestion/source/mode.py +74 -28
  36. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  37. datahub/ingestion/source/powerbi/config.py +1 -1
  38. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  39. datahub/ingestion/source/redshift/usage.py +10 -9
  40. datahub/ingestion/source/slack/slack.py +4 -52
  41. datahub/ingestion/source/snowflake/snowflake_connection.py +19 -1
  42. datahub/ingestion/source/sql/clickhouse.py +5 -1
  43. datahub/ingestion/source/sql/druid.py +7 -2
  44. datahub/ingestion/source/sql/oracle.py +6 -2
  45. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  46. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  47. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  48. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
  49. datahub/metadata/_urns/urn_defs.py +1786 -1786
  50. datahub/metadata/schema.avsc +17364 -16988
  51. datahub/metadata/schema_classes.py +3 -3
  52. datahub/metadata/schemas/__init__.py +3 -3
  53. datahub/specific/dataset.py +12 -0
  54. datahub/testing/check_imports.py +1 -1
  55. datahub/utilities/logging_manager.py +8 -1
  56. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  57. datahub/utilities/urn_encoder.py +1 -1
  58. {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/entry_points.txt +0 -0
  59. {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/licenses/LICENSE +0 -0
  60. {acryl_datahub-1.0.0.3rc8.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_common.py
@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
     sql_parser_parse_failures: int = 0
     sql_parser_detach_ctes_failures: int = 0
     sql_parser_table_errors: int = 0
@@ -829,11 +830,13 @@ def get_column_type(
     "Enabled by default, configure using `include_column_lineage`",
 )
 class DBTSourceBase(StatefulIngestionSourceBase):
-    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
+        self.platform: str = "dbt"
+
         self.config = config
-        self.platform: str = platform
         self.report: DBTSourceReport = DBTSourceReport()
+
         self.compiled_owner_extraction_pattern: Optional[Any] = None
         if self.config.owner_extraction_pattern:
             self.compiled_owner_extraction_pattern = re.compile(
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Not generating CLL for {node.dbt_name} because we don't need it."
             )
+        elif node.language != "sql":
+            logger.debug(
+                f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
+            )
+            self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
         elif node.compiled_code:
             # Add CTE stops based on the upstreams list.
             cte_mapping = {
datahub/ingestion/source/dbt/dbt_core.py
@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json Note "
-        "this can be a local file or a URI."
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json Note this "
-        "can be a local file or a URI."
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. If not "
-        "specified, last-modified fields will not be populated. Note this can be a local file or a URI.",
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
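The catalog_path change above makes the catalog file optional. A minimal sketch of constructing the relaxed config; the file paths and target_platform value are hypothetical, and any other required DBTCommonConfig fields are assumed unchanged:

from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig

# Hypothetical paths and platform; catalog_path may now be omitted entirely.
config = DBTCoreConfig.parse_obj(
    {
        "manifest_path": "./target/manifest.json",
        "target_platform": "postgres",
        # "catalog_path": "./target/catalog.json",  # optional but recommended
    }
)
assert config.catalog_path is None  # falls back to the new default of None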
@@ -161,7 +170,7 @@ def get_columns(

 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@
         ):
             name = manifest_node["alias"]

-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]

-        # It's a source
-        catalog_node = all_catalog_entities.get(key)
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None

@@ -214,16 +215,23 @@
             # Test and ephemeral nodes will never show up in the catalog.
             missing_from_catalog = False
         else:
-            if not only_include_if_in_catalog:
+            if all_catalog_entities is not None and not only_include_if_in_catalog:
+                # If the catalog file is missing, we have already generated a general message.
                 report.warning(
                     title="Node missing from catalog",
                     message="Found a node in the manifest file but not in the catalog. "
                     "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                    "Some metadata, such as column types and descriptions, will be impacted.",
+                    "Some metadata, particularly schema information, will be impacted.",
                     context=key,
                 )
             else:
-                catalog_type = all_catalog_entities[key]["metadata"]["type"]
+                catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]

         query_tag_props = manifest_node.get("query_tag", {})

@@ -231,12 +239,15 @@

         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}

         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})

         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
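The `or {}` fallback above guards against manifests where `config.meta` is present but explicitly null, in which case `.get("meta", {})` returns None and a further `.get()` would raise. A small made-up illustration:

# Made-up manifest node with an explicit null meta, as older dbt versions can emit.
manifest_node = {"config": {"meta": None}}

# .get("meta", {}) returns None here (the key exists), so the old chain would fail:
#   manifest_node.get("config", {}).get("meta", {}).get("owner")  -> AttributeError
owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
assert owner is None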
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )

+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")

-        catalog_schema = dbt_catalog_json.get("metadata", {}).get("dbt_schema_version")
-        catalog_version = dbt_catalog_json.get("metadata", {}).get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")

         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]

         all_manifest_entities = {**manifest_nodes, **manifest_sources}

-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]

-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
                )
            except Exception as e:
                self.report.info(
-                    title="Dbt Catalog Version",
+                    title="dbt Catalog Version",
                    message="Failed to determine the catalog version",
                    exc=e,
                )
datahub/ingestion/source/feast.py
@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

-    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
-    - Fields as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
-    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)

-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)


 class IcebergProfilingConfig(ConfigModel):
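For context, a rough sketch of how an adapter like TimeoutHTTPAdapter is typically mounted on a requests Session; the mount prefix and the 30-second default below are assumptions, not values from this package. The *args change in send() simply forwards any positional arguments a caller passes through to HTTPAdapter.send:

import requests

from datahub.ingestion.source.iceberg.iceberg_common import TimeoutHTTPAdapter

session = requests.Session()
# Assumed 30-second default; the adapter pops the timeout kwarg in __init__ and
# re-injects it on send() when the caller does not supply one.
session.mount("https://", TimeoutHTTPAdapter(timeout=30))

# No explicit timeout here, so the adapter's send() fills in kwargs["timeout"].
response = session.get("https://example.com/health")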
datahub/ingestion/source/ldap.py
@@ -515,5 +515,5 @@ def parse_ldap_dn(input_clean: bytes) -> str:

 def get_attr_or_none(
     attrs: Dict[str, Any], key: str, default: Optional[str] = None
-) -> str:
+) -> Optional[str]:
     return attrs[key][0].decode() if attrs.get(key) else default
datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -113,7 +113,7 @@ class LookerAPI:
             )
         except SDKError as e:
             raise ConfigurationError(
-                f"Failed to connect/authenticate with looker - check your configuration: {e}"
+                "Failed to connect/authenticate with looker - check your configuration"
             ) from e

         self.client_stats = LookerAPIStats()
datahub/ingestion/source/looker/lookml_source.py
@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                 f"Failed to find a project name for model {model_name}"
             )
             return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
datahub/ingestion/source/mode.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
 import pydantic
@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
     pass


+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)


@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_query_template_render: int = 0
     num_query_template_render_failures: int = 0
     num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
         # Datasets
         datasets = []
         for imported_dataset_name in report_info.get("imported_datasets", {}):
-            mode_dataset = self._get_request_json(
-                f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
-            )
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error
+
             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 self.platform,
                 str(mode_dataset.get("id")),
@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            payload = self._get_request_json(f"{self.workspace_uri}/spaces?filter=all")
-            spaces = payload.get("_embedded", {}).get("spaces", {})
-            logger.debug(
-                f"Got {len(spaces)} spaces from workspace {self.workspace_uri}"
-            )
-            for s in spaces:
-                logger.debug(f"Space: {s.get('name')}")
-                space_name = s.get("name", "")
-                # Using both restricted and default_access_level because
-                # there is a current bug with restricted returning False everytime
-                # which has been reported to Mode team
-                if self.config.exclude_restricted and (
-                    s.get("restricted") or s.get("default_access_level") == "restricted"
-                ):
-                    logging.debug(
-                        f"Skipping space {space_name} due to exclude restricted"
-                    )
-                    continue
-                if not self.config.space_pattern.allowed(space_name):
-                    self.report.report_dropped_space(space_name)
-                    logging.debug(f"Skipping space {space_name} due to space pattern")
-                    continue
-                space_info[s.get("token", "")] = s.get("name", "")
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
             )
         return charts

+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
                 multiplier=self.config.api_options.retry_backoff_multiplier,
                 max=self.config.api_options.max_retry_interval,
             ),
-            retry=retry_if_exception_type((HTTPError429, ConnectionError)),
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
             stop=stop_after_attempt(self.config.api_options.max_attempts),
         )

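A short usage sketch for the new _get_paged_request_json helper, mirroring the spaces loop above; the source object and workspace URI are placeholders. Each yielded page is the list stored under the given `_embedded` key, and iteration stops at the first empty page:

# `source` is a placeholder for a configured ModeSource instance (i.e. `self`
# inside the connector); workspace_uri comes from its config.
for spaces_page in source._get_paged_request_json(
    f"{source.workspace_uri}/spaces?filter=all", key="spaces", per_page=30
):
    for space in spaces_page:
        print(space.get("token"), space.get("name"))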
@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
         except HTTPError as http_error:
             error_response = http_error.response
             if error_response.status_code == 429:
+                self.report.num_requests_exceeding_rate_limit += 1
                 # respect Retry-After
                 sleep_time = error_response.headers.get("retry-after")
                 if sleep_time is not None:
                     time.sleep(float(sleep_time))
                 raise HTTPError429 from None
+            elif error_response.status_code == 504:
+                self.report.num_requests_retried_on_timeout += 1
+                time.sleep(0.1)
+                raise HTTPError504 from None

             logger.debug(
                 f"Error response ({error_response.status_code}): {error_response.text}"