acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -45,7 +45,7 @@ def _get_owner_urn(maybe_urn: str) -> str:
 
 def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) -> None:
     try:
-        parsed_urn: Urn = Urn.create_from_string(urn)
+        parsed_urn: Urn = Urn.from_string(urn)
         entity_type = parsed_urn.get_type()
     except Exception:
         click.secho(f"Provided urn {urn} does not seem valid", fg="red")
@@ -31,7 +31,8 @@ def properties() -> None:
 def upsert(file: Path) -> None:
     """Upsert structured properties in DataHub."""
 
-    StructuredProperties.create(str(file))
+    with get_default_graph() as graph:
+        StructuredProperties.create(str(file), graph)
 
 
 @properties.command(
@@ -258,7 +258,7 @@ class AllowDenyPattern(ConfigModel):
         return AllowDenyPattern()
 
     def allowed(self, string: str) -> bool:
-        if self._denied(string):
+        if self.denied(string):
             return False
 
         return any(
@@ -266,7 +266,7 @@ class AllowDenyPattern(ConfigModel):
             for allow_pattern in self.allow
         )
 
-    def _denied(self, string: str) -> bool:
+    def denied(self, string: str) -> bool:
         for deny_pattern in self.deny:
             if re.match(deny_pattern, string, self.regex_flags):
                 return True
@@ -290,7 +290,7 @@ class AllowDenyPattern(ConfigModel):
             raise ValueError(
                 "allow list must be fully specified to get list of allowed strings"
            )
-        return [a for a in self.allow if not self._denied(a)]
+        return [a for a in self.allow if not self.denied(a)]
 
     def __eq__(self, other):  # type: ignore
         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
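
The rename from _denied to denied makes the deny check part of AllowDenyPattern's public API. A minimal usage sketch with illustrative regexes; as the allowed() body above shows, deny patterns are checked before allow patterns:

from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(allow=["analytics\\..*"], deny=["analytics\\.tmp_.*"])

assert pattern.allowed("analytics.orders")       # matches an allow pattern, not denied
assert pattern.denied("analytics.tmp_scratch")   # deny patterns win inside allowed()
assert not pattern.allowed("analytics.tmp_scratch")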
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
         "main",
         description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
     )
-
+    url_subdir: Optional[str] = Field(
+        default=None,
+        description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+        "Only affects URL generation, not git operations.",
+    )
     url_template: Optional[str] = Field(
         None,
         description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ class GitReference(ConfigModel):
 
     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
+        if self.url_subdir:
+            file_path = f"{self.url_subdir}/{file_path}"
         return self.url_template.format(
             repo_url=self.repo, branch=self.branch, file_path=file_path
         )
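
A hedged sketch of what the new url_subdir field does, using an illustrative repo: the subdirectory is only prepended when building file URLs and never affects git operations.

from datahub.configuration.git import GitReference

ref = GitReference(
    repo="https://github.com/acme/analytics",  # illustrative; url_template is inferred for GitHub repos
    branch="main",
    url_subdir="dbt_project",
)
print(ref.get_url_for_file_path("models/orders.sql"))
# roughly: https://github.com/acme/analytics/blob/main/dbt_project/models/orders.sql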
@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import Any, Dict, Optional
 
@@ -34,5 +35,34 @@ class CallableConsumerConfig:
             "oauth_cb must be a string representing python function reference "
             "in the format <python-module>:<function-name>."
         )
+
+        call_back_fn = import_path(call_back)
+        self._validate_call_back_fn_signature(call_back_fn)
+
         # Set the callback
-        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
+        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
+
+    def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
+        sig = inspect.signature(call_back_fn)
+
+        num_positional_args = len(
+            [
+                param
+                for param in sig.parameters.values()
+                if param.kind
+                in (
+                    inspect.Parameter.POSITIONAL_ONLY,
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                )
+                and param.default == inspect.Parameter.empty
+            ]
+        )
+
+        has_variadic_args = any(
+            param.kind == inspect.Parameter.VAR_POSITIONAL
+            for param in sig.parameters.values()
+        )
+
+        assert num_positional_args == 1 or (
+            has_variadic_args and num_positional_args <= 1
+        ), "oauth_cb function must accept single positional argument."
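
The new signature check requires the configured oauth_cb callable to take exactly one required positional argument (confluent-kafka passes it the oauth config string and expects a token plus an expiry timestamp back). A hedged sketch of a callback that passes validation; the module path and token logic are illustrative:

import time

def get_kafka_oauth_token(oauth_config: str):
    # Fetch a bearer token from your identity provider here (illustrative).
    token = "..."
    expiry_epoch_seconds = time.time() + 3600
    return token, expiry_epoch_seconds

# Referenced from the consumer config as "<python-module>:<function-name>", e.g.
#   oauth_cb: "my_company.kafka_auth:get_kafka_oauth_token"   # hypothetical module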
@@ -1,4 +1,5 @@
 import json
+import time
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -6,12 +7,15 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
 from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChangeTypeClass,
+    EdgeClass,
     GenericAspectClass,
     KafkaAuditHeaderClass,
     MetadataChangeProposalClass,
     SystemMetadataClass,
 )
+from datahub.metadata.urns import Urn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
@@ -89,3 +93,42 @@ class MetadataPatchProposal:
             )
             for aspect_name, patches in self.patches.items()
         ]
+
+    @classmethod
+    def _mint_auditstamp(cls, message: Optional[str] = None) -> AuditStampClass:
+        """
+        Creates an AuditStampClass instance with the current timestamp and other default values.
+
+        Args:
+            message: The message associated with the audit stamp (optional).
+
+        Returns:
+            An instance of AuditStampClass.
+        """
+        return AuditStampClass(
+            time=int(time.time() * 1000.0),
+            actor="urn:li:corpuser:datahub",
+            message=message,
+        )
+
+    @classmethod
+    def _ensure_urn_type(
+        cls, entity_type: str, edges: List[EdgeClass], context: str
+    ) -> None:
+        """
+        Ensures that the destination URNs in the given edges have the specified entity type.
+
+        Args:
+            entity_type: The entity type to check against.
+            edges: A list of Edge objects.
+            context: The context or description of the operation.
+
+        Raises:
+            ValueError: If any of the destination URNs is not of the specified entity type.
+        """
+        for e in edges:
+            urn = Urn.from_string(e.destinationUrn)
+            if not urn.entity_type == entity_type:
+                raise ValueError(
+                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
+                )
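
A small sketch of what the new _ensure_urn_type guard enforces: every edge's destinationUrn must parse to the expected entity type, otherwise the patch builder raises ValueError. The edge and error text below are illustrative:

from datahub.metadata.schema_classes import EdgeClass
from datahub.metadata.urns import Urn

edge = EdgeClass(
    destinationUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.orders,PROD)"
)
assert Urn.from_string(edge.destinationUrn).entity_type == "dataset"
# Handing this edge to a patch path that expects dataJob edges would raise, e.g.
#   ValueError: Input edges: urn:li:dataset:(...) is not of type dataJob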
@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
-# The limit is 16mb. We will use a max of 15mb to have some space for overhead.
-_MAX_BATCH_INGEST_PAYLOAD_SIZE = 15 * 1024 * 1024
+# The limit is 16mb. We will use a max of 15mb to have some space
+# for overhead like request headers.
+# This applies to pretty much all calls to GMS.
+INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
+
+# This limit is somewhat arbitrary. All GMS endpoints will timeout
+# and return a 500 if processing takes too long. To avoid sending
+# too much to the backend and hitting a timeout, we try to limit
+# the number of MCPs we send in a batch.
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
+    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
+)
 
 
 class DataHubRestEmitter(Closeable, Emitter):
@@ -290,11 +300,14 @@ class DataHubRestEmitter(Closeable, Emitter):
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
         mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size = _MAX_BATCH_INGEST_PAYLOAD_SIZE
+        current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
 
-            if mcp_obj_size + current_chunk_size > _MAX_BATCH_INGEST_PAYLOAD_SIZE:
+            if (
+                mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
+                or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+            ):
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
@@ -0,0 +1,69 @@
+import logging
+from typing import Iterable, Optional
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import set_aspect
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetPropertiesClass,
+    MetadataChangeEventClass,
+    SystemMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def convert_dataset_properties_to_patch(
+    urn: str,
+    aspect: DatasetPropertiesClass,
+    system_metadata: Optional[SystemMetadataClass],
+) -> MetadataWorkUnit:
+    patch_builder = create_dataset_props_patch_builder(urn, aspect, system_metadata)
+    mcp = next(iter(patch_builder.build()))
+    return MetadataWorkUnit(id=MetadataWorkUnit.generate_workunit_id(mcp), mcp_raw=mcp)
+
+
+def auto_incremental_properties(
+    incremental_properties: bool,
+    stream: Iterable[MetadataWorkUnit],
+) -> Iterable[MetadataWorkUnit]:
+    if not incremental_properties:
+        yield from stream
+        return  # early exit
+
+    for wu in stream:
+        urn = wu.get_urn()
+
+        if isinstance(wu.metadata, MetadataChangeEventClass):
+            properties_aspect = wu.get_aspect_of_type(DatasetPropertiesClass)
+            set_aspect(wu.metadata, None, DatasetPropertiesClass)
+            if len(wu.metadata.proposedSnapshot.aspects) > 0:
+                yield wu
+
+            if properties_aspect:
+                yield convert_dataset_properties_to_patch(
+                    urn, properties_aspect, wu.metadata.systemMetadata
+                )
+        elif isinstance(wu.metadata, MetadataChangeProposalWrapper) and isinstance(
+            wu.metadata.aspect, DatasetPropertiesClass
+        ):
+            properties_aspect = wu.metadata.aspect
+            if properties_aspect:
+                yield convert_dataset_properties_to_patch(
+                    urn, properties_aspect, wu.metadata.systemMetadata
+                )
+        else:
+            yield wu
+
+
+# TODO: Use this in SQLCommonConfig. Currently only used in snowflake
+class IncrementalPropertiesConfigMixin(ConfigModel):
+    incremental_properties: bool = Field(
+        default=False,
+        description="When enabled, emits dataset properties as incremental to existing dataset properties "
+        "in DataHub. When disabled, re-states dataset properties on each run.",
+    )
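
A hedged sketch of how a source could wire the new helper into its workunit stream; the partial() wiring mirrors the other auto_* stream processors, but the exact registration point varies by source:

from functools import partial

from datahub.ingestion.api.incremental_properties_helper import (
    auto_incremental_properties,
)

# True switches DatasetProperties aspects from full re-statements to patch workunits.
processor = partial(auto_incremental_properties, True)
# patched_stream = processor(source.get_workunits_internal())  # "source" is hypothetical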
@@ -184,6 +184,7 @@ class StructuredLogs(Report):
 
 @dataclass
 class SourceReport(Report):
+    event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
 
@@ -492,11 +493,15 @@ class Source(Closeable, metaclass=ABCMeta):
 
     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
-        return (
+        platform = (
             getattr(config, "platform_name", None)
             or getattr(self, "platform", None)
             or getattr(config, "platform", None)
         )
+        if platform is None and hasattr(self, "get_platform_id"):
+            platform = type(self).get_platform_id()
+
+        return platform
 
     def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
         config = self.get_config()
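
The new event_not_produced_warn flag lets a source opt out of the "no metadata was produced" warning that auto_workunit_reporter now guards on (see the source_helpers hunk that follows). A minimal sketch of a report subclass that suppresses it; the class name is hypothetical:

from dataclasses import dataclass

from datahub.ingestion.api.source import SourceReport

@dataclass
class QuietSourceReport(SourceReport):  # hypothetical subclass
    event_not_produced_warn: bool = False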
@@ -32,6 +32,7 @@ from datahub.metadata.schema_classes import (
     SchemaFieldClass,
     SchemaMetadataClass,
     StatusClass,
+    SystemMetadataClass,
     TimeWindowSizeClass,
 )
 from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
@@ -65,9 +66,10 @@ def auto_workunit(
 def create_dataset_props_patch_builder(
     dataset_urn: str,
     dataset_properties: DatasetPropertiesClass,
+    system_metadata: Optional[SystemMetadataClass] = None,
 ) -> DatasetPatchBuilder:
     """Creates a patch builder with a table's or view's attributes and dataset properties"""
-    patch_builder = DatasetPatchBuilder(dataset_urn)
+    patch_builder = DatasetPatchBuilder(dataset_urn, system_metadata)
     patch_builder.set_display_name(dataset_properties.name)
     patch_builder.set_description(dataset_properties.description)
     patch_builder.set_created(dataset_properties.created)
@@ -148,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera
         report.report_workunit(wu)
         yield wu
 
-    if report.events_produced == 0:
+    if report.event_not_produced_warn and report.events_produced == 0:
         report.warning(
             title="No metadata was produced by the source",
             message="Please check the source configuration, filters, and permissions.",
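
A hedged sketch of calling the extended helper directly; the urn and property values are illustrative, and system_metadata is simply forwarded to the DatasetPatchBuilder:

from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
from datahub.metadata.schema_classes import DatasetPropertiesClass, SystemMetadataClass

patch_builder = create_dataset_props_patch_builder(
    dataset_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)",
    dataset_properties=DatasetPropertiesClass(name="orders", description="Orders fact table"),
    system_metadata=SystemMetadataClass(runId="manual-run-1"),
)
mcps = patch_builder.build()  # patch-style MCPs carrying the dataset properties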
@@ -67,6 +67,7 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
+from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
 from datahub.utilities.urns.urn import Urn, guess_entity_type
@@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph:
     graph_config = config_utils.load_client_config()
     graph = DataHubGraph(graph_config)
     graph.test_connection()
+    telemetry_instance.set_context(server=graph)
     return graph
@@ -148,10 +148,10 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
 
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
-        if not self.report_recipe or not ctx.pipeline_config._raw_dict:
+        if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
            return ""
         else:
-            return json.dumps(redact_raw_config(ctx.pipeline_config._raw_dict))
+            return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
@@ -44,7 +44,8 @@ from datahub.ingestion.transformer.system_metadata_transformer import (
 )
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.metadata.schema_classes import MetadataChangeProposalClass
-from datahub.telemetry import stats, telemetry
+from datahub.telemetry import stats
+from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -220,7 +221,7 @@ class Pipeline:
         dry_run: bool = False,
         preview_mode: bool = False,
         preview_workunits: int = 10,
-        report_to: Optional[str] = None,
+        report_to: Optional[str] = "datahub",
         no_progress: bool = False,
     ):
         self.config = config
@@ -273,8 +274,9 @@ class Pipeline:
         if self.graph is None and isinstance(self.sink, DatahubRestSink):
             with _add_init_error_context("setup default datahub client"):
                 self.graph = self.sink.emitter.to_graph()
+                self.graph.test_connection()
         self.ctx.graph = self.graph
-        telemetry.telemetry_instance.update_capture_exception_context(server=self.graph)
+        telemetry_instance.set_context(server=self.graph)
 
         with set_graph_context(self.graph):
             with _add_init_error_context("configure reporters"):
@@ -615,7 +617,7 @@ class Pipeline:
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
 
-        telemetry.telemetry_instance.ping(
+        telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
@@ -637,7 +639,6 @@ class Pipeline:
                 ),
                 "has_pipeline_name": bool(self.config.pipeline_name),
             },
-            self.ctx.graph,
         )
 
     def _approx_all_vals(self, d: LossyList[Any]) -> int:
@@ -117,3 +117,9 @@ class PipelineConfig(ConfigModel):
         config = cls.parse_obj(resolved_dict)
         config._raw_dict = raw_dict
         return config
+
+    def get_raw_dict(self) -> Dict:
+        result = self._raw_dict
+        if result is None:
+            result = self.dict()
+        return result
@@ -18,7 +18,10 @@ from datahub.configuration.common import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
-from datahub.emitter.rest_emitter import DataHubRestEmitter
+from datahub.emitter.rest_emitter import (
+    BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+    DataHubRestEmitter,
+)
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
     NoopWriteCallback,
@@ -65,11 +68,19 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
 
     # These only apply in async modes.
-    max_threads: int = _DEFAULT_REST_SINK_MAX_THREADS
-    max_pending_requests: int = 2000
+    max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
+    max_pending_requests: pydantic.PositiveInt = 2000
 
     # Only applies in async batch mode.
-    max_per_batch: int = 100
+    max_per_batch: pydantic.PositiveInt = 100
+
+    @pydantic.validator("max_per_batch", always=True)
+    def validate_max_per_batch(cls, v):
+        if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
+            raise ValueError(
+                f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
+            )
+        return v
 
 
 @dataclasses.dataclass
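
With the new validator, an oversized max_per_batch now fails at recipe parse time instead of silently exceeding the emitter's batch cap. A hedged sketch; the server URL and values are illustrative, and the cap defaults to 200 unless DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH overrides it:

from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

ok = DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=150)

# DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=500)
# would raise pydantic.ValidationError: max_per_batch must be less than or equal to 200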
@@ -201,6 +201,10 @@ class ABSSource(StatefulIngestionSourceBase):
             ).infer_schema(file)
         elif extension == ".json":
             fields = json.JsonInferrer().infer_schema(file)
+        elif extension == ".jsonl":
+            fields = json.JsonInferrer(
+                max_rows=self.source_config.max_rows, format="jsonl"
+            ).infer_schema(file)
         elif extension == ".avro":
             fields = avro.AvroInferrer().infer_schema(file)
         else:
@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
 
 import boto3
 from boto3.session import Session
@@ -107,6 +107,14 @@ class AwsConnectionConfig(ConfigModel):
         default=None,
         description="A set of proxy configs to use with AWS. See the [botocore.config](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html) docs for details.",
     )
+    aws_retry_num: int = Field(
+        default=5,
+        description="Number of times to retry failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+    )
+    aws_retry_mode: Literal["legacy", "standard", "adaptive"] = Field(
+        default="standard",
+        description="Retry mode to use for failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+    )
 
     read_timeout: float = Field(
         default=DEFAULT_TIMEOUT,
@@ -199,6 +207,10 @@ class AwsConnectionConfig(ConfigModel):
         return Config(
             proxies=self.aws_proxy,
             read_timeout=self.read_timeout,
+            retries={
+                "max_attempts": self.aws_retry_num,
+                "mode": self.aws_retry_mode,
+            },
             **self.aws_advanced_config,
         )
 
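
For reference, a hedged sketch of the botocore retry configuration that the new defaults translate to (aws_retry_num=5, aws_retry_mode="standard"); the read_timeout value is illustrative:

from botocore.config import Config

botocore_config = Config(
    read_timeout=60,  # illustrative; the source uses its configured read_timeout
    retries={"max_attempts": 5, "mode": "standard"},
)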
@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from typing import TYPE_CHECKING, DefaultDict, Dict, Iterable, List, Optional
 
@@ -36,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
 
+logger = logging.getLogger(__name__)
+
 
 @platform_name("SageMaker")
 @config_class(SagemakerSourceConfig)
@@ -75,6 +78,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting SageMaker ingestion...")
         # get common lineage graph
         lineage_processor = LineageProcessor(
             sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
@@ -83,6 +87,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract feature groups if specified
         if self.source_config.extract_feature_groups:
+            logger.info("Extracting feature groups...")
             feature_group_processor = FeatureGroupProcessor(
                 sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
             )
@@ -95,6 +100,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract jobs if specified
         if self.source_config.extract_jobs is not False:
+            logger.info("Extracting jobs...")
            job_processor = JobProcessor(
                sagemaker_client=self.client_factory.get_client,
                env=self.env,
@@ -109,6 +115,8 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract models if specified
         if self.source_config.extract_models:
+            logger.info("Extracting models...")
+
             model_processor = ModelProcessor(
                 sagemaker_client=self.sagemaker_client,
                 env=self.env,
@@ -40,8 +40,11 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     groups_scanned = 0
     models_scanned = 0
     jobs_scanned = 0
+    jobs_processed = 0
     datasets_scanned = 0
     filtered: List[str] = field(default_factory=list)
+    model_endpoint_lineage = 0
+    model_group_lineage = 0
 
     def report_feature_group_scanned(self) -> None:
         self.feature_groups_scanned += 1
@@ -58,6 +61,9 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     def report_model_scanned(self) -> None:
         self.models_scanned += 1
 
+    def report_job_processed(self) -> None:
+        self.jobs_processed += 1
+
     def report_job_scanned(self) -> None:
         self.jobs_scanned += 1
 
@@ -1,3 +1,5 @@
+import logging
+import textwrap
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Iterable, List
 
@@ -28,6 +30,8 @@ if TYPE_CHECKING:
         FeatureGroupSummaryTypeDef,
     )
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class FeatureGroupProcessor:
@@ -197,11 +201,12 @@ class FeatureGroupProcessor:
 
         full_table_name = f"{glue_database}.{glue_table}"
 
-        self.report.report_warning(
-            full_table_name,
-            f"""Note: table {full_table_name} is an AWS Glue object.
+        logging.info(
+            textwrap.dedent(
+                f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                 To view full table metadata, run Glue ingestion
-                (see https://datahubproject.io/docs/metadata-ingestion/#aws-glue-glue)""",
+                (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+            )
         )
 
         feature_sources.append(
@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
@@ -49,6 +50,8 @@ from datahub.metadata.schema_classes import (
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
 
+logger = logging.getLogger(__name__)
+
 JobInfo = TypeVar(
     "JobInfo",
     AutoMlJobInfo,
@@ -274,15 +277,18 @@ class JobProcessor:
         )
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Getting all SageMaker jobs")
         jobs = self.get_all_jobs()
 
         processed_jobs: Dict[str, SageMakerJob] = {}
 
+        logger.info("Processing SageMaker jobs")
         # first pass: process jobs and collect datasets used
+        logger.info("first pass: process jobs and collect datasets used")
         for job in jobs:
             job_type = job_type_to_info[job["type"]]
             job_name = job[job_type.list_name_key]
-
+            logger.debug(f"Processing job {job_name} with type {job_type}")
             job_details = self.get_job_details(job_name, job["type"])
 
             processed_job = getattr(self, job_type.processor)(job_details)
@@ -293,6 +299,9 @@ class JobProcessor:
         # second pass:
         #   - move output jobs to inputs
         #   - aggregate i/o datasets
+        logger.info(
+            "second pass: move output jobs to inputs and aggregate i/o datasets"
+        )
         for job_urn in sorted(processed_jobs):
             processed_job = processed_jobs[job_urn]
 
@@ -301,6 +310,7 @@ class JobProcessor:
 
             all_datasets.update(processed_job.input_datasets)
             all_datasets.update(processed_job.output_datasets)
+            self.report.report_job_processed()
 
         # yield datasets
         for dataset_urn, dataset in all_datasets.items():
@@ -322,6 +332,7 @@ class JobProcessor:
             self.report.report_dataset_scanned()
 
         # third pass: construct and yield MCEs
+        logger.info("third pass: construct and yield MCEs")
         for job_urn in sorted(processed_jobs):
             processed_job = processed_jobs[job_urn]
             job_snapshot = processed_job.job_snapshot