acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2562 -2476
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +46 -6
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  11. datahub/ingestion/source/common/subtypes.py +3 -0
  12. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  13. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  14. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  15. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  16. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  17. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  18. datahub/ingestion/source/excel/__init__.py +0 -0
  19. datahub/ingestion/source/excel/config.py +92 -0
  20. datahub/ingestion/source/excel/excel_file.py +539 -0
  21. datahub/ingestion/source/excel/profiling.py +308 -0
  22. datahub/ingestion/source/excel/report.py +49 -0
  23. datahub/ingestion/source/excel/source.py +662 -0
  24. datahub/ingestion/source/excel/util.py +18 -0
  25. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  26. datahub/ingestion/source/openapi.py +1 -1
  27. datahub/ingestion/source/powerbi/config.py +33 -0
  28. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  29. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  31. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  32. datahub/ingestion/source/redshift/config.py +9 -6
  33. datahub/ingestion/source/redshift/lineage.py +386 -687
  34. datahub/ingestion/source/redshift/redshift.py +19 -106
  35. datahub/ingestion/source/s3/source.py +65 -59
  36. datahub/ingestion/source/snowflake/constants.py +2 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  39. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  40. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  42. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
  43. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  44. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
  46. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  47. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  48. datahub/ingestion/source/sql/mssql/source.py +62 -3
  49. datahub/ingestion/source/sql_queries.py +24 -2
  50. datahub/ingestion/source/state/checkpoint.py +3 -28
  51. datahub/ingestion/source/unity/config.py +74 -9
  52. datahub/ingestion/source/unity/proxy.py +167 -5
  53. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  54. datahub/ingestion/source/unity/proxy_types.py +24 -0
  55. datahub/ingestion/source/unity/report.py +5 -0
  56. datahub/ingestion/source/unity/source.py +111 -1
  57. datahub/ingestion/source/usage/usage_common.py +1 -0
  58. datahub/metadata/_internal_schema_classes.py +573 -517
  59. datahub/metadata/_urns/urn_defs.py +1748 -1748
  60. datahub/metadata/schema.avsc +18564 -18484
  61. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  62. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  63. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  64. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  65. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  66. datahub/metadata/schemas/Ownership.avsc +69 -0
  67. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  68. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  69. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  70. datahub/metadata/schemas/__init__.py +3 -3
  71. datahub/sdk/chart.py +36 -22
  72. datahub/sdk/dashboard.py +38 -62
  73. datahub/sdk/lineage_client.py +6 -26
  74. datahub/sdk/main_client.py +7 -3
  75. datahub/sdk/search_filters.py +16 -0
  76. datahub/specific/aspect_helpers/siblings.py +73 -0
  77. datahub/specific/dataset.py +2 -0
  78. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  79. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  80. datahub/upgrade/upgrade.py +14 -2
  81. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  82. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
  83. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
  84. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0

datahub/ingestion/autogenerated/lineage.json
@@ -192,7 +192,8 @@
       "relationship": {
         "name": "Consumes",
         "entityTypes": [
-          "dataset"
+          "dataset",
+          "chart"
         ],
         "isLineage": true
       }
@@ -397,5 +398,5 @@
     }
   },
   "generated_by": "metadata-ingestion/scripts/modeldocgen.py",
-  "generated_at": "2025-07-01T10:49:03.713749+00:00"
+  "generated_at": "2025-08-05T19:29:49.306404+00:00"
 }

datahub/ingestion/run/pipeline.py
@@ -639,6 +639,7 @@ class Pipeline:
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
                 ],
+                "extractor_type": self.config.source.extractor,
                 "records_written": stats.discretize(
                     self.sink.get_report().total_records_written
                 ),

datahub/ingestion/source/aws/s3_boto_utils.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Iterable, Optional, Union
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Iterable, Optional, Union
 
 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
@@ -11,9 +12,14 @@ from datahub.ingestion.source.aws.s3_util import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
+LIST_OBJECTS_PAGE_SIZE = 1000
+
 
 def get_s3_tags(
     bucket_name: str,
@@ -74,16 +80,79 @@ def get_s3_tags(
     return new_tags
 
 
+@dataclass
+class DirEntry:
+    """
+    Intended to be similar to os.DirEntry, which contains a name, full path, and possibly
+    other attributes of a directory entry. Currently only used to represent S3 folder-like
+    paths.
+    """
+
+    name: str
+    path: str
+
+
 def list_folders_path(
-    s3_uri: str, aws_config: Optional[AwsConnectionConfig]
-) -> Iterable[str]:
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable[DirEntry]:
+    """
+    Given an S3 URI to a folder or bucket, return all sub-folders underneath that URI,
+    optionally filtering by startswith. Returned entries never contain a trailing slash.
+    """
+
+    if not is_s3_uri(s3_uri):
+        raise ValueError("Not a s3 URI: " + s3_uri)
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
+    bucket_name = get_bucket_name(s3_uri)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for folder in list_buckets(startswith, aws_config):
+            yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    for folder in list_folders(bucket_name, prefix, aws_config):
+        folder = folder.removesuffix("/").split("/")[-1]
+        yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+
+
+def list_objects_recursive_path(
+    s3_uri: str, *, startswith: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    """
+    Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
+    filtering by startswith.
+    """
+
     if not is_s3_uri(s3_uri):
         raise ValueError("Not a s3 URI: " + s3_uri)
     if aws_config is None:
         raise ValueError("aws_config not set. Cannot browse s3")
+    if startswith and "/" in startswith:
+        raise ValueError(f"startswith contains forward slash: {repr(startswith)}")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
     bucket_name = get_bucket_name(s3_uri)
-    prefix = get_bucket_relative_path(s3_uri)
-    yield from list_folders(bucket_name, prefix, aws_config)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for bucket_name in list_buckets(startswith, aws_config):
+            yield from list_objects_recursive(bucket_name, "", aws_config)
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    yield from list_objects_recursive(bucket_name, prefix, aws_config)
 
 
 def list_folders(
@@ -99,3 +168,26 @@ def list_folders(
         if folder.endswith("/"):
             folder = folder[:-1]
         yield f"{folder}"
+
+
+def list_buckets(
+    prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable[str]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_client = aws_config.get_s3_client()
+    paginator = s3_client.get_paginator("list_buckets")
+    for page in paginator.paginate(Prefix=prefix):
+        for o in page.get("Buckets", []):
+            yield str(o.get("Name"))
+
+
+def list_objects_recursive(
+    bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_resource = aws_config.get_s3_resource()
+    bucket = s3_resource.Bucket(bucket_name)
+    for obj in bucket.objects.filter(Prefix=prefix).page_size(LIST_OBJECTS_PAGE_SIZE):
+        yield obj
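
As a usage note, here is a minimal sketch of how these new helpers might be driven. The bucket URI and region below are placeholders, not values from this release; DirEntry, list_folders_path, and list_objects_recursive_path are the symbols added above.

    from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
    from datahub.ingestion.source.aws.s3_boto_utils import (
        list_folders_path,
        list_objects_recursive_path,
    )

    # Placeholder connection config; any valid AwsConnectionConfig works here.
    aws_config = AwsConnectionConfig(aws_region="us-east-1")

    # Enumerate folder-like prefixes one level below the URI; each DirEntry carries
    # a short name plus the full s3:// path, with no trailing slash.
    for entry in list_folders_path("s3://example-bucket/raw/", aws_config=aws_config):
        print(entry.name, entry.path)

    # Walk every object under the URI, optionally narrowing by a name prefix.
    for obj in list_objects_recursive_path(
        "s3://example-bucket/raw/", startswith="events", aws_config=aws_config
    ):
        print(obj.key, obj.size)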

datahub/ingestion/source/bigquery_v2/bigquery_connection.py
@@ -2,16 +2,23 @@ import logging
 import os
 from typing import Any, Dict, Optional
 
+from google.api_core.client_info import ClientInfo
 from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PrivateAttr
 
+from datahub._version import __version__
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
 
 logger = logging.getLogger(__name__)
 
 
+def _get_bigquery_client_info() -> ClientInfo:
+    """Get ClientInfo with DataHub user-agent for BigQuery client identification"""
+    return ClientInfo(user_agent=f"datahub/{__version__}")
+
+
 class BigQueryConnectionConfig(ConfigModel):
     credential: Optional[GCPCredential] = Field(
         default=None, description="BigQuery credential informations"
@@ -41,7 +48,11 @@ class BigQueryConnectionConfig(ConfigModel):
 
     def get_bigquery_client(self) -> bigquery.Client:
         client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
+        return bigquery.Client(
+            self.project_on_behalf,
+            client_info=_get_bigquery_client_info(),
+            **client_options,
+        )
 
     def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
         return resourcemanager_v3.ProjectsClient()
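
In effect, every BigQuery call made by the connector now carries a datahub/<version> user agent. A minimal sketch of the same pattern outside the config class, with an illustrative project id and version string:

    from google.api_core.client_info import ClientInfo
    from google.cloud import bigquery

    # Illustrative values; the connector derives the version from datahub._version.
    client_info = ClientInfo(user_agent="datahub/1.2.0.7")
    client = bigquery.Client("my-gcp-project", client_info=client_info)
    # Requests issued through `client` now identify themselves as DataHub traffic.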

datahub/ingestion/source/common/subtypes.py
@@ -54,6 +54,8 @@ class DatasetContainerSubTypes(StrEnum):
     ABS_CONTAINER = "ABS container"
     KEYSPACE = "Keyspace"  # Cassandra
     NAMESPACE = "Namespace"  # Iceberg
+    DREMIO_SPACE = "Dremio Space"
+    DREMIO_SOURCE = "Dremio Source"
 
 
 class BIContainerSubTypes(StrEnum):
@@ -63,6 +65,7 @@ class BIContainerSubTypes(StrEnum):
     TABLEAU_SITE = "Site"
     TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
+    POWERBI_WORKSPACE = "Workspace"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
     QLIK_SPACE = "Qlik Space"

datahub/ingestion/source/data_lake_common/path_spec.py
@@ -563,7 +563,7 @@ class PathSpec(ConfigModel):
     def extract_table_name_and_path(self, path: str) -> Tuple[str, str]:
         parsed_vars = self.get_named_vars(path)
         if parsed_vars is None or "table" not in parsed_vars.named:
-            return os.path.basename(path), path
+            return os.path.basename(path.removesuffix("/")), path
         else:
             include = self.include
             depth = include.count("/", 0, include.find("{table}"))
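
The reason for the removesuffix change: os.path.basename returns an empty string for folder-style paths that end in a slash, so table names derived from such paths came back blank. A quick illustration:

    import os

    # Trailing slash: basename of ".../events/" is "", so the table name was empty.
    print(os.path.basename("s3://bucket/events/"))                     # ""
    # Stripping the slash first recovers the folder name as the table name.
    print(os.path.basename("s3://bucket/events/".removesuffix("/")))   # "events"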

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -104,6 +104,22 @@ class DataHubDatabaseReader:
         ORDER BY mav.urn
     """
 
+    def _get_json_extract_expression(self) -> str:
+        """
+        Returns the appropriate JSON extraction expression based on the database dialect.
+
+        Returns:
+            Database-specific JSON extraction expression
+        """
+        # Return the correct JSON extraction expression for the "removed" field,
+        # depending on the database dialect.
+        if self.engine.dialect.name == "postgresql":
+            # For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
+            return "((metadata::json)->>'removed')::boolean"
+        else:
+            # For other databases (e.g., MySQL), use JSON_EXTRACT.
+            return "JSON_EXTRACT(metadata, '$.removed')"
+
     def query(self, set_structured_properties_filter: bool) -> str:
         """
         Main query that gets data for specified date range with appropriate filters.
@@ -125,7 +141,7 @@ class DataHubDatabaseReader:
         LEFT JOIN (
             SELECT
                 *,
-                JSON_EXTRACT(metadata, '$.removed') as removed
+                {self._get_json_extract_expression()} as removed
             FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
             WHERE aspect = 'status'
                 AND version = 0
@@ -241,15 +257,10 @@ class DataHubDatabaseReader:
             "end_createdon": end_date.strftime(DATETIME_FORMAT),
             "limit": limit,
             "offset": offset,
+            # Always pass exclude_aspects as a tuple, postgres doesn't support lists
+            "exclude_aspects": tuple(self.config.exclude_aspects),
         }
 
-        # Add exclude_aspects if needed
-        if (
-            hasattr(self.config, "exclude_aspects")
-            and self.config.exclude_aspects
-        ):
-            params["exclude_aspects"] = tuple(self.config.exclude_aspects)
-
         logger.info(
             f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
             f"with limit {limit} and offset {offset} (inclusive range)"

datahub/ingestion/source/dbt/dbt_common.py
@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipSourceTypeClass,
     OwnershipTypeClass,
+    SiblingsClass,
     StatusClass,
     SubTypesClass,
     TagAssociationClass,
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     ViewPropertiesClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import (
     SqlParsingDebugInfo,
@@ -374,6 +376,14 @@ class DBTCommonConfig(
         "Set to False to skip it for engines like AWS Athena where it's not required.",
     )
 
+    dbt_is_primary_sibling: bool = Field(
+        default=True,
+        description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
+        "When True (default), dbt entities are primary and target platform entities are secondary. "
+        "When False, target platform entities are primary and dbt entities are secondary. "
+        "Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
+    )
+
     drop_duplicate_sources: bool = Field(
         default=True,
         description="When enabled, drops sources that have the same name in the target platform as a model. "
@@ -1476,6 +1486,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         dataset_snapshot = DatasetSnapshot(
             urn=node_datahub_urn, aspects=list(snapshot_aspects)
         )
+        # Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
+        if self._should_create_sibling_relationships(node):
+            # Get the target platform URN
+            target_platform_urn = node.get_urn(
+                self.config.target_platform,
+                self.config.env,
+                self.config.target_platform_instance,
+            )
+
+            yield MetadataChangeProposalWrapper(
+                entityUrn=node_datahub_urn,
+                aspect=SiblingsClass(
+                    siblings=[target_platform_urn],
+                    primary=self.config.dbt_is_primary_sibling,
+                ),
+            ).as_workunit()
+
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         if self.config.write_semantics == "PATCH":
             mce = self.get_patched_mce(mce)
@@ -1579,6 +1606,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             if not node.exists_in_target_platform:
                 continue
 
+            # Emit sibling patch for target platform entity BEFORE any other aspects.
+            # This ensures the hook can detect explicit primary settings when processing later aspects.
+            if self._should_create_sibling_relationships(node):
+                # Get the dbt platform URN
+                dbt_platform_urn = node.get_urn(
+                    DBT_PLATFORM,
+                    self.config.env,
+                    self.config.platform_instance,
+                )
+
+                # Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
+                target_patch = DatasetPatchBuilder(node_datahub_urn)
+                target_patch.add_sibling(
+                    dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
+                )
+
+                yield from auto_workunit(
+                    MetadataWorkUnit(
+                        id=MetadataWorkUnit.generate_workunit_id(mcp),
+                        mcp_raw=mcp,
+                        is_primary_source=False,  # Not authoritative over warehouse metadata
+                    )
+                    for mcp in target_patch.build()
+                )
+
             # This code block is run when we are generating entities of platform type.
             # We will not link the platform not to the dbt node for type "source" because
             # in this case the platform table existed first.
@@ -2134,5 +2186,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 term_id_set.add(existing_term.urn)
         return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]
 
+    def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
+        """
+        Determines whether to emit sibling relationships for a dbt node.
+
+        Sibling relationships (both dbt entity's aspect and target entity's patch) are only
+        emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
+        relationships. When dbt_is_primary_sibling=True,
+        the SiblingAssociationHook handles sibling creation automatically.
+
+        Args:
+            node: The dbt node to evaluate
+
+        Returns:
+            True if sibling patches should be emitted for this node
+        """
+        # Only create siblings for entities that exist in target platform
+        if not node.exists_in_target_platform:
+            return False
+
+        # Only emit patches when explicit primary/secondary control is needed
+        return self.config.dbt_is_primary_sibling is False
+
     def get_report(self):
         return self.report
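
The patch path relies on the DatasetPatchBuilder.add_sibling helper added elsewhere in this release (datahub/specific/aspect_helpers/siblings.py in the file list). A minimal sketch of emitting the same kind of patch directly, with hypothetical URNs:

    from datahub.emitter.mce_builder import make_dataset_urn
    from datahub.specific.dataset import DatasetPatchBuilder

    # Hypothetical dataset URNs for illustration.
    warehouse_urn = make_dataset_urn("snowflake", "analytics.public.orders")
    dbt_urn = make_dataset_urn("dbt", "analytics.public.orders")

    # Mark the warehouse entity as the primary sibling of its dbt counterpart,
    # mirroring what the source does when dbt_is_primary_sibling is False.
    patch = DatasetPatchBuilder(warehouse_urn)
    patch.add_sibling(dbt_urn, primary=True)

    for mcp in patch.build():
        print(mcp.entityUrn, mcp.aspectName)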

datahub/ingestion/source/dremio/dremio_aspects.py
@@ -14,6 +14,7 @@ from datahub.emitter.mce_builder import (
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
 from datahub.ingestion.source.dremio.dremio_entities import (
     DremioContainer,
     DremioDataset,
@@ -364,9 +365,9 @@ class DremioAspects:
     ) -> Optional[BrowsePathsV2Class]:
         paths = []
 
-        if entity.subclass == "Dremio Space":
+        if entity.subclass == DatasetContainerSubTypes.DREMIO_SPACE.value:
             paths.append(BrowsePathEntryClass(id="Spaces"))
-        elif entity.subclass == "Dremio Source":
+        elif entity.subclass == DatasetContainerSubTypes.DREMIO_SOURCE.value:
             paths.append(BrowsePathEntryClass(id="Sources"))
         if paths:
             return BrowsePathsV2Class(path=paths)

datahub/ingestion/source/dremio/dremio_source.py
@@ -90,6 +90,10 @@ class DremioSourceMapEntry:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DREMIO_SPACE,
+        SourceCapabilityModifier.DREMIO_SOURCE,
+    ],
 )
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -12,7 +12,7 @@ from typing import (
     Union,
 )
 
-from pydantic.fields import Field
+from pydantic import Field, PositiveInt
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -73,7 +73,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
 
 MAX_ITEMS_TO_RETRIEVE = 100
 PAGE_SIZE = 100
-MAX_SCHEMA_SIZE = 300
 MAX_PRIMARY_KEYS_SIZE = 100
 FIELD_DELIMITER = "."
 
@@ -107,6 +106,10 @@ class DynamoDBConfig(
         'Refer "Advanced Configurations" section for more details',
     )
 
+    max_schema_size: PositiveInt = Field(
+        default=300, description="Maximum number of fields to include in the schema."
+    )
+
     table_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'",
@@ -455,25 +458,25 @@ class DynamoDBSource(StatefulIngestionSourceBase):
     ) -> SchemaMetadataClass:
         """ "
         To construct the schema metadata, it will first sort the schema by the occurrence of attribute names
-        in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the
+        in descending order and truncate the schema by max_schema_size, and then start to construct the
         schema metadata sorted by attribute name
         """
 
         canonical_schema: List[SchemaField] = []
         schema_size = len(schema.values())
         table_fields = list(schema.values())
-        if schema_size > MAX_SCHEMA_SIZE:
+        if schema_size > self.config.max_schema_size:
             # downsample the schema, using frequency as the sort key
             self.report.report_warning(
                 title="Schema Size Too Large",
-                message=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}",
+                message=f"Downsampling the table schema because `max_schema_size` threshold is {self.config.max_schema_size}",
                 context=f"Collection: {dataset_urn}",
             )
 
             # Add this information to the custom properties so user can know they are looking at down sampled schema
             dataset_properties.customProperties["schema.downsampled"] = "True"
             dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
-        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
+        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include max_schema_size items
         primary_keys = []
         for schema_field in sorted(
             table_fields,
@@ -481,7 +484,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
                 -x["count"],
                 x["delimited_name"],
             ),  # Negate `count` for descending order, `delimited_name` stays the same for ascending
-        )[0:MAX_SCHEMA_SIZE]:
+        )[: self.config.max_schema_size]:
             field_path = schema_field["delimited_name"]
             native_data_type = self.get_native_type(schema_field["type"], table_name)
             type = self.get_field_type(schema_field["type"], table_name)
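
For clarity, the downsampling rule that max_schema_size now controls is a sort by attribute frequency (descending), then name (ascending), followed by a slice. A small self-contained sketch with made-up field counts:

    # Stand-in for DynamoDBConfig.max_schema_size (default 300).
    max_schema_size = 3
    table_fields = [
        {"delimited_name": "id", "count": 100},
        {"delimited_name": "name", "count": 90},
        {"delimited_name": "price", "count": 40},
        {"delimited_name": "notes", "count": 5},
    ]

    # Most frequent attributes are kept; ties fall back to alphabetical order.
    kept = sorted(table_fields, key=lambda x: (-x["count"], x["delimited_name"]))
    kept = kept[:max_schema_size]
    print([f["delimited_name"] for f in kept])  # ['id', 'name', 'price']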

datahub/ingestion/source/excel/__init__.py
File without changes (new empty file)

datahub/ingestion/source/excel/config.py (new file)
@@ -0,0 +1,92 @@
+from typing import List, Optional, Union
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig
+from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+from datahub.ingestion.source_config.operation_config import is_profiling_enabled
+
+
+class ExcelSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
+    path_list: List[str] = Field(
+        description="List of paths to Excel files or folders to ingest."
+    )
+
+    path_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for file paths to filter in ingestion.",
+    )
+
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None, description="AWS configuration"
+    )
+
+    use_s3_bucket_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether or not to create tags in datahub from the s3 bucket",
+    )
+
+    use_s3_object_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether or not to create tags in datahub from the s3 object",
+    )
+
+    verify_ssl: Union[bool, str] = Field(
+        default=True,
+        description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.",
+    )
+
+    azure_config: Optional[AzureConnectionConfig] = Field(
+        default=None, description="Azure configuration"
+    )
+
+    use_abs_blob_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether to create tags in datahub from the abs blob tags",
+    )
+
+    convert_urns_to_lowercase: bool = Field(
+        default=False,
+        description="Enable to convert the Excel asset urns to lowercase",
+    )
+
+    active_sheet_only: bool = Field(
+        default=False,
+        description="Enable to only ingest the active sheet of the workbook. If not set, all sheets will be ingested.",
+    )
+
+    worksheet_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for worksheets to ingest. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
+        "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
+    )
+
+    profile_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for worksheets to profile. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
+        "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
+    )
+
+    profiling: GEProfilingConfig = Field(
+        default=GEProfilingConfig(),
+        description="Configuration for profiling",
+    )
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
+        default=None,
+        description="Configuration for stateful ingestion and stale metadata removal.",
+    )
+
+    def is_profiling_enabled(self) -> bool:
+        return self.profiling.enabled and is_profiling_enabled(
+            self.profiling.operation_config
+        )