acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/METADATA +2564 -2501
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/RECORD +63 -55
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +45 -5
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/metadata/_internal_schema_classes.py +568 -512
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18242 -18168
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/s3_boto_utils.py

@@ -1,5 +1,6 @@
 import logging
-from
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Iterable, Optional, Union
 
 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
@@ -11,9 +12,14 @@ from datahub.ingestion.source.aws.s3_util import (
 )
 from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
+LIST_OBJECTS_PAGE_SIZE = 1000
+
 
 def get_s3_tags(
     bucket_name: str,
@@ -74,16 +80,79 @@ def get_s3_tags(
     return new_tags
 
 
+@dataclass
+class DirEntry:
+    """
+    Intended to be similar to os.DirEntry, which contains a name, full path, and possibly
+    other attributes of a directory entry. Currently only used to represent S3 folder-like
+    paths.
+    """
+
+    name: str
+    path: str
+
+
 def list_folders_path(
-    s3_uri: str,
-
+    s3_uri: str,
+    *,
+    startswith: str = "",
+    aws_config: Optional[AwsConnectionConfig] = None,
+) -> Iterable[DirEntry]:
+    """
+    Given an S3 URI to a folder or bucket, return all sub-folders underneath that URI,
+    optionally filtering by startswith. Returned entries never contain a trailing slash.
+    """
+
+    if not is_s3_uri(s3_uri):
+        raise ValueError("Not a s3 URI: " + s3_uri)
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
+    bucket_name = get_bucket_name(s3_uri)
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for folder in list_buckets(startswith, aws_config):
+            yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    for folder in list_folders(bucket_name, prefix, aws_config):
+        folder = folder.removesuffix("/").split("/")[-1]
+        yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
+
+
+def list_objects_recursive_path(
+    s3_uri: str, *, startswith: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    """
+    Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
+    filtering by startswith.
+    """
+
     if not is_s3_uri(s3_uri):
         raise ValueError("Not a s3 URI: " + s3_uri)
     if aws_config is None:
         raise ValueError("aws_config not set. Cannot browse s3")
+    if startswith and "/" in startswith:
+        raise ValueError(f"startswith contains forward slash: {repr(startswith)}")
+
+    if not s3_uri.endswith("/"):
+        s3_uri += "/"
+
     bucket_name = get_bucket_name(s3_uri)
-
-
+    if not bucket_name:
+        # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
+        # prefix.
+        for bucket_name in list_buckets(startswith, aws_config):
+            yield from list_objects_recursive(bucket_name, "", aws_config)
+        return
+
+    prefix = get_bucket_relative_path(s3_uri) + startswith
+    yield from list_objects_recursive(bucket_name, prefix, aws_config)
 
 
 def list_folders(
@@ -99,3 +168,26 @@ def list_folders(
         if folder.endswith("/"):
             folder = folder[:-1]
         yield f"{folder}"
+
+
+def list_buckets(
+    prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable[str]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_client = aws_config.get_s3_client()
+    paginator = s3_client.get_paginator("list_buckets")
+    for page in paginator.paginate(Prefix=prefix):
+        for o in page.get("Buckets", []):
+            yield str(o.get("Name"))
+
+
+def list_objects_recursive(
+    bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
+) -> Iterable["ObjectSummary"]:
+    if aws_config is None:
+        raise ValueError("aws_config not set. Cannot browse s3")
+    s3_resource = aws_config.get_s3_resource()
+    bucket = s3_resource.Bucket(bucket_name)
+    for obj in bucket.objects.filter(Prefix=prefix).page_size(LIST_OBJECTS_PAGE_SIZE):
+        yield obj
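For orientation, a hedged usage sketch of the new helpers (signatures taken from the diff above); the bucket URI, the `startswith` value, and the `AwsConnectionConfig` construction are illustrative:

```python
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_boto_utils import (
    list_folders_path,
    list_objects_recursive_path,
)

aws_config = AwsConnectionConfig(aws_region="us-east-1")  # illustrative region/credentials

# Folder-like prefixes one level below the URI; each DirEntry carries .name and .path.
for entry in list_folders_path("s3://my-bucket/data/", aws_config=aws_config):
    print(entry.name, entry.path)

# All objects underneath the URI, optionally narrowed by a leading name fragment.
for obj in list_objects_recursive_path(
    "s3://my-bucket/data/", startswith="events", aws_config=aws_config
):
    print(obj.key, obj.size)
```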
datahub/ingestion/source/common/subtypes.py

@@ -54,6 +54,8 @@ class DatasetContainerSubTypes(StrEnum):
     ABS_CONTAINER = "ABS container"
     KEYSPACE = "Keyspace"  # Cassandra
     NAMESPACE = "Namespace"  # Iceberg
+    DREMIO_SPACE = "Dremio Space"
+    DREMIO_SOURCE = "Dremio Source"
 
 
 class BIContainerSubTypes(StrEnum):
@@ -63,6 +65,7 @@ class BIContainerSubTypes(StrEnum):
     TABLEAU_SITE = "Site"
     TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
+    POWERBI_WORKSPACE = "Workspace"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
     QLIK_SPACE = "Qlik Space"
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -563,7 +563,7 @@ class PathSpec(ConfigModel):
     def extract_table_name_and_path(self, path: str) -> Tuple[str, str]:
         parsed_vars = self.get_named_vars(path)
         if parsed_vars is None or "table" not in parsed_vars.named:
-            return os.path.basename(path), path
+            return os.path.basename(path.removesuffix("/")), path
         else:
             include = self.include
             depth = include.count("/", 0, include.find("{table}"))
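The `removesuffix("/")` above matters because `os.path.basename` returns an empty string for paths that end in a slash, so folder-style S3 paths previously yielded an empty table name. A quick illustration (the path is made up):

```python
import os

path = "s3://my-bucket/orders/"
print(os.path.basename(path))                    # ''
print(os.path.basename(path.removesuffix("/")))  # 'orders'
```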
datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -104,6 +104,22 @@ class DataHubDatabaseReader:
         ORDER BY mav.urn
         """
 
+    def _get_json_extract_expression(self) -> str:
+        """
+        Returns the appropriate JSON extraction expression based on the database dialect.
+
+        Returns:
+            Database-specific JSON extraction expression
+        """
+        # Return the correct JSON extraction expression for the "removed" field,
+        # depending on the database dialect.
+        if self.engine.dialect.name == "postgresql":
+            # For PostgreSQL, cast the metadata column to JSON and extract the 'removed' key as boolean.
+            return "((metadata::json)->>'removed')::boolean"
+        else:
+            # For other databases (e.g., MySQL), use JSON_EXTRACT.
+            return "JSON_EXTRACT(metadata, '$.removed')"
+
     def query(self, set_structured_properties_filter: bool) -> str:
         """
         Main query that gets data for specified date range with appropriate filters.
@@ -125,7 +141,7 @@ class DataHubDatabaseReader:
         LEFT JOIN (
             SELECT
                 *,
-
+                {self._get_json_extract_expression()} as removed
             FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
             WHERE aspect = 'status'
             AND version = 0
@@ -241,15 +257,10 @@ class DataHubDatabaseReader:
             "end_createdon": end_date.strftime(DATETIME_FORMAT),
             "limit": limit,
             "offset": offset,
+            # Always pass exclude_aspects as a tuple, postgres doesn't support lists
+            "exclude_aspects": tuple(self.config.exclude_aspects),
         }
 
-        # Add exclude_aspects if needed
-        if (
-            hasattr(self.config, "exclude_aspects")
-            and self.config.exclude_aspects
-        ):
-            params["exclude_aspects"] = tuple(self.config.exclude_aspects)
-
         logger.info(
             f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
             f"with limit {limit} and offset {offset} (inclusive range)"
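A standalone sketch (not the reader's actual code path) of which expression the dialect branch above injects into the LEFT JOIN subquery; the table name is a placeholder:

```python
def removed_expression(dialect_name: str) -> str:
    # Mirrors the dialect branch in _get_json_extract_expression.
    if dialect_name == "postgresql":
        return "((metadata::json)->>'removed')::boolean"
    return "JSON_EXTRACT(metadata, '$.removed')"

for dialect in ("postgresql", "mysql"):
    print(f"SELECT *, {removed_expression(dialect)} as removed FROM <aspect_table> ...")
```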
datahub/ingestion/source/dbt/dbt_common.py

@@ -91,6 +91,7 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipSourceTypeClass,
     OwnershipTypeClass,
+    SiblingsClass,
     StatusClass,
     SubTypesClass,
     TagAssociationClass,
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     ViewPropertiesClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import (
     SqlParsingDebugInfo,
@@ -374,6 +376,14 @@ class DBTCommonConfig(
         "Set to False to skip it for engines like AWS Athena where it's not required.",
     )
 
+    dbt_is_primary_sibling: bool = Field(
+        default=True,
+        description="Experimental: Controls sibling relationship primary designation between dbt entities and target platform entities. "
+        "When True (default), dbt entities are primary and target platform entities are secondary. "
+        "When False, target platform entities are primary and dbt entities are secondary. "
+        "Uses aspect patches for precise control. Requires DataHub server 1.3.0+.",
+    )
+
     drop_duplicate_sources: bool = Field(
         default=True,
         description="When enabled, drops sources that have the same name in the target platform as a model. "
@@ -1476,6 +1486,23 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         dataset_snapshot = DatasetSnapshot(
             urn=node_datahub_urn, aspects=list(snapshot_aspects)
         )
+        # Emit sibling aspect for dbt entity (dbt is authoritative source for sibling relationships)
+        if self._should_create_sibling_relationships(node):
+            # Get the target platform URN
+            target_platform_urn = node.get_urn(
+                self.config.target_platform,
+                self.config.env,
+                self.config.target_platform_instance,
+            )
+
+            yield MetadataChangeProposalWrapper(
+                entityUrn=node_datahub_urn,
+                aspect=SiblingsClass(
+                    siblings=[target_platform_urn],
+                    primary=self.config.dbt_is_primary_sibling,
+                ),
+            ).as_workunit()
+
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         if self.config.write_semantics == "PATCH":
             mce = self.get_patched_mce(mce)
@@ -1579,6 +1606,31 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             if not node.exists_in_target_platform:
                 continue
 
+            # Emit sibling patch for target platform entity BEFORE any other aspects.
+            # This ensures the hook can detect explicit primary settings when processing later aspects.
+            if self._should_create_sibling_relationships(node):
+                # Get the dbt platform URN
+                dbt_platform_urn = node.get_urn(
+                    DBT_PLATFORM,
+                    self.config.env,
+                    self.config.platform_instance,
+                )
+
+                # Create patch for target platform entity (make it primary when dbt_is_primary_sibling=False)
+                target_patch = DatasetPatchBuilder(node_datahub_urn)
+                target_patch.add_sibling(
+                    dbt_platform_urn, primary=not self.config.dbt_is_primary_sibling
+                )
+
+                yield from auto_workunit(
+                    MetadataWorkUnit(
+                        id=MetadataWorkUnit.generate_workunit_id(mcp),
+                        mcp_raw=mcp,
+                        is_primary_source=False,  # Not authoritative over warehouse metadata
+                    )
+                    for mcp in target_patch.build()
+                )
+
             # This code block is run when we are generating entities of platform type.
             # We will not link the platform not to the dbt node for type "source" because
             # in this case the platform table existed first.
@@ -2134,5 +2186,27 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             term_id_set.add(existing_term.urn)
         return [GlossaryTermAssociation(term_urn) for term_urn in sorted(term_id_set)]
 
+    def _should_create_sibling_relationships(self, node: DBTNode) -> bool:
+        """
+        Determines whether to emit sibling relationships for a dbt node.
+
+        Sibling relationships (both dbt entity's aspect and target entity's patch) are only
+        emitted when dbt_is_primary_sibling=False to establish explicit primary/secondary
+        relationships. When dbt_is_primary_sibling=True,
+        the SiblingAssociationHook handles sibling creation automatically.
+
+        Args:
+            node: The dbt node to evaluate
+
+        Returns:
+            True if sibling patches should be emitted for this node
+        """
+        # Only create siblings for entities that exist in target platform
+        if not node.exists_in_target_platform:
+            return False
+
+        # Only emit patches when explicit primary/secondary control is needed
+        return self.config.dbt_is_primary_sibling is False
+
     def get_report(self):
         return self.report
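In a recipe this behavior is controlled by the new `dbt_is_primary_sibling` flag. The sketch below is a hedged approximation of the sibling patch emitted for the warehouse-side entity when the flag is false, using the `DatasetPatchBuilder.add_sibling` helper this release adds via `datahub/specific/aspect_helpers/siblings.py`; the URNs are illustrative:

```python
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.specific.dataset import DatasetPatchBuilder

warehouse_urn = make_dataset_urn("snowflake", "analytics.public.orders")
dbt_urn = make_dataset_urn("dbt", "analytics.public.orders")

# With dbt_is_primary_sibling=False, the warehouse entity is patched to be the
# primary sibling and the dbt entity becomes secondary.
patch = DatasetPatchBuilder(warehouse_urn)
patch.add_sibling(dbt_urn, primary=True)

for mcp in patch.build():
    print(mcp.entityUrn, mcp.aspectName)
```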
datahub/ingestion/source/dremio/dremio_aspects.py

@@ -14,6 +14,7 @@ from datahub.emitter.mce_builder import (
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
 from datahub.ingestion.source.dremio.dremio_entities import (
     DremioContainer,
     DremioDataset,
@@ -364,9 +365,9 @@ class DremioAspects:
     ) -> Optional[BrowsePathsV2Class]:
         paths = []
 
-        if entity.subclass ==
+        if entity.subclass == DatasetContainerSubTypes.DREMIO_SPACE.value:
             paths.append(BrowsePathEntryClass(id="Spaces"))
-        elif entity.subclass ==
+        elif entity.subclass == DatasetContainerSubTypes.DREMIO_SOURCE.value:
             paths.append(BrowsePathEntryClass(id="Sources"))
         if paths:
             return BrowsePathsV2Class(path=paths)
datahub/ingestion/source/dremio/dremio_source.py

@@ -90,6 +90,10 @@ class DremioSourceMapEntry:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DREMIO_SPACE,
+        SourceCapabilityModifier.DREMIO_SOURCE,
+    ],
 )
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
datahub/ingestion/source/dynamodb/dynamodb.py

@@ -12,7 +12,7 @@ from typing import (
     Union,
 )
 
-from pydantic
+from pydantic import Field, PositiveInt
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -73,7 +73,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
 
 MAX_ITEMS_TO_RETRIEVE = 100
 PAGE_SIZE = 100
-MAX_SCHEMA_SIZE = 300
 MAX_PRIMARY_KEYS_SIZE = 100
 FIELD_DELIMITER = "."
 
@@ -107,6 +106,10 @@ class DynamoDBConfig(
         'Refer "Advanced Configurations" section for more details',
     )
 
+    max_schema_size: PositiveInt = Field(
+        default=300, description="Maximum number of fields to include in the schema."
+    )
+
     table_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'",
@@ -455,25 +458,25 @@ class DynamoDBSource(StatefulIngestionSourceBase):
     ) -> SchemaMetadataClass:
         """ "
         To construct the schema metadata, it will first sort the schema by the occurrence of attribute names
-        in descending order and truncate the schema by
+        in descending order and truncate the schema by max_schema_size, and then start to construct the
         schema metadata sorted by attribute name
         """
 
         canonical_schema: List[SchemaField] = []
         schema_size = len(schema.values())
         table_fields = list(schema.values())
-        if schema_size >
+        if schema_size > self.config.max_schema_size:
             # downsample the schema, using frequency as the sort key
             self.report.report_warning(
                 title="Schema Size Too Large",
-                message=f"Downsampling the table schema because
+                message=f"Downsampling the table schema because `max_schema_size` threshold is {self.config.max_schema_size}",
                 context=f"Collection: {dataset_urn}",
             )
 
             # Add this information to the custom properties so user can know they are looking at down sampled schema
             dataset_properties.customProperties["schema.downsampled"] = "True"
             dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
-        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include
+        # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include max_schema_size items
         primary_keys = []
         for schema_field in sorted(
             table_fields,
@@ -481,7 +484,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
                 -x["count"],
                 x["delimited_name"],
             ),  # Negate `count` for descending order, `delimited_name` stays the same for ascending
-        )[
+        )[: self.config.max_schema_size]:
             field_path = schema_field["delimited_name"]
             native_data_type = self.get_native_type(schema_field["type"], table_name)
             type = self.get_field_type(schema_field["type"], table_name)
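A minimal, self-contained sketch of the downsampling logic above: fields are sorted by count descending, then name ascending, and sliced to `max_schema_size` (the values below are illustrative; the config default is 300):

```python
max_schema_size = 3  # the real default is 300

table_fields = [
    {"delimited_name": "id", "count": 120},
    {"delimited_name": "name", "count": 110},
    {"delimited_name": "extra.b", "count": 5},
    {"delimited_name": "extra.a", "count": 5},
]

# Same sort key and slice as the source: count descending, name ascending.
kept = sorted(
    table_fields,
    key=lambda x: (-x["count"], x["delimited_name"]),
)[:max_schema_size]

print([f["delimited_name"] for f in kept])  # ['id', 'name', 'extra.a']
```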
datahub/ingestion/source/excel/__init__.py

File without changes
datahub/ingestion/source/excel/config.py

@@ -0,0 +1,92 @@
+from typing import List, Optional, Union
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig
+from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+from datahub.ingestion.source_config.operation_config import is_profiling_enabled
+
+
+class ExcelSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
+    path_list: List[str] = Field(
+        description="List of paths to Excel files or folders to ingest."
+    )
+
+    path_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for file paths to filter in ingestion.",
+    )
+
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None, description="AWS configuration"
+    )
+
+    use_s3_bucket_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether or not to create tags in datahub from the s3 bucket",
+    )
+
+    use_s3_object_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether or not to create tags in datahub from the s3 object",
+    )
+
+    verify_ssl: Union[bool, str] = Field(
+        default=True,
+        description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.",
+    )
+
+    azure_config: Optional[AzureConnectionConfig] = Field(
+        default=None, description="Azure configuration"
+    )
+
+    use_abs_blob_tags: Optional[bool] = Field(
+        default=False,
+        description="Whether to create tags in datahub from the abs blob tags",
+    )
+
+    convert_urns_to_lowercase: bool = Field(
+        default=False,
+        description="Enable to convert the Excel asset urns to lowercase",
+    )
+
+    active_sheet_only: bool = Field(
+        default=False,
+        description="Enable to only ingest the active sheet of the workbook. If not set, all sheets will be ingested.",
+    )
+
+    worksheet_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for worksheets to ingest. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
+        "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
+    )
+
+    profile_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for worksheets to profile. Worksheets are specified as 'filename_without_extension.worksheet_name'. "
+        "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'.",
+    )
+
+    profiling: GEProfilingConfig = Field(
+        default=GEProfilingConfig(),
+        description="Configuration for profiling",
+    )
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
+        default=None,
+        description="Configuration for stateful ingestion and stale metadata removal.",
+    )
+
+    def is_profiling_enabled(self) -> bool:
+        return self.profiling.enabled and is_profiling_enabled(
+            self.profiling.operation_config
+        )