acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/redshift.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
     TestableSource,
     TestConnectionReport,
 )
-from datahub.ingestion.api.source_helpers import
+from datahub.ingestion.api.source_helpers import (
+    auto_workunit,
+    create_dataset_props_patch_builder,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
@@ -45,6 +48,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.redshift.config import RedshiftConfig
+from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
 from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
 from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
@@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (
     RedshiftColumn,
+    RedshiftDatabase,
     RedshiftDataDictionary,
     RedshiftSchema,
     RedshiftTable,
@@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     - Table, row, and column statistics via optional SQL profiling
     - Table lineage
     - Usage statistics
-
-    ### Prerequisites
-
-    This source needs to access system tables that require extra permissions.
-    To grant these permissions, please alter your datahub Redshift user the following way:
-    ```sql
-    ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
-    GRANT SELECT ON pg_catalog.svv_table_info to datahub_user;
-    GRANT SELECT ON pg_catalog.svl_user_info to datahub_user;
-    ```
-
-    :::note
-
-    Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements.
-
-    :::
-
-    ### Lineage
-
-    There are multiple lineage collector implementations as Redshift does not support table lineage out of the box.
-
-    #### stl_scan_based
-    The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to
-    discover lineage between tables.
-    Pros:
-    - Fast
-    - Reliable
-
-    Cons:
-    - Does not work with Spectrum/external tables because those scans do not show up in stl_scan table.
-    - If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies.
-
-    #### sql_based
-    The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries
-    and uses sql parsing to discover the dependencies.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on it
-
-    Cons:
-    - Slow.
-    - Less reliable as the query parser can fail on certain queries
-
-    #### mixed
-    Using both collector above and first applying the sql based and then the stl_scan based one.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on it
-    - A bit more reliable than the sql_based one only
-
-    Cons:
-    - Slow
-    - May be incorrect at times as the query parser can fail on certain queries
-
-    :::note
-
-    The redshift stl redshift tables which are used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window.
-
-    :::
-
-    ### Profiling
-    Profiling runs sql queries on the redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled.
-
-    If you don't want to grant read access to the tables you can enable table level profiling which will get table statistics without reading the data.
-    ```yaml
-    profiling:
-        profile_table_level_only: true
-    ```
     """

     # TODO: Replace with standardized types in sql_types.py
@@ -330,6 +265,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         self.config: RedshiftConfig = config
         self.report: RedshiftReport = RedshiftReport()
         self.classification_handler = ClassificationHandler(self.config, self.report)
+        self.datashares_helper = RedshiftDatasharesHelper(
+            self.config, self.report, self.ctx.graph
+        )
         self.platform = "redshift"
         self.domain_registry = None
         if self.config.domain:
@@ -361,6 +299,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             is_serverless=self.config.is_serverless
         )

+        self.db: Optional[RedshiftDatabase] = None
         self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {}
         self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {}
         self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {}
@@ -424,6 +363,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         database = self.config.database
         logger.info(f"Processing db {database}")
+
+        self.db = self.data_dictionary.get_database_details(connection, database)
+        self.report.is_shared_database = (
+            self.db is not None and self.db.is_shared_database()
+        )
         with self.report.new_stage(METADATA_EXTRACTION):
             self.db_tables[database] = defaultdict()
             self.db_views[database] = defaultdict()
@@ -563,7 +507,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
         schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
-            conn=connection,
+            conn=connection,
+            database=database,
+            schema=schema,
+            is_shared_database=self.report.is_shared_database,
         )

         if self.config.include_tables:
@@ -883,10 +830,14 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             domain_config=self.config.domain,
         )

-    def cache_tables_and_views(
+    def cache_tables_and_views(
+        self, connection: redshift_connector.Connection, database: str
+    ) -> None:
         tables, views = self.data_dictionary.get_tables_and_views(
             conn=connection,
+            database=database,
             skip_external_tables=self.config.skip_external_tables,
+            is_shared_database=self.report.is_shared_database,
         )
         for schema in tables:
             if not is_schema_allowed(
@@ -1029,6 +980,28 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         database: str,
         lineage_extractor: RedshiftSqlLineageV2,
     ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_share_lineage:
+            outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
+            yield from auto_workunit(
+                self.datashares_helper.to_platform_resource(list(outbound_shares))
+            )
+
+            if self.db and self.db.is_shared_database():
+                inbound_share = self.db.get_inbound_share()
+                if inbound_share is None:
+                    self.report.warning(
+                        title="Upstream lineage of inbound datashare will be missing",
+                        message="Database options do not contain sufficient information",
+                        context=f"Database: {database}, Options {self.db.options}",
+                    )
+                else:
+                    for known_lineage in self.datashares_helper.generate_lineage(
+                        inbound_share, self.get_all_tables()[database]
+                    ):
+                        lineage_extractor.aggregator.add(known_lineage)
+
+        # TODO: distinguish between definition level lineage and audit log based lineage.
+        # Definition level lineage should never be skipped
         if not self._should_ingest_lineage():
             return

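The datashare dataclasses introduced in redshift_schema.py (further down in this diff) both expose get_key() as producer_namespace.share_name, which is presumably how a consumer cluster's inbound share is matched back to the outbound share the producer emitted as a platform resource, yielding the upstream source database for lineage. A minimal, hypothetical sketch of that key matching, using stand-in dataclasses and sample values rather than DataHub's actual helper API:

```python
from dataclasses import dataclass
from typing import Dict, Optional


# Simplified stand-ins for the OutboundDatashare / InboundDatashare dataclasses
# added in redshift_schema.py; field names and get_key() follow the diff, the
# matching helper and sample values are illustrative only.
@dataclass
class OutboundShare:
    share_name: str
    producer_namespace: str
    source_database: str

    def get_key(self) -> str:
        return f"{self.producer_namespace}.{self.share_name}"


@dataclass
class InboundShare:
    share_name: str
    producer_namespace: str
    consumer_database: str

    def get_key(self) -> str:
        return f"{self.producer_namespace}.{self.share_name}"


def resolve_upstream_database(
    inbound: InboundShare, published: Dict[str, OutboundShare]
) -> Optional[str]:
    """Find the producer-side source database for an inbound share by its key."""
    match = published.get(inbound.get_key())
    return match.source_database if match else None


# Producer cluster advertises its outbound share (in DataHub this is emitted
# as a platform resource via the datashares helper).
outbound = OutboundShare("sales_share", "a1b2c3", "prod_sales")
published = {outbound.get_key(): outbound}

# Consumer cluster parses the inbound share from its shared database's options.
inbound = InboundShare("sales_share", "a1b2c3", "sales_consumer_db")
assert resolve_upstream_database(inbound, published) == "prod_sales"
```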
datahub/ingestion/source/redshift/redshift_schema.py

@@ -1,7 +1,8 @@
 import logging
+import re
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union

 import redshift_connector

@@ -41,6 +42,9 @@ class RedshiftTable(BaseTable):
     serde_parameters: Optional[str] = None
     last_altered: Optional[datetime] = None

+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+

 @dataclass
 class RedshiftView(BaseTable):
@@ -51,6 +55,9 @@ class RedshiftView(BaseTable):
     size_in_bytes: Optional[int] = None
     rows_count: Optional[int] = None

+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+

 @dataclass
 class RedshiftSchema:
@@ -59,8 +66,119 @@ class RedshiftSchema:
     type: str
     owner: Optional[str] = None
     option: Optional[str] = None
+    external_platform: Optional[str] = None
     external_database: Optional[str] = None

+    def is_external_schema(self) -> bool:
+        return self.type == "external"
+
+    def get_upstream_schema_name(self) -> Optional[str]:
+        """Gets the schema name from the external schema option.
+
+        Returns:
+            Optional[str]: The schema name from the external schema option
+            if this is an external schema and has a valid option format, None otherwise.
+        """
+
+        if not self.is_external_schema() or not self.option:
+            return None
+
+        # For external schema on redshift, option is in form
+        # {"SCHEMA":"tickit"}
+        schema_match = re.search(r'"SCHEMA"\s*:\s*"([^"]*)"', self.option)
+        if not schema_match:
+            return None
+        else:
+            return schema_match.group(1)
+
+
+@dataclass
+class PartialInboundDatashare:
+    share_name: str
+    producer_namespace_prefix: str
+    consumer_database: str
+
+    def get_description(self) -> str:
+        return (
+            f"Namespace Prefix {self.producer_namespace_prefix} Share {self.share_name}"
+        )
+
+
+@dataclass
+class OutboundDatashare:
+    share_name: str
+    producer_namespace: str
+    source_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+
+@dataclass
+class InboundDatashare:
+    share_name: str
+    producer_namespace: str
+    consumer_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+    def get_description(self) -> str:
+        return f"Namespace {self.producer_namespace} Share {self.share_name}"
+
+
+@dataclass
+class RedshiftDatabase:
+    name: str
+    type: str
+    options: Optional[str] = None
+
+    def is_shared_database(self) -> bool:
+        return self.type == "shared"
+
+    # NOTE: ideally options are in form
+    # {"datashare_name":"xxx","datashare_producer_account":"1234","datashare_producer_namespace":"yyy"}
+    # however due to varchar(128) type of database table that captures options
+    # we may receive only partial information about inbound share
+    def get_inbound_share(
+        self,
+    ) -> Optional[Union[InboundDatashare, PartialInboundDatashare]]:
+        if not self.is_shared_database() or not self.options:
+            return None
+
+        # Convert into single regex ??
+        share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', self.options)
+        namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', self.options
+        )
+        partial_namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', self.options
+        )
+
+        if not share_name_match:
+            # We will always at least get share name
+            return None
+
+        share_name = share_name_match.group(1)
+        if namespace_match:
+            return InboundDatashare(
+                share_name=share_name,
+                producer_namespace=namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        elif partial_namespace_match:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix=partial_namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        else:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix="",
+                consumer_database=self.name,
+            )
+

 @dataclass
 class RedshiftExtraTableMeta:
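The NOTE in get_inbound_share is the crux of the new parsing: the options value is ideally a small JSON document, but the column holding it is varchar(128), so the producer namespace can be cut off mid-value. A self-contained sketch of how those regexes behave on a complete versus truncated options string (regex patterns copied from the diff above; the sample option strings are invented for illustration):

```python
import re
from typing import Optional, Tuple


def parse_inbound_share_options(options: str) -> Tuple[str, Optional[str], Optional[str]]:
    """Mirrors the regex logic of RedshiftDatabase.get_inbound_share.

    Returns (share_name, full_namespace, namespace_prefix).
    """
    share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', options)
    namespace_match = re.search(
        r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', options
    )
    partial_namespace_match = re.search(
        r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', options
    )
    if not share_name_match:
        raise ValueError("share name is expected to always be present")
    return (
        share_name_match.group(1),
        namespace_match.group(1) if namespace_match else None,
        partial_namespace_match.group(1) if partial_namespace_match else None,
    )


# Complete options -> enough information for a full InboundDatashare.
full = (
    '{"datashare_name":"sales","datashare_producer_account":"1234",'
    '"datashare_producer_namespace":"a1b2c3d4"}'
)
print(parse_inbound_share_options(full))       # ('sales', 'a1b2c3d4', None)

# Options cut off mid-namespace -> only a PartialInboundDatashare with a
# namespace *prefix* can be built.
truncated = (
    '{"datashare_name":"sales","datashare_producer_account":"1234",'
    '"datashare_producer_namespace":"a1b2'
)
print(parse_inbound_share_options(truncated))  # ('sales', None, 'a1b2')
```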
@@ -141,13 +259,31 @@ class RedshiftDataDictionary:

         return [db[0] for db in dbs]

+    @staticmethod
+    def get_database_details(
+        conn: redshift_connector.Connection, database: str
+    ) -> Optional[RedshiftDatabase]:
+        cursor = RedshiftDataDictionary.get_query_result(
+            conn,
+            RedshiftCommonQuery.get_database_details(database),
+        )
+
+        row = cursor.fetchone()
+        if row is None:
+            return None
+        return RedshiftDatabase(
+            name=database,
+            type=row[1],
+            options=row[2],
+        )
+
     @staticmethod
     def get_schemas(
         conn: redshift_connector.Connection, database: str
     ) -> List[RedshiftSchema]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_schemas
+            RedshiftCommonQuery.list_schemas(database),
         )

         schemas = cursor.fetchall()
@@ -158,8 +294,8 @@ class RedshiftDataDictionary:
                 database=database,
                 name=schema[field_names.index("schema_name")],
                 type=schema[field_names.index("schema_type")],
-                owner=schema[field_names.index("schema_owner_name")],
                 option=schema[field_names.index("schema_option")],
+                external_platform=schema[field_names.index("external_platform")],
                 external_database=schema[field_names.index("external_database")],
             )
             for schema in schemas
@@ -202,7 +338,9 @@ class RedshiftDataDictionary:
     def get_tables_and_views(
         self,
         conn: redshift_connector.Connection,
+        database: str,
         skip_external_tables: bool = False,
+        is_shared_database: bool = False,
     ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]:
         tables: Dict[str, List[RedshiftTable]] = {}
         views: Dict[str, List[RedshiftView]] = {}
@@ -213,7 +351,11 @@ class RedshiftDataDictionary:

         cur = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_tables(
+            RedshiftCommonQuery.list_tables(
+                database=database,
+                skip_external_tables=skip_external_tables,
+                is_shared_database=is_shared_database,
+            ),
         )
         field_names = [i[0] for i in cur.description]
         db_tables = cur.fetchall()
@@ -358,11 +500,18 @@ class RedshiftDataDictionary:

     @staticmethod
     def get_columns_for_schema(
-        conn: redshift_connector.Connection,
+        conn: redshift_connector.Connection,
+        database: str,
+        schema: RedshiftSchema,
+        is_shared_database: bool = False,
     ) -> Dict[str, List[RedshiftColumn]]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_columns
+            RedshiftCommonQuery.list_columns(
+                database_name=database,
+                schema_name=schema.name,
+                is_shared_database=is_shared_database,
+            ),
         )

         table_columns: Dict[str, List[RedshiftColumn]] = {}
@@ -508,3 +657,34 @@ class RedshiftDataDictionary:
                 start_time=row[field_names.index("start_time")],
             )
             rows = cursor.fetchmany()
+
+    @staticmethod
+    def get_outbound_datashares(
+        conn: redshift_connector.Connection,
+    ) -> Iterable[OutboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
+        for item in cursor.fetchall():
+            yield OutboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                source_database=item[3],
+            )
+
+    # NOTE: this is not used right now as it requires superuser privilege
+    # We can use this in future if the permissions are lowered.
+    @staticmethod
+    def get_inbound_datashare(
+        conn: redshift_connector.Connection,
+        database: str,
+    ) -> Optional[InboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
+        item = cursor.fetchone()
+        if item:
+            return InboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                consumer_database=item[3],
+            )
+        return None
datahub/ingestion/source/redshift/report.py

@@ -60,5 +60,8 @@ class RedshiftReport(
     sql_aggregator: Optional[SqlAggregatorReport] = None
     lineage_phases_timer: Dict[str, PerfTimer] = field(default_factory=dict)

+    is_shared_database: bool = False
+    outbound_shares_count: Optional[int] = None
+
     def report_dropped(self, key: str) -> None:
         self.filtered.append(key)
datahub/ingestion/source/s3/config.py

@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field

 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
@@ -152,10 +154,8 @@ class DataLakeSourceConfig(
         return path_specs

     @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_valid(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
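The rewritten platform_valid validator reads a possibly already-inferred platform out of values and falls back to it when the field itself is empty. A standalone sketch of the same pydantic v1-style fallback pattern (the PlatformConfig model, its fields, and the s3:// inference rule are invented for illustration):

```python
from typing import Any

import pydantic


class PlatformConfig(pydantic.BaseModel):
    # Hypothetical model, only to demonstrate the validator fallback pattern.
    path: str
    platform: str = ""

    @pydantic.validator("platform", always=True)
    def platform_valid(cls, platform: Any, values: dict) -> str:
        # An earlier validator may already have stored an inferred platform in
        # `values`; fall back to it (or to a path-based guess) when the field
        # itself was left empty.
        inferred_platform = values.get("platform")  # may not be present yet
        platform = platform or inferred_platform or (
            "s3" if str(values.get("path", "")).startswith("s3://") else ""
        )
        if not platform:
            raise ValueError("platform must not be empty")
        return platform


print(PlatformConfig(path="s3://bucket/prefix").platform)           # -> s3
print(PlatformConfig(path="/tmp/data", platform="file").platform)   # -> file
```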
datahub/ingestion/source/s3/source.py

@@ -834,7 +834,7 @@ class S3Source(StatefulIngestionSourceBase):
             min=min,
         )
         folders.extend(folders_list)
-        if
+        if path_spec.traversal_method != FolderTraversalMethod.ALL:
             return folders
         if folders:
             return folders
@@ -847,7 +847,7 @@ class S3Source(StatefulIngestionSourceBase):
         path_spec: PathSpec,
         bucket: "Bucket",
         prefix: str,
-    ) ->
+    ) -> Iterable[Folder]:
         """
         Retrieves all the folders in a path by listing all the files in the prefix.
         If the prefix is a full path then only that folder will be extracted.
@@ -877,51 +877,30 @@ class S3Source(StatefulIngestionSourceBase):
         s3_objects = (
             obj
             for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(
+            if _is_allowed_path(
+                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+            )
         )
-
-        partitions: List[Folder] = []
         grouped_s3_objects_by_dirname = groupby_unsorted(
             s3_objects,
             key=lambda obj: obj.key.rsplit("/", 1)[0],
         )
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-                logger.warning(
-                    f"Unable to find any files in the folder {key}. Skipping..."
-                )
-                continue
-
-            id = path_spec.get_partition_from_path(
-                self.create_s3_path(max_file.bucket_name, max_file.key)
+        for _, group in grouped_s3_objects_by_dirname:
+            max_file = max(group, key=lambda x: x.last_modified)
+            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
+
+            # If partition_id is None, it means the folder is not a partition
+            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+
+            yield Folder(
+                partition_id=partition_id,
+                is_partition=bool(partition_id),
+                creation_time=min(obj.last_modified for obj in group),
+                modification_time=max_file.last_modified,
+                sample_file=max_file_s3_path,
+                size=sum(obj.size for obj in group),
             )

-        # If id is None, it means the folder is not a partition
-        partitions.append(
-            Folder(
-                partition_id=id,
-                is_partition=bool(id),
-                creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
-                modification_time=modification_time,
-                sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
-                size=file_size,
-            )
-        )
-
-        return partitions
-
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
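The rewritten listing logic now streams one Folder per S3 "directory": objects are grouped by the key prefix before the last "/", the most recently modified object in each group supplies the sample file and modification time, and object sizes are summed. A compact sketch of the same grouping over plain tuples (DataHub uses its groupby_unsorted helper and boto3 object summaries; a dict-based grouping and a small NamedTuple stand in here):

```python
from collections import defaultdict
from datetime import datetime, timezone
from typing import Dict, Iterable, List, NamedTuple


class Obj(NamedTuple):
    # Stand-in for a boto3 ObjectSummary: key, last_modified, size.
    key: str
    last_modified: datetime
    size: int


class FolderInfo(NamedTuple):
    dirname: str
    creation_time: datetime      # oldest object in the folder
    modification_time: datetime  # newest object in the folder
    sample_file: str             # key of the newest object
    size: int                    # total bytes across the folder


def folders_from_listing(objects: Iterable[Obj]) -> Iterable[FolderInfo]:
    groups: Dict[str, List[Obj]] = defaultdict(list)
    for obj in objects:
        # Group by the "directory" part of the key, as in the diff's
        # key=lambda obj: obj.key.rsplit("/", 1)[0]
        groups[obj.key.rsplit("/", 1)[0]].append(obj)
    for dirname, group in groups.items():
        newest = max(group, key=lambda o: o.last_modified)
        yield FolderInfo(
            dirname=dirname,
            creation_time=min(o.last_modified for o in group),
            modification_time=newest.last_modified,
            sample_file=newest.key,
            size=sum(o.size for o in group),
        )


def ts(day: int) -> datetime:
    return datetime(2024, 1, day, tzinfo=timezone.utc)


listing = [
    Obj("data/dt=2024-01-01/a.parquet", ts(1), 100),
    Obj("data/dt=2024-01-01/b.parquet", ts(2), 150),
    Obj("data/dt=2024-01-02/c.parquet", ts(3), 200),
]
for folder in folders_from_listing(listing):
    print(folder)
```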
@@ -1000,7 +979,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=True,
             )
             dirs_to_process.append(dirs_to_process_min[0])
-        folders = []
+        folders: List[Folder] = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
             prefix_to_process = urlparse(dir).path.lstrip("/")