acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_source.py
CHANGED

@@ -51,13 +51,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.

@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()

         # Initialize API operations

@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report

         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,88 @@ class DremioSource(StatefulIngestionSourceBase):

         self.source_map = self._build_source_map()

-        # Process Containers
-        containers = self.dremio_catalog.get_containers()
-        for container in containers:
-            try:
-                yield from self.process_container(container)
-                logger.info(
-                    f"Dremio container {container.container_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_containers_failed += 1
-                self.report.report_failure(
-                    message="Failed to process Dremio container",
-                    context=f"{'.'.join(container.path)}.{container.container_name}",
-                    exc=exc,
-                )
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )

-        # Process Datasets
-        datasets = self.dremio_catalog.get_datasets()
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()

-        for dataset_info in datasets:
-            try:
-                yield from self.process_dataset(dataset_info)
-                logger.info(
-                    f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_datasets_failed += 1  # Increment failed datasets
-                self.report.report_failure(
-                    message="Failed to process Dremio dataset",
-                    context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                    exc=exc,
-                )
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )

-        # Optionally Process Query Lineage
-        if self.config.include_query_lineage:
-            self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()

-        # Generate workunit for aggregated SQL parsing results
-        for mcp in self.sql_parsing_aggregator.gen_metadata():
-            yield mcp.as_workunit()
-
-        # Profiling
-        if self.config.is_profiling_enabled():
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(self.generate_profiles, dataset): dataset
-                    for dataset in datasets
-                }
-
-                for future in as_completed(future_to_dataset):
-                    dataset_info = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[
-                            dataset_info.resource_name
-                        ] += 1
-                        self.report.report_failure(
-                            message="Failed to profile dataset",
-                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                            exc=exc,
-                        )
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )

     def process_container(
         self, container_info: DremioContainer
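The restructuring above leans on report.new_stage(...) behaving as a context manager, so each phase (metadata extraction, lineage extraction, profiling) is timed even when the generator body raises. A minimal sketch of that pattern, using a simplified stand-in for the real ingestion-stage report (datahub.ingestion.source_report.ingestion_stage, which differs in detail):

import time
from contextlib import contextmanager
from typing import Dict, Iterator

METADATA_EXTRACTION = "Metadata Extraction"

class StageTimer:
    """Hypothetical stand-in for the source report's stage tracking."""

    def __init__(self) -> None:
        self.stage_durations: Dict[str, float] = {}

    @contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            # Record the elapsed time even if the stage body raises.
            self.stage_durations[stage] = time.perf_counter() - start

report = StageTimer()
with report.new_stage(METADATA_EXTRACTION):
    time.sleep(0.01)  # stand-in for container/dataset processing
print(report.stage_durations)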
@@ -388,8 +405,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)

     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]

@@ -417,11 +433,8 @@ class DremioSource(StatefulIngestionSourceBase):
             ]
         )
         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=dataset_urn,
-            aspectName=lineage.ASPECT_NAME,
             aspect=lineage,
-            changeType=ChangeTypeClass.UPSERT,
         )

         for upstream_urn in upstream_urns:
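The trimmed-down MetadataChangeProposalWrapper call works because the wrapper infers entityType from the URN and aspectName from the aspect instance, and changeType defaults to UPSERT; the removed keyword arguments were redundant. A short sketch with a hypothetical URN and aspect:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

# entityType and aspectName are inferred; changeType defaults to UPSERT.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dremio,space.example_view,PROD)",
    aspect=StatusClass(removed=False),
)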
datahub/ingestion/source/dremio/dremio_sql_queries.py
CHANGED

@@ -1,3 +1,7 @@
+from datetime import datetime, timedelta
+from typing import Optional
+
+
 class DremioSQLQueries:
     QUERY_DATASETS_CE = """
        SELECT* FROM

@@ -235,28 +239,83 @@ class DremioSQLQueries:
             TABLE_NAME ASC
     """

-    QUERY_ALL_JOBS = """
-        …
-    """
+    @staticmethod
+    def _get_default_start_timestamp_millis() -> str:
+        """Get default start timestamp (1 day ago) in milliseconds precision format"""
+        one_day_ago = datetime.now() - timedelta(days=1)
+        return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
+            :-3
+        ]  # Truncate to milliseconds
+
+    @staticmethod
+    def _get_default_end_timestamp_millis() -> str:
+        """Get default end timestamp (now) in milliseconds precision format"""
+        now = datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]  # Truncate to milliseconds
+
+    @staticmethod
+    def get_query_all_jobs(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
+
+        return f"""
+        SELECT
+            job_id,
+            user_name,
+            submitted_ts,
+            query,
+            queried_datasets
+        FROM
+            SYS.JOBS_RECENT
+        WHERE
+            STATUS = 'COMPLETED'
+            AND LENGTH(queried_datasets)>0
+            AND user_name != '$dremio$'
+            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
+        """
+
+    @staticmethod
+    def get_query_all_jobs_cloud(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs in Dremio Cloud with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()

-
-    # queried_datasets correctly documented as [varchar]
-    QUERY_ALL_JOBS_CLOUD = """
+        return f"""
         SELECT
             job_id,
             user_name,

@@ -270,6 +329,8 @@ class DremioSQLQueries:
             AND ARRAY_SIZE(queried_datasets)>0
             AND user_name != '$dremio$'
             AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
     """

     QUERY_TYPES = [
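Two details in the new query builders are worth a closer look: %f in strftime yields microseconds, so the [:-3] slice truncates to millisecond precision, and the builders substitute the resulting strings into TIMESTAMP '...' literals. A small usage sketch (module path taken from the file list above; the timestamp values are hypothetical):

from datetime import datetime

from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries

# %f renders microseconds; slicing off the last three digits leaves milliseconds.
ts = datetime(2025, 1, 2, 3, 4, 5, 678901)
print(ts.strftime("%Y-%m-%d %H:%M:%S.%f"))       # 2025-01-02 03:04:05.678901
print(ts.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # 2025-01-02 03:04:05.678

# Explicit window in the 'YYYY-MM-DD HH:MM:SS.mmm' format the docstrings describe:
sql = DremioSQLQueries.get_query_all_jobs(
    start_timestamp_millis="2025-01-01 00:00:00.000",
    end_timestamp_millis="2025-01-02 00:00:00.000",
)
# With no arguments, the defaults (roughly the last 24 hours) are substituted:
sql_default = DremioSQLQueries.get_query_all_jobs()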
datahub/ingestion/source/file.py
CHANGED

@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,

@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.
datahub/ingestion/source/fivetran/fivetran.py
CHANGED

@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union

 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,

@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.
-from datahub.
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity

 # Logger instance
 logger = logging.getLogger(__name__)

@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)

     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []

         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.

@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
             )
         )

-        datajob.
-        datajob.
-        datajob.
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)

         return dict(
             **{

@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-            orchestrator=Constant.ORCHESTRATOR,
-            id=connector.connector_id,
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-            name=connector.connector_name,
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )

@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-            id=connector.connector_id,
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-            name=connector.connector_name,
-            owners=
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )

         # Map connector source and destination table with dataset entity

@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-        datajob.properties = {
-            **connector_properties,
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})

         return datajob

     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
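After this hunk (see _get_connector_workunits below), the source yields the v2 SDK entities directly, keeping the DataJobV1 bridge above only because DataProcessInstance.from_datajob still expects the old class. A sketch of the v2 surface as exercised by this diff, with hypothetical identifiers (DataFlowUrn argument order assumed to be orchestrator, flow id, cluster):

from datahub.metadata.urns import CorpUserUrn, DataFlowUrn
from datahub.sdk.dataflow import DataFlow
from datahub.sdk.datajob import DataJob

flow = DataFlow(
    platform="fivetran",  # Constant.ORCHESTRATOR in the source
    name="connector_id_123",
    env="PROD",
    display_name="My Postgres Sync",
)
job = DataJob(
    name="connector_id_123",
    flow_urn=DataFlowUrn("fivetran", "connector_id_123", "PROD"),
    display_name="My Postgres Sync",
    owners=[CorpUserUrn("jdoe@example.com")],
)
# The inlet/outlet lists are typed List[Union[str, DatasetUrn]] above, so plain
# URN strings are accepted alongside DatasetUrn objects.
job.set_inlets(["urn:li:dataset:(urn:li:dataPlatform:postgres,db.public.src,PROD)"])
job.set_outlets(["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.dst,PROD)"])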
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-        …
-        yield mcp.as_workunit()
+        yield dataflow

         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-        …
-        yield mcp.as_workunit()
+        yield datajob

         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:

@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
datahub/ingestion/source/gcs/gcs_source.py
CHANGED

@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
 from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
 from datahub.ingestion.source.data_lake_common.object_store import (

@@ -82,7 +83,14 @@ class GCSSourceReport(DataLakeSourceReport):
 @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
 @config_class(GCSSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.GCS_BUCKET,
+        SourceCapabilityModifier.FOLDER,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
 class GCSSource(StatefulIngestionSourceBase):

@@ -112,6 +120,7 @@ class GCSSource(StatefulIngestionSourceBase):
             env=self.config.env,
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
+            platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
         )
         return s3_config

@@ -138,7 +147,9 @@ class GCSSource(StatefulIngestionSourceBase):

     def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
         config = self.create_equivalent_s3_config()
-        s3_source = S3Source(config, ctx)
+        # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+        s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+        s3_source = S3Source(config, s3_ctx)
         return self.s3_source_overrides(s3_source)

     def s3_source_overrides(self, source: S3Source) -> S3Source:
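The fresh context in create_equivalent_s3_source is deliberately built without a graph: per the comment in the diff, reusing the GCS context (whose graph is set) would register a second stateful-ingestion checkpointer for the embedded S3 source. A minimal check of that property, assuming only the constructor arguments shown above:

from datahub.ingestion.api.common import PipelineContext

s3_ctx = PipelineContext(run_id="gcs-run-1", pipeline_name="gcs_pipeline")
assert s3_ctx.graph is None  # no graph attached, so no duplicate checkpointer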