acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
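Several of the source changes below, the cassandra.py hunks in particular, replace hand-built MetadataChangeProposalWrapper emission with the entity classes from the new datahub.sdk package (see the datahub/sdk/* entries above). A minimal sketch of that pattern follows; it assumes the Dataset constructor keywords and the as_workunits() helper behave exactly as they are used in the cassandra.py diff below, and the keyspace and table names are invented for illustration.

# Hedged sketch: mirrors the usage in the new cassandra.py code, not an official recipe.
from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="cassandra",
    name="example_keyspace.users",          # assumed example name
    env="PROD",
    subtype="Table",
    display_name="users",
    qualified_name="example_keyspace.users",
    description="Example table",
)

# A source can yield the entity itself and expand it into work units,
# which is what the new get_workunits_internal()/_get_metadata() split does below.
for workunit in dataset.as_workunits():
    print(workunit.id)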
datahub/ingestion/source/cassandra/cassandra.py

@@ -1,19 +1,14 @@
 import dataclasses
 import json
 import logging
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
 
 from datahub.emitter.mce_builder import (
-    make_data_platform_urn,
-    make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
 )
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
-    add_dataset_to_container,
-    gen_containers,
 )
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (

@@ -31,6 +26,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
     CassandraColumn,
     CassandraEntities,
     CassandraKeyspace,
+    CassandraSharedDatasetFields,
     CassandraTable,
     CassandraView,
 )

@@ -51,24 +47,21 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
-    SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
-    DataPlatformInstanceClass,
     DatasetLineageTypeClass,
-    DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageDownstreamTypeClass,
     FineGrainedLineageUpstreamTypeClass,
-    OtherSchemaClass,
-    SubTypesClass,
     UpstreamClass,
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 logger = logging.getLogger(__name__)
 

@@ -133,6 +126,13 @@ class CassandraSource(StatefulIngestionSourceBase):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        for metadata in self._get_metadata():
+            if isinstance(metadata, MetadataWorkUnit):
+                yield metadata
+            else:
+                yield from metadata.as_workunits()
+
+    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()

@@ -145,7 +145,7 @@ class CassandraSource(StatefulIngestionSourceBase):
                 self.report.report_dropped(keyspace_name)
                 continue
 
-            yield
+            yield self._generate_keyspace_container(keyspace)
 
             try:
                 yield from self._extract_tables_from_keyspace(keyspace_name)

@@ -170,21 +170,20 @@ class CassandraSource(StatefulIngestionSourceBase):
         if self.config.is_profiling_enabled():
             yield from self.profiler.get_workunits(self.cassandra_data)
 
-    def _generate_keyspace_container(
-        self, keyspace: CassandraKeyspace
-    ) -> Iterable[MetadataWorkUnit]:
+    def _generate_keyspace_container(self, keyspace: CassandraKeyspace) -> Container:
         keyspace_container_key = self._generate_keyspace_container_key(
             keyspace.keyspace_name
         )
-        (… 3 removed lines not shown in this view …)
+
+        return Container(
+            keyspace_container_key,
+            display_name=keyspace.keyspace_name,
             qualified_name=keyspace.keyspace_name,
+            subtype=DatasetContainerSubTypes.KEYSPACE,
             extra_properties={
                 "durable_writes": str(keyspace.durable_writes),
                 "replication": json.dumps(keyspace.replication),
             },
-            sub_types=[DatasetContainerSubTypes.KEYSPACE],
         )
 
     def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey:

@@ -196,105 +195,55 @@ class CassandraSource(StatefulIngestionSourceBase):
         )
 
     # get all tables for a given keyspace, iterate over them to extract column metadata
-    def _extract_tables_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_tables_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         self.cassandra_data.keyspaces.append(keyspace_name)
         tables: List[CassandraTable] = self.cassandra_api.get_tables(keyspace_name)
         for table in tables:
-            (… 16 removed lines not shown in this view …)
+            dataset = self._generate_table(keyspace_name, table)
+            if dataset:
+                yield dataset
+
+    def _generate_table(
+        self, keyspace_name: str, table: CassandraTable
+    ) -> Optional[Dataset]:
+        table_name: str = table.table_name
+        dataset_name: str = f"{keyspace_name}.{table_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="Table")
+        if not self.config.table_pattern.allowed(dataset_name):
+            self.report.report_dropped(dataset_name)
+            return None
+
+        self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name)
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, table_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from table",
+                context=dataset_name,
+                exc=e,
             )
 
-            (… 13 removed lines not shown in this view …)
-                entityUrn=dataset_urn,
-                aspect=StatusClass(removed=False),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.TABLE,
-                    ]
-                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=table_name,
-                    qualifiedName=f"{keyspace_name}.{table_name}",
-                    description=table.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(table.bloom_filter_fp_chance),
-                        "caching": json.dumps(table.caching),
-                        "compaction": json.dumps(table.compaction),
-                        "compression": json.dumps(table.compression),
-                        "crc_check_chance": str(table.crc_check_chance),
-                        "dclocal_read_repair_chance": str(
-                            table.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(table.default_time_to_live),
-                        "extensions": json.dumps(table.extensions),
-                        "gc_grace_seconds": str(table.gc_grace_seconds),
-                        "max_index_interval": str(table.max_index_interval),
-                        "min_index_interval": str(table.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            table.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(table.read_repair_chance),
-                        "speculative_retry": str(table.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
-
-            yield from add_dataset_to_container(
-                container_key=self._generate_keyspace_container_key(keyspace_name),
-                dataset_urn=dataset_urn,
-            )
-
-            if self.config.platform_instance:
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn,
-                    aspect=DataPlatformInstanceClass(
-                        platform=make_data_platform_urn(self.platform),
-                        instance=make_dataplatform_instance_urn(
-                            self.platform, self.config.platform_instance
-                        ),
-                    ),
-                ).as_workunit()
+        return Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.TABLE,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=table_name,
+            qualified_name=dataset_name,
+            description=table.comment,
+            custom_properties=self._get_dataset_custom_props(table),
+        )
 
     # get all columns for a given table, iterate over them to extract column metadata
     def _extract_columns_from_table(
-        self, keyspace_name: str, table_name: str
-    ) ->
+        self, keyspace_name: str, table_name: str
+    ) -> Optional[List[SchemaField]]:
         column_infos: List[CassandraColumn] = self.cassandra_api.get_columns(
             keyspace_name, table_name
         )

@@ -305,147 +254,117 @@ class CassandraSource(StatefulIngestionSourceBase):
             self.report.report_warning(
                 message="Table has no columns, skipping", context=table_name
             )
-            return
+            return None
 
+        # Tricky: we also save the column info to a global store.
         jsonable_column_infos: List[Dict[str, Any]] = []
         for column in column_infos:
             self.cassandra_data.columns.setdefault(table_name, []).append(column)
             jsonable_column_infos.append(dataclasses.asdict(column))
 
-        (… 1 removed line not shown in this view …)
-            schemaName=table_name,
-            platform=make_data_platform_urn(self.platform),
-            version=0,
-            hash="",
-            platformSchema=OtherSchemaClass(
-                rawSchema=json.dumps(jsonable_column_infos)
-            ),
-            fields=schema_fields,
-        )
-
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=schema_metadata,
-        ).as_workunit()
+        return schema_fields
 
-    def _extract_views_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_views_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name)
         for view in views:
-            (… 8 removed lines not shown in this view …)
+            dataset = self._generate_view(keyspace_name, view)
+            if dataset:
+                yield dataset
+
+    def _generate_view(
+        self, keyspace_name: str, view: CassandraView
+    ) -> Optional[Dataset]:
+        view_name: str = view.view_name
+        dataset_name: str = f"{keyspace_name}.{view_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="View")
+        if not self.config.table_pattern.allowed(dataset_name):
+            # TODO: Maybe add a view_pattern instead of reusing table_pattern?
+            self.report.report_dropped(dataset_name)
+            return None
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, view_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from views",
+                context=view_name,
+                exc=e,
            )
 
-        (… 14 removed lines not shown in this view …)
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=ViewPropertiesClass(
+        dataset = Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=view_name,
+            qualified_name=dataset_name,
+            description=view.comment,
+            custom_properties=self._get_dataset_custom_props(view),
+            extra_aspects=[
+                ViewPropertiesClass(
                     materialized=True,
                     viewLogic=view.where_clause,  # Use the WHERE clause as view logic
                     viewLanguage="CQL",  # Use "CQL" as the language
                 ),
-        (… 2 removed lines not shown in this view …)
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=DatasetPropertiesClass(
-                name=view_name,
-                qualifiedName=f"{keyspace_name}.{view_name}",
-                description=view.comment,
-                customProperties={
-                    "bloom_filter_fp_chance": str(view.bloom_filter_fp_chance),
-                    "caching": json.dumps(view.caching),
-                    "compaction": json.dumps(view.compaction),
-                    "compression": json.dumps(view.compression),
-                    "crc_check_chance": str(view.crc_check_chance),
-                    "include_all_columns": str(view.include_all_columns),
-                    "dclocal_read_repair_chance": str(
-                        view.dclocal_read_repair_chance
-                    ),
-                    "default_time_to_live": str(view.default_time_to_live),
-                    "extensions": json.dumps(view.extensions),
-                    "gc_grace_seconds": str(view.gc_grace_seconds),
-                    "max_index_interval": str(view.max_index_interval),
-                    "min_index_interval": str(view.min_index_interval),
-                    "memtable_flush_period_in_ms": str(
-                        view.memtable_flush_period_in_ms
-                    ),
-                    "read_repair_chance": str(view.read_repair_chance),
-                    "speculative_retry": str(view.speculative_retry),
-                },
-            ),
-        ).as_workunit()
+            ],
+        )
 
-        (… 9 removed lines not shown in this view …)
+        # Construct and emit lineage off of 'base_table_name'
+        # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name
+        upstream_urn: str = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=f"{keyspace_name}.{view.base_table_name}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+        fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource(
+            view_name, str(dataset.urn), upstream_urn
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    dataset=upstream_urn,
+                    type=DatasetLineageTypeClass.VIEW,
                 )
+            ],
+            fineGrainedLineages=fineGrainedLineages,
+        )
 
-        (… 10 removed lines not shown in this view …)
-        )
-        (… 16 removed lines not shown in this view …)
+        dataset.set_upstreams(upstream_lineage)
+
+        return dataset
+
+    def _get_dataset_custom_props(
+        self, dataset: CassandraSharedDatasetFields
+    ) -> Dict[str, str]:
+        props = {
+            "bloom_filter_fp_chance": str(dataset.bloom_filter_fp_chance),
+            "caching": json.dumps(dataset.caching),
+            "compaction": json.dumps(dataset.compaction),
+            "compression": json.dumps(dataset.compression),
+            "crc_check_chance": str(dataset.crc_check_chance),
+            "dclocal_read_repair_chance": str(dataset.dclocal_read_repair_chance),
+            "default_time_to_live": str(dataset.default_time_to_live),
+            "extensions": json.dumps(dataset.extensions),
+            "gc_grace_seconds": str(dataset.gc_grace_seconds),
+            "max_index_interval": str(dataset.max_index_interval),
+            "min_index_interval": str(dataset.min_index_interval),
+            "memtable_flush_period_in_ms": str(dataset.memtable_flush_period_in_ms),
+            "read_repair_chance": str(dataset.read_repair_chance),
+            "speculative_retry": str(dataset.speculative_retry),
+        }
+        if isinstance(dataset, CassandraView):
+            props.update(
+                {
+                    "include_all_columns": str(dataset.include_all_columns),
+                }
            )
-
-        if self.config.platform_instance:
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
+        return props
 
     def get_upstream_fields_of_field_in_datasource(
         self, table_name: str, dataset_urn: str, upstream_urn: str

datahub/ingestion/source/cassandra/cassandra_api.py

@@ -23,9 +23,9 @@ class CassandraKeyspace:
 
 
 @dataclass
-class
+class CassandraSharedDatasetFields:
     keyspace_name: str
-    (… 1 removed line not shown in this view …)
+
     bloom_filter_fp_chance: Optional[float]
     caching: Optional[Dict[str, str]]
     comment: Optional[str]

@@ -43,6 +43,11 @@ class CassandraTable:
     speculative_retry: Optional[str]
 
 
+@dataclass
+class CassandraTable(CassandraSharedDatasetFields):
+    table_name: str
+
+
 @dataclass
 class CassandraColumn:
     keyspace_name: str

@@ -55,8 +60,10 @@ class CassandraColumn:
 
 
 @dataclass
-class CassandraView(
+class CassandraView(CassandraSharedDatasetFields):
     view_name: str
+
+    base_table_name: str
     include_all_columns: Optional[bool]
     where_clause: str = ""
 

@@ -152,7 +159,8 @@ class CassandraAPI:
             self.report.failure(message="Failed to authenticate to Cassandra", exc=e)
             return False
 
-    def get(self, query: str, parameters: Optional[List] =
+    def get(self, query: str, parameters: Optional[List] = None) -> List:
+        parameters = parameters or []
         if not self._cassandra_session:
             return []
 

@@ -261,7 +269,7 @@ class CassandraAPI:
         views = self.get(CassandraQueries.GET_VIEWS_QUERY, [keyspace_name])
         view_list = [
             CassandraView(
-                (… 1 removed line not shown in this view …)
+                base_table_name=row.base_table_name,
                 keyspace_name=row.keyspace_name,
                 view_name=row.view_name,
                 bloom_filter_fp_chance=row.bloom_filter_fp_chance,

datahub/ingestion/source/common/gcp_credentials_config.py (new file)

@@ -0,0 +1,53 @@
+import json
+import tempfile
+from typing import Any, Dict, Optional
+
+from pydantic import Field, root_validator
+
+from datahub.configuration import ConfigModel
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
+
+
+class GCPCredential(ConfigModel):
+    project_id: Optional[str] = Field(description="Project id to set the credentials")
+    private_key_id: str = Field(description="Private key id")
+    private_key: str = Field(
+        description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
+    )
+    client_email: str = Field(description="Client email")
+    client_id: str = Field(description="Client Id")
+    auth_uri: str = Field(
+        default="https://accounts.google.com/o/oauth2/auth",
+        description="Authentication uri",
+    )
+    token_uri: str = Field(
+        default="https://oauth2.googleapis.com/token", description="Token uri"
+    )
+    auth_provider_x509_cert_url: str = Field(
+        default="https://www.googleapis.com/oauth2/v1/certs",
+        description="Auth provider x509 certificate url",
+    )
+    type: str = Field(default="service_account", description="Authentication type")
+    client_x509_cert_url: Optional[str] = Field(
+        default=None,
+        description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
+    )
+
+    _fix_private_key_newlines = pydantic_multiline_string("private_key")
+
+    @root_validator(skip_on_failure=True)
+    def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        if values.get("client_x509_cert_url") is None:
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
+        return values
+
+    def create_credential_temp_file(self, project_id: Optional[str] = None) -> str:
+        configs = self.dict()
+        if project_id:
+            configs["project_id"] = project_id
+        with tempfile.NamedTemporaryFile(delete=False) as fp:
+            cred_json = json.dumps(configs, indent=4, separators=(",", ": "))
+            fp.write(cred_json.encode())
+            return fp.name

datahub/ingestion/source/common/subtypes.py

@@ -60,8 +60,15 @@ class BIContainerSubTypes(StrEnum):
     MODE_COLLECTION = "Collection"
 
 
+class FlowContainerSubTypes(StrEnum):
+    MSSQL_JOB = "Job"
+    MSSQL_PROCEDURE_CONTAINER = "Procedures Container"
+
+
 class JobContainerSubTypes(StrEnum):
     NIFI_PROCESS_GROUP = "Process Group"
+    MSSQL_JOBSTEP = "Job Step"
+    MSSQL_STORED_PROCEDURE = "Stored Procedure"
 
 
 class BIAssetSubTypes(StrEnum):

@@ -85,3 +92,8 @@ class BIAssetSubTypes(StrEnum):
     # SAP Analytics Cloud
     SAC_STORY = "Story"
     SAC_APPLICATION = "Application"
+
+
+class MLAssetSubTypes(StrEnum):
+    MLFLOW_TRAINING_RUN = "ML Training Run"
+    MLFLOW_EXPERIMENT = "ML Experiment"

datahub/ingestion/source/csv_enricher.py

@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType
+        }.get(entityType)
 
         if not entityClass:
             raise ValueError(

@@ -640,8 +640,8 @@ class CSVEnricherSource(Source):
                 )
             except Exception as e:
                 raise ConfigurationError(
-                    f"Cannot read remote file {self.config.filename}
-                )
+                    f"Cannot read remote file {self.config.filename}: {e}"
+                ) from e
         else:
             with open(pathlib.Path(self.config.filename), encoding="utf-8-sig") as f:
                 rows = list(csv.DictReader(f, delimiter=self.config.delimiter))

datahub/ingestion/source/data_lake_common/path_spec.py

@@ -454,10 +454,8 @@ class PathSpec(ConfigModel):
                 return None
             partition = partition_split[0]
             # If partition is in the form of /value1/value2/value3 we infer it from the path and assign partition_0, partition_1, partition_2 etc
-            num
-            for partition_value in partition.split("/"):
+            for num, partition_value in enumerate(partition.split("/")):
                 partition_keys.append((f"partition_{num}", partition_value))
-                num += 1
             return partition_keys
 
         return None