acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
Selected diff hunks follow; the registry rendering truncates the content of some removed (-) lines.

datahub/ingestion/source/iceberg/iceberg_common.py

@@ -1,11 +1,15 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional

 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog.rest import RestCatalog
+from requests.adapters import HTTPAdapter
 from sortedcontainers import SortedList
+from urllib3.util import Retry

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -25,6 +29,23 @@ from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

 logger = logging.getLogger(__name__)

+DEFAULT_REST_TIMEOUT = 120
+DEFAULT_REST_RETRY_POLICY = {"total": 3, "backoff_factor": 0.1}
+
+
+class TimeoutHTTPAdapter(HTTPAdapter):
+    def __init__(self, *args, **kwargs):
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
+
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None and hasattr(self, "timeout"):
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
+

 class IcebergProfilingConfig(ConfigModel):
     enabled: bool = Field(
@@ -145,7 +166,26 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         logger.debug(
             "Initializing the catalog %s with config: %s", catalog_name, catalog_config
         )
-
+        catalog = load_catalog(name=catalog_name, **catalog_config)
+        if isinstance(catalog, RestCatalog):
+            logger.debug(
+                "Recognized REST catalog type being configured, attempting to configure HTTP Adapter for the session"
+            )
+            retry_policy: Dict[str, Any] = DEFAULT_REST_RETRY_POLICY.copy()
+            retry_policy.update(catalog_config.get("connection", {}).get("retry", {}))
+            retries = Retry(**retry_policy)
+            logger.debug(f"Retry policy to be set: {retry_policy}")
+            timeout = catalog_config.get("connection", {}).get(
+                "timeout", DEFAULT_REST_TIMEOUT
+            )
+            logger.debug(f"Timeout to be set: {timeout}")
+            catalog._session.mount(
+                "http://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
+            )
+            catalog._session.mount(
+                "https://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
+            )
+        return catalog


 class TopTableTimings:
@@ -156,18 +196,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()

     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.
-
-        self.top_entites.
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()

     def __str__(self) -> str:
-
-
-
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))


 class TimingClass:
@@ -175,24 +218,31 @@ class TimingClass:

     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()

     def add_timing(self, t: float) -> None:
-        self.
+        with self._lock:
+            self.times.add(t)

     def __str__(self) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
+                }
+            )


 @dataclass
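The REST-catalog change above amounts to mounting a timeout-aware adapter and a urllib3 retry policy onto the catalog's requests session. Below is a minimal, self-contained sketch of the same pattern against a plain requests.Session (not the packaged code; the timeout and retry values simply mirror the defaults shown above):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that falls back to a default timeout when the caller passes none."""

    def __init__(self, *args, timeout=120, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


session = requests.Session()
retries = Retry(total=3, backoff_factor=0.1)  # mirrors DEFAULT_REST_RETRY_POLICY
adapter = TimeoutHTTPAdapter(timeout=120, max_retries=retries)
session.mount("http://", adapter)
session.mount("https://", adapter)

Every request sent through such a session then gets the default timeout unless one is passed explicitly, and transient connection failures are retried with a small backoff.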
datahub/ingestion/source/identity/okta.py

@@ -5,7 +5,7 @@ import urllib
 from collections import defaultdict
 from dataclasses import dataclass, field
 from time import sleep
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Set, Union

 import nest_asyncio
 from okta.client import Client as OktaClient
@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field

-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()


-class OktaConfig(StatefulIngestionSourceBase, ConfigModel):
+class OktaConfig(StatefulIngestionSourceBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
@@ -77,6 +76,10 @@ class OktaConfig(StatefulIngestionSourceBase, ConfigModel):
         default=True,
         description="Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.",
     )
+    ingest_groups_users: bool = Field(
+        default=True,
+        description="Only ingest users belonging to the selected groups. This option is only useful when `ingest_users` is set to False and `ingest_group_membership` to True.",
+    )

     # Optional: Customize the mapping to DataHub Username from an attribute appearing in the Okta User
     # profile. Reference: https://developer.okta.com/docs/reference/api/users/
@@ -344,6 +347,7 @@ class OktaSource(StatefulIngestionSourceBase):
                     aspect=StatusClass(removed=False),
                 ).as_workunit()

+        okta_users: Set[User] = set()
         # Step 2: Populate GroupMembership Aspects for CorpUsers
         datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
             defaultdict(lambda: GroupMembershipClass(groups=[]))
@@ -372,6 +376,9 @@ class OktaSource(StatefulIngestionSourceBase):
                         self.report.report_failure("okta_user_mapping", error_str)
                         continue

+                    if self.config.ingest_groups_users:
+                        okta_users.add(okta_user)
+
                     # Update the GroupMembership aspect for this group member.
                     datahub_corp_user_urn_to_group_membership[
                         datahub_corp_user_urn
@@ -379,7 +386,10 @@ class OktaSource(StatefulIngestionSourceBase):

         # Step 3: Produce MetadataWorkUnits for CorpUsers.
         if self.config.ingest_users:
-
+            # we can just throw away collected okta users so far and fetch them all
+            okta_users = set(self._get_okta_users(event_loop))
+
+        if okta_users:
             filtered_okta_users = filter(self._filter_okta_user, okta_users)
             datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users)
             for user_count, datahub_corp_user_snapshot in enumerate(
@@ -558,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
         if (
             self.config.include_deprovisioned_users is False
             and okta_user.status == UserStatus.DEPROVISIONED
-        )
-            return False
-        elif (
+        ) or (
             self.config.include_suspended_users is False
             and okta_user.status == UserStatus.SUSPENDED
         ):
@@ -658,6 +666,27 @@ class OktaSource(StatefulIngestionSourceBase):
             self.config.okta_profile_to_username_regex,
         )

+    def _map_okta_user_profile_custom_properties(
+        self, profile: UserProfile
+    ) -> Dict[str, str]:
+        # filter out the common fields that are already mapped to the CorpUserInfo aspect and the private ones
+        return {
+            k: str(v)
+            for k, v in profile.__dict__.items()
+            if v
+            and k
+            not in [
+                "displayName",
+                "firstName",
+                "lastName",
+                "email",
+                "title",
+                "countryCode",
+                "department",
+            ]
+            and not k.startswith("_")
+        }
+
     # Converts Okta User Profile into a CorpUserInfo.
     def _map_okta_user_profile(self, profile: UserProfile) -> CorpUserInfoClass:
         # TODO: Extract user's manager if provided.
@@ -675,6 +704,7 @@ class OktaSource(StatefulIngestionSourceBase):
             title=profile.title,
             countryCode=profile.countryCode,
             departmentName=profile.department,
+            customProperties=self._map_okta_user_profile_custom_properties(profile),
         )

     def _make_corp_group_urn(self, name: str) -> str:
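The new _map_okta_user_profile_custom_properties helper keeps every non-empty, non-private Okta profile attribute that is not already mapped onto CorpUserInfo. A rough standalone sketch of that filtering, using a plain object in place of okta.models.UserProfile (the attribute names below are made up):

from types import SimpleNamespace

MAPPED_FIELDS = {
    "displayName", "firstName", "lastName", "email", "title", "countryCode", "department",
}


def custom_properties(profile) -> dict:
    # keep truthy, non-private attributes that CorpUserInfo does not already cover
    return {
        k: str(v)
        for k, v in profile.__dict__.items()
        if v and k not in MAPPED_FIELDS and not k.startswith("_")
    }


profile = SimpleNamespace(
    firstName="Ada",            # already mapped -> dropped
    email="ada@example.com",    # already mapped -> dropped
    employeeNumber="E-1234",    # kept as a custom property
    costCenter=None,            # empty -> dropped
    _internal_flag=True,        # private -> dropped
)
print(custom_properties(profile))  # {'employeeNumber': 'E-1234'}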
datahub/ingestion/source/kafka/kafka.py

@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             return schema_registry_class.create(config, report)
         except Exception as e:
             logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e

     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
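The one-line change above switches to explicit exception chaining. A tiny illustration of what `raise ... from e` adds, independent of the Kafka source (the helper and module name below are deliberately made up):

def load_plugin(name: str):
    try:
        __import__(name)
    except Exception as e:
        # chain the original error so its traceback is preserved as __cause__
        raise ImportError(name) from e


try:
    load_plugin("definitely_not_a_real_module")
except ImportError as err:
    print(type(err.__cause__).__name__)  # ModuleNotFoundError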
datahub/ingestion/source/kafka_connect/common.py

@@ -110,7 +110,7 @@ class ConnectorManifest:

     name: str
     type: str
-    config: Dict
+    config: Dict[str, str]
     tasks: Dict
     url: Optional[str] = None
     flow_property_bag: Optional[Dict[str, str]] = None
@@ -141,12 +141,7 @@ def get_dataset_name(
     database_name: Optional[str],
     source_table: str,
 ) -> str:
-    if database_name:
-        dataset_name = database_name + "." + source_table
-    else:
-        dataset_name = source_table
-
-    return dataset_name
+    return database_name + "." + source_table if database_name else source_table


 def get_platform_instance(
datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Iterable, List, Optional, Type
+from typing import Dict, Iterable, List, Optional, Type

 import jpype
 import jpype.imports
@@ -121,7 +121,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.config, self.config.provided_configs
         )
         connector_manifest.url = connector_url
-        connector_manifest.topic_names = self._get_connector_topics(
+        connector_manifest.topic_names = self._get_connector_topics(
+            connector_name=connector_name,
+            config=connector_manifest.config,
+            connector_type=connector_manifest.type,
+        )
         connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""

         class_type: Type[BaseConnector] = BaseConnector
@@ -203,7 +207,9 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         return response.json()

-    def _get_connector_topics(
+    def _get_connector_topics(
+        self, connector_name: str, config: Dict[str, str], connector_type: str
+    ) -> List[str]:
         try:
             response = self.session.get(
                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
@@ -215,7 +221,21 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             return []

-
+        processed_topics = response.json()[connector_name]["topics"]
+
+        if connector_type == SINK:
+            try:
+                return SinkTopicFilter().filter_stale_topics(processed_topics, config)
+            except Exception as e:
+                self.report.warning(
+                    title="Error parsing sink conector topics configuration",
+                    message="Some stale lineage tasks might show up for connector",
+                    context=connector_name,
+                    exc=e,
+                )
+                return processed_topics
+        else:
+            return processed_topics

     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
@@ -359,3 +379,76 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         return builder.make_dataset_urn_with_platform_instance(
             platform, name, platform_instance, self.config.env
         )
+
+
+class SinkTopicFilter:
+    """Helper class to filter Kafka Connect topics based on configuration."""
+
+    def filter_stale_topics(
+        self,
+        processed_topics: List[str],
+        sink_config: Dict[str, str],
+    ) -> List[str]:
+        """
+        Kafka-connect's /topics API returns the set of topic names the connector has been using
+        since its creation or since the last time its set of active topics was reset. This means-
+        if a topic was ever used by a connector, it will be returned, even if it is no longer used.
+        To remove these stale topics from the list, we double-check the list returned by the API
+        against the sink connector's config.
+        Sink connectors configure exactly one of `topics` or `topics.regex`
+        https://kafka.apache.org/documentation/#sinkconnectorconfigs_topics
+
+        Args:
+            processed_topics: List of topics currently being processed
+            sink_config: Configuration dictionary for the sink connector
+
+        Returns:
+            List of filtered topics that match the configuration
+
+        Raises:
+            ValueError: If sink connector configuration is missing both 'topics' and 'topics.regex' fields
+
+        """
+        # Absence of topics config is a defensive NOOP,
+        # although this should never happen in real world
+        if not self.has_topic_config(sink_config):
+            logger.warning(
+                f"Found sink without topics config {sink_config.get(CONNECTOR_CLASS)}"
+            )
+            return processed_topics
+
+        # Handle explicit topic list
+        if sink_config.get("topics"):
+            return self._filter_by_topic_list(processed_topics, sink_config["topics"])
+        else:
+            # Handle regex pattern
+            return self._filter_by_topic_regex(
+                processed_topics, sink_config["topics.regex"]
+            )
+
+    def has_topic_config(self, sink_config: Dict[str, str]) -> bool:
+        """Check if sink config has either topics or topics.regex."""
+        return bool(sink_config.get("topics") or sink_config.get("topics.regex"))
+
+    def _filter_by_topic_list(
+        self, processed_topics: List[str], topics_config: str
+    ) -> List[str]:
+        """Filter topics based on explicit topic list from config."""
+        config_topics = [
+            topic.strip() for topic in topics_config.split(",") if topic.strip()
+        ]
+        return [topic for topic in processed_topics if topic in config_topics]
+
+    def _filter_by_topic_regex(
+        self, processed_topics: List[str], regex_pattern: str
+    ) -> List[str]:
+        """Filter topics based on regex pattern from config."""
+        from java.util.regex import Pattern
+
+        regex_matcher = Pattern.compile(regex_pattern)
+
+        return [
+            topic
+            for topic in processed_topics
+            if regex_matcher.matcher(topic).matches()
+        ]
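The new SinkTopicFilter trims the topic list reported by Kafka Connect's /topics endpoint down to what the sink's `topics` or `topics.regex` setting still selects. A rough standalone sketch of that logic; note the packaged code matches the regex with java.util.regex via jpype, whereas this sketch substitutes Python's re.fullmatch:

import re
from typing import Dict, List


def filter_stale_topics(reported: List[str], sink_config: Dict[str, str]) -> List[str]:
    if topics := sink_config.get("topics"):
        wanted = {t.strip() for t in topics.split(",") if t.strip()}
        return [t for t in reported if t in wanted]
    if pattern := sink_config.get("topics.regex"):
        return [t for t in reported if re.fullmatch(pattern, t)]
    return reported  # defensive no-op when neither option is configured


reported = ["orders", "orders_old", "payments"]
print(filter_stale_topics(reported, {"topics": "orders, payments"}))  # ['orders', 'payments']
print(filter_stale_topics(reported, {"topics.regex": "orders.*"}))    # ['orders', 'orders_old']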
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -175,7 +175,7 @@ class BigQuerySinkConnector(BaseConnector):
     class BQParser:
         project: str
         target_platform: str
-        sanitizeTopics:
+        sanitizeTopics: bool
         transforms: list
         topicsToTables: Optional[str] = None
         datasets: Optional[str] = None
@@ -187,7 +187,7 @@ class BigQuerySinkConnector(BaseConnector):
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
         project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics"
+        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
         transform_names = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -107,9 +107,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             assert database_name
             db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"

-        topic_prefix = self.connector_manifest.config.get("topic.prefix"
+        topic_prefix = self.connector_manifest.config.get("topic.prefix") or ""

-        query = self.connector_manifest.config.get("query"
+        query = self.connector_manifest.config.get("query") or ""

         transform_names = (
             self.connector_manifest.config.get("transforms", "").split(",")
@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
     ) -> DebeziumParser:
         connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

-        if
-
-
-
-            database_name=None,
-        )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
             parser = self.DebeziumParser(
                 source_platform="mysql",
                 server_name=self.get_server_name(connector_manifest),
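Several of the changed lines above swap `config.get(key, default)` for `config.get(key) or default`. The difference matters when a key is present but maps to None or another falsy value; a quick illustration with a made-up connector config:

config = {"topic.prefix": None}

print(repr(config.get("topic.prefix", "")))    # None -- the default only applies when the key is missing
print(repr(config.get("topic.prefix") or ""))  # ''   -- also normalizes None and other falsy values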
datahub/ingestion/source/looker/looker_common.py

@@ -923,7 +923,7 @@ class LookerExplore:
             tags=cast(List, dict.get("tags")) if dict.get("tags") is not None else [],
         )

-    @classmethod
+    @classmethod
     def from_api(  # noqa: C901
         cls,
         model: str,
@@ -931,7 +931,7 @@ class LookerExplore:
         client: LookerAPI,
         reporter: SourceReport,
         source_config: LookerDashboardSourceConfig,
-    ) -> Optional["LookerExplore"]:
+    ) -> Optional["LookerExplore"]:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
@@ -1183,7 +1183,7 @@ class LookerExplore:
             base_url = remove_port_from_url(base_url)
         return f"{base_url}/embed/explore/{self.model_name}/{self.name}"

-    def _to_metadata_events(
+    def _to_metadata_events(
         self,
         config: LookerCommonConfig,
         reporter: SourceReport,
@@ -1673,10 +1673,11 @@ class LookerUserRegistry:
             primary_key="",
         )

-        # Extract user email mappings
+        # Extract user email mappings.
+        # Sort it to ensure the order is deterministic.
         user_email_cache = {
             user_id: user.email
-            for user_id, user in self._user_cache.items()
+            for user_id, user in sorted(self._user_cache.items())
             if user.email
         }

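The LookerUserRegistry hunk sorts the cache items before building the email mapping; plain dicts iterate in insertion order, so without the sort the emitted mapping could vary between runs that fetched users in a different order. A small illustration with made-up user ids:

run_a = {"user_2": "b@example.com", "user_1": "a@example.com"}
run_b = {"user_1": "a@example.com", "user_2": "b@example.com"}

# same contents, but insertion order differs between the two runs
print(list(run_a.items()) == list(run_b.items()))      # False
print(sorted(run_a.items()) == sorted(run_b.items()))  # True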
datahub/ingestion/source/looker/looker_file_loader.py

@@ -33,14 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] =
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}

     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
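This hunk, and the looker_lib_wrapper, looker_template_language, and lookml_source hunks below, all replace a mutable default argument with `Optional[...] = None` plus an `or {}` (or equivalent) fallback inside the function. A short sketch of the pitfall that pattern avoids:

from typing import Dict, Optional


def risky(constants: Dict[str, str] = {}) -> Dict[str, str]:
    constants["seen"] = "yes"  # mutates the single dict shared by every call
    return constants


def safe(constants: Optional[Dict[str, str]] = None) -> Dict[str, str]:
    constants = constants or {}  # fresh dict each time nothing is passed in
    constants["seen"] = "yes"
    return constants


print(risky() is risky())  # True  -- both calls returned the same shared default
print(safe() is safe())    # False -- each call got its own dict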
datahub/ingestion/source/looker/looker_lib_wrapper.py

@@ -205,8 +205,9 @@ class LookerAPI:
     def folder_ancestors(
         self,
         folder_id: str,
-        fields: Union[str, List[str]] =
+        fields: Optional[Union[str, List[str]]] = None,
     ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
         self.client_stats.folder_calls += 1
         try:
             return self.client.folder_ancestors(
datahub/ingestion/source/looker/looker_source.py

@@ -383,7 +383,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

         self.reachable_explores[(model, explore)].append(via)

-    def _get_looker_dashboard_element(
+    def _get_looker_dashboard_element(
         self, element: DashboardElement
     ) -> Optional[LookerDashboardElement]:
         # Dashboard elements can use raw usage_queries against explores
datahub/ingestion/source/looker/looker_template_language.py

@@ -464,9 +464,10 @@ def process_lookml_template_language(
     source_config: LookMLSourceConfig,
     view_lkml_file_dict: dict,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> None:
+    manifest_constants = manifest_constants or {}
     if "views" not in view_lkml_file_dict:
         return

@@ -507,9 +508,10 @@ def load_and_preprocess_file(
     path: Union[str, pathlib.Path],
     source_config: LookMLSourceConfig,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> dict:
+    manifest_constants = manifest_constants or {}
     parsed = load_lkml(path)

     process_lookml_template_language(
datahub/ingestion/source/looker/lookml_source.py

@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
-            )
+            ) from None

     def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
         manifest_file = folder / "manifest.lkml"
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
     def report_skipped_unreachable_views(
         self,
         viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] =
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
     ) -> None:
+        processed_view_map = processed_view_map or {}
         view_files: Dict[str, List[pathlib.Path]] = {}
         for project, folder_path in self.base_projects_folder.items():
             folder = pathlib.Path(folder_path)