acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +4 -1
- datahub/ingestion/api/source_helpers.py +26 -1
- datahub/ingestion/graph/client.py +104 -0
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED
@@ -43,7 +43,7 @@ class AllowedValue(ConfigModel):
 
 
 VALID_ENTITY_TYPE_URNS = [
-    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
+    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
 ]
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
datahub/cli/ingest_cli.py
CHANGED
@@ -216,9 +216,9 @@ def run(
 @click.option(
     "--executor-id",
     type=str,
-    default="default",
     help="Executor id to route execution requests to. Do not use this unless you have configured a custom executor.",
     required=False,
+    default=None,
 )
 @click.option(
     "--cli-version",
@@ -239,7 +239,7 @@ def run(
     type=str,
     help="Timezone for the schedule in 'America/New_York' format. Uses UTC by default.",
     required=False,
-    default=
+    default=None,
 )
 @click.option(
     "--debug", type=bool, help="Should we debug.", required=False, default=False
@@ -255,10 +255,10 @@ def deploy(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> None:
datahub/cli/migrate.py
CHANGED
@@ -76,13 +76,13 @@ class MigrationReport:
     def __repr__(self) -> str:
         repr = f"{self._get_prefix()}Migration Report:\n--------------\n"
         repr += f"{self._get_prefix()}Migration Run Id: {self.run_id}\n"
-        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created
-        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected
-        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated
+        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created]))}\n"
+        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected]))}\n"
+        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated]))}\n"
         repr += f"{self._get_prefix()}Details:\n"
-        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created
-        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected
-        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated
+        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created]) or 'None'}\n"
+        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected]) or 'None'}\n"
+        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated]) or 'None'}\n"
         return repr
 
 
datahub/configuration/common.py
CHANGED
@@ -317,7 +317,7 @@ class KeyValuePattern(ConfigModel):
         return KeyValuePattern()
 
     def value(self, string: str) -> List[str]:
-        matching_keys = [key for key in self.rules
+        matching_keys = [key for key in self.rules if re.match(key, string)]
         if not matching_keys:
             return []
         elif self.first_match_only:
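
For context on the rewritten value() above: the first-level keys of rules are treated as regular expressions and matched against the input string with re.match, and the mapped value lists are returned for matching keys. A minimal sketch of that behavior (the rule regex and tag values are made up for illustration):

    from datahub.configuration.common import KeyValuePattern

    # Hypothetical pattern: regex keys map to value lists; first_match_only is the
    # field checked a few lines below the change shown above.
    pattern = KeyValuePattern.parse_obj(
        {"rules": {r"users\..*": ["urn:li:tag:PII"]}, "first_match_only": True}
    )
    assert pattern.value("users.signups") == ["urn:li:tag:PII"]
    assert pattern.value("orders.daily") == []  # no rule key matches, so an empty list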
datahub/emitter/mcp_builder.py
CHANGED
datahub/ingestion/api/common.py
CHANGED
@@ -12,6 +12,9 @@ if TYPE_CHECKING:
 
 T = TypeVar("T")
 
+if TYPE_CHECKING:
+    from datahub.ingestion.run.pipeline_config import FlagsConfig
+
 
 @dataclass
 class RecordEnvelope(Generic[T]):
@@ -60,6 +63,12 @@ class PipelineContext:
 
         self._set_dataset_urn_to_lower_if_needed()
 
+    @property
+    def flags(self) -> "FlagsConfig":
+        from datahub.ingestion.run.pipeline_config import FlagsConfig
+
+        return self.pipeline_config.flags if self.pipeline_config else FlagsConfig()
+
     def _set_dataset_urn_to_lower_if_needed(self) -> None:
         # TODO: Get rid of this function once lower-casing is the standard.
         if self.graph:
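
The new flags property above gives any holder of a PipelineContext access to the pipeline-level flags without importing the pipeline config at module import time. A rough sketch, assuming a bare context as typically constructed in tests (no pipeline_config attached, so a default FlagsConfig is returned):

    from datahub.ingestion.api.common import PipelineContext

    # Hypothetical standalone context; with no pipeline_config set, ctx.flags falls
    # back to a freshly constructed FlagsConfig with its default values.
    ctx = PipelineContext(run_id="demo-run")
    flags = ctx.flags
    print(type(flags).__name__, flags.set_system_metadata)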
datahub/ingestion/api/source.py
CHANGED
@@ -39,6 +39,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
     auto_browse_path_v2,
     auto_fix_duplicate_schema_field_paths,
     auto_fix_empty_field_paths,
@@ -475,8 +476,10 @@ class Source(Closeable, metaclass=ABCMeta):
         return stream
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = self.get_workunit_processors()
+        workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
         return self._apply_workunit_processors(
-
+            workunit_processors, auto_workunit(self.get_workunits_internal())
         )
 
     def get_workunits_internal(
datahub/ingestion/api/source_helpers.py
CHANGED
@@ -13,9 +13,14 @@ from typing import (
 )
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import
+from datahub.emitter.mce_builder import (
+    get_sys_time,
+    make_dataplatform_instance_urn,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
@@ -544,3 +549,23 @@ def _prepend_platform_instance(
         return [BrowsePathEntryClass(id=urn, urn=urn)] + entries
 
     return entries
+
+
+class AutoSystemMetadata:
+    def __init__(self, ctx: PipelineContext):
+        self.ctx = ctx
+
+    def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
+        for wu in stream:
+            yield self.stamp_wu(wu)
+
+    def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        if self.ctx.flags.set_system_metadata:
+            if not wu.metadata.systemMetadata:
+                wu.metadata.systemMetadata = SystemMetadataClass()
+            wu.metadata.systemMetadata.runId = self.ctx.run_id
+            if not wu.metadata.systemMetadata.lastObserved:
+                wu.metadata.systemMetadata.lastObserved = get_sys_time()
+            if self.ctx.flags.set_system_metadata_pipeline_name:
+                wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
+        return wu
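
Since AutoSystemMetadata is now appended as the last work-unit processor in Source.get_workunits() (see the source.py hunk above), every emitted work unit gets runId and lastObserved stamped when the corresponding flags are enabled. A minimal sketch of that behavior, assuming default flags and an illustrative URN/aspect:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.api.source_helpers import AutoSystemMetadata
    from datahub.metadata.schema_classes import StatusClass

    ctx = PipelineContext(run_id="demo-run")
    wu = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hex,example.table,PROD)",
        aspect=StatusClass(removed=False),
    ).as_workunit()

    # Calling the processor directly; with the default flags, runId and lastObserved
    # are written onto the work unit's systemMetadata.
    stamped = list(AutoSystemMetadata(ctx).stamp([wu]))
    print(stamped[0].metadata.systemMetadata.runId)  # "demo-run"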
datahub/ingestion/graph/client.py
CHANGED
@@ -27,6 +27,7 @@ from pydantic import BaseModel
 from requests.models import HTTPError
 from typing_extensions import deprecated
 
+from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
@@ -1697,6 +1698,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
         return res["runAssertionsForAsset"]
 
+    @deprecated("Use get_entities instead which returns typed aspects")
    def get_entities_v2(
        self,
        entity_name: str,
@@ -1736,6 +1738,108 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 retval[entity_urn][aspect_key] = aspect_value
         return retval
 
+    def get_entities(
+        self,
+        entity_name: str,
+        urns: List[str],
+        aspects: Optional[List[str]] = None,
+        with_system_metadata: bool = False,
+    ) -> Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]]:
+        """
+        Get entities using the OpenAPI v3 endpoint, deserializing aspects into typed objects.
+
+        Args:
+            entity_name: The entity type name
+            urns: List of entity URNs to fetch
+            aspects: Optional list of aspect names to fetch. If None, all aspects will be fetched.
+            with_system_metadata: If True, return system metadata along with each aspect.
+
+        Returns:
+            A dictionary mapping URNs to a dictionary of aspect name to tuples of
+            (typed aspect object, system metadata). If with_system_metadata is False,
+            the system metadata in the tuple will be None.
+        """
+        aspects = aspects or []
+
+        request_payload = []
+        for urn in urns:
+            entity_request: Dict[str, Any] = {"urn": urn}
+            for aspect_name in aspects:
+                entity_request[aspect_name] = {}
+            request_payload.append(entity_request)
+
+        headers: Dict[str, Any] = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        if with_system_metadata:
+            url += "?systemMetadata=true"
+
+        response = self._session.post(
+            url, data=json.dumps(request_payload), headers=headers
+        )
+        response.raise_for_status()
+        entities = response.json()
+
+        result: Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]] = {}
+
+        for entity in entities:
+            entity_urn = entity.get("urn")
+            if entity_urn is None:
+                logger.warning(
+                    f"Missing URN in entity response: {entity}, skipping deserialization"
+                )
+                continue
+
+            entity_aspects: Dict[
+                str, Tuple[_Aspect, Optional[SystemMetadataClass]]
+            ] = {}
+
+            for aspect_name, aspect_obj in entity.items():
+                if aspect_name == "urn":
+                    continue
+
+                aspect_class = ASPECT_NAME_MAP.get(aspect_name)
+                if aspect_class is None:
+                    logger.warning(
+                        f"Unknown aspect type {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                aspect_value = aspect_obj.get("value")
+                if aspect_value is None:
+                    logger.warning(
+                        f"Unknown aspect value for aspect {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                try:
+                    post_json_obj = post_json_transform(aspect_value)
+                    typed_aspect = aspect_class.from_obj(post_json_obj)
+                    assert isinstance(typed_aspect, aspect_class) and isinstance(
+                        typed_aspect, _Aspect
+                    )
+
+                    system_metadata = None
+                    if with_system_metadata:
+                        system_metadata_obj = aspect_obj.get("systemMetadata")
+                        if system_metadata_obj:
+                            system_metadata = SystemMetadataClass.from_obj(
+                                system_metadata_obj
+                            )
+
+                    entity_aspects[aspect_name] = (typed_aspect, system_metadata)
+                except Exception as e:
+                    logger.error(f"Error deserializing aspect {aspect_name}: {e}")
+                    raise
+
+            if entity_aspects:
+                result[entity_urn] = entity_aspects
+
+        return result
+
     def upsert_custom_assertion(
         self,
         urn: Optional[str],
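
A hedged usage sketch of the new batch read, based only on the signature and docstring above; the server address, URN, and aspect names are placeholders:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"

    # Typed aspects come back as (aspect, system_metadata) tuples keyed by URN and
    # aspect name; system_metadata is only populated when explicitly requested.
    entities = graph.get_entities(
        entity_name="dataset",
        urns=[dataset_urn],
        aspects=["status", "datasetProperties"],
        with_system_metadata=True,
    )
    for urn, aspect_map in entities.items():
        for aspect_name, (aspect, system_metadata) in aspect_map.items():
            print(urn, aspect_name, type(aspect).__name__)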
datahub/ingestion/run/pipeline.py
CHANGED
@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
                 f"Transformer type:{transformer_type},{transformer_class} configured"
             )
 
-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
         if self.dry_run:
             # In dry run mode, we don't want to report anything.
datahub/ingestion/source/aws/sagemaker_processors/models.py
CHANGED
@@ -323,7 +323,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -331,7 +331,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )
@@ -368,7 +368,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -376,7 +376,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )
datahub/ingestion/source/bigquery_v2/lineage.py
CHANGED
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )
 
-        for lineage_key in lineage
+        for lineage_key in lineage:
             # For views, we do not use the upstreams obtained by parsing audit logs
             # as they may contain indirectly referenced tables.
             if (
datahub/ingestion/source/dynamodb/dynamodb.py
CHANGED
@@ -362,7 +362,7 @@ class DynamoDBSource(StatefulIngestionSourceBase):
         if self.config.include_table_item is None:
             return
         dataset_name = f"{region}.{table_name}"
-        if dataset_name not in self.config.include_table_item
+        if dataset_name not in self.config.include_table_item:
             return
         primary_key_list = self.config.include_table_item.get(dataset_name)
         assert isinstance(primary_key_list, List)
datahub/ingestion/source/fivetran/fivetran.py
CHANGED
@@ -215,6 +215,7 @@ class FivetranSource(StatefulIngestionSourceBase):
         datajob = DataJob(
             id=connector.connector_id,
             flow_urn=dataflow_urn,
+            platform_instance=self.config.platform_instance,
             name=connector.connector_name,
             owners={owner_email} if owner_email else set(),
         )
datahub/ingestion/source/fivetran/fivetran_log_api.py
CHANGED
@@ -190,7 +190,7 @@ class FivetranLogAPI:
         jobs: List[Job] = []
         if connector_sync_log is None:
             return jobs
-        for sync_id in connector_sync_log
+        for sync_id in connector_sync_log:
             if len(connector_sync_log[sync_id]) != 2:
                 # If both sync-start and sync-end event log not present for this sync that means sync is still in progress
                 continue
datahub/ingestion/source/hex/constants.py
CHANGED
@@ -1,3 +1,8 @@
+from datahub.metadata.urns import DataPlatformUrn
+
 HEX_PLATFORM_NAME = "hex"
+HEX_PLATFORM_URN = DataPlatformUrn(platform_name=HEX_PLATFORM_NAME)
 HEX_API_BASE_URL_DEFAULT = "https://app.hex.tech/api/v1"
 HEX_API_PAGE_SIZE_DEFAULT = 100
+
+DATAHUB_API_PAGE_SIZE_DEFAULT = 100
datahub/ingestion/source/hex/hex.py
CHANGED
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
 
-from pydantic import Field, SecretStr
+from pydantic import Field, SecretStr, root_validator
 from typing_extensions import assert_never
 
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -21,12 +24,17 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.hex.api import HexApi, HexApiReport
 from datahub.ingestion.source.hex.constants import (
+    DATAHUB_API_PAGE_SIZE_DEFAULT,
     HEX_API_BASE_URL_DEFAULT,
     HEX_API_PAGE_SIZE_DEFAULT,
     HEX_PLATFORM_NAME,
 )
 from datahub.ingestion.source.hex.mapper import Mapper
 from datahub.ingestion.source.hex.model import Component, Project
+from datahub.ingestion.source.hex.query_fetcher import (
+    HexQueryFetcher,
+    HexQueryFetcherReport,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -34,9 +42,10 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
-    StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.sdk.main_client import DataHubClient
 
 
 class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
         default=True,
         description="Set ownership identity from owner/creator email",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
+    )
+    lineage_start_time: Optional[datetime] = Field(
+        default=None,
+        description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
+    )
+    lineage_end_time: Optional[datetime] = Field(
+        default=None,
+        description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
+    )
+    datahub_page_size: int = Field(
+        default=DATAHUB_API_PAGE_SIZE_DEFAULT,
+        description="Number of items to fetch per DataHub API call.",
+    )
+
+    @root_validator(pre=True)
+    def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
+        # lineage_end_time default = now
+        if "lineage_end_time" not in data or data["lineage_end_time"] is None:
+            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
+        # if string is given, parse it
+        if isinstance(data["lineage_end_time"], str):
+            data["lineage_end_time"] = parse_user_datetime(data["lineage_end_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_end_time"].tzinfo is None:
+            data["lineage_end_time"] = data["lineage_end_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_end_time
+        assert (
+            data["lineage_end_time"]
+            and isinstance(data["lineage_end_time"], datetime)
+            and data["lineage_end_time"].tzinfo is not None
+            and data["lineage_end_time"].tzinfo == timezone.utc
+        )
+
+        # lineage_start_time default = lineage_end_time - 1 day
+        if "lineage_start_time" not in data or data["lineage_start_time"] is None:
+            data["lineage_start_time"] = data["lineage_end_time"] - timedelta(days=1)
+        # if string is given, parse it
+        if isinstance(data["lineage_start_time"], str):
+            data["lineage_start_time"] = parse_user_datetime(data["lineage_start_time"])
+        # if no timezone is given, assume UTC
+        if data["lineage_start_time"].tzinfo is None:
+            data["lineage_start_time"] = data["lineage_start_time"].replace(
+                tzinfo=timezone.utc
+            )
+        # at this point, we ensure there is a non null datetime with UTC timezone for lineage_start_time
+        assert (
+            data["lineage_start_time"]
+            and isinstance(data["lineage_start_time"], datetime)
+            and data["lineage_start_time"].tzinfo is not None
+            and data["lineage_start_time"].tzinfo == timezone.utc
+        )
+
+        return data
 
 
-
+@dataclass
+class HexReport(
+    StaleEntityRemovalSourceReport,
+    HexApiReport,
+    IngestionStageReport,
+    HexQueryFetcherReport,
+):
     pass
@@ -110,7 +183,7 @@ class HexSource(StatefulIngestionSourceBase):
     def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
-        self.report = HexReport()
+        self.report: HexReport = HexReport()
         self.platform = HEX_PLATFORM_NAME
         self.hex_api = HexApi(
             report=self.report,
@@ -129,6 +202,28 @@ class HexSource(StatefulIngestionSourceBase):
             categories_as_tags=self.source_config.categories_as_tags,
             set_ownership_from_email=self.source_config.set_ownership_from_email,
         )
+        self.project_registry: Dict[str, Project] = {}
+        self.component_registry: Dict[str, Component] = {}
+
+        self.datahub_client: Optional[DataHubClient] = None
+        self.query_fetcher: Optional[HexQueryFetcher] = None
+        if self.source_config.include_lineage:
+            graph = ctx.require_graph("Lineage")
+            assert self.source_config.lineage_start_time and isinstance(
+                self.source_config.lineage_start_time, datetime
+            )
+            assert self.source_config.lineage_end_time and isinstance(
+                self.source_config.lineage_end_time, datetime
+            )
+            self.datahub_client = DataHubClient(graph=graph)
+            self.query_fetcher = HexQueryFetcher(
+                datahub_client=self.datahub_client,
+                workspace_name=self.source_config.workspace_name,
+                start_datetime=self.source_config.lineage_start_time,
+                end_datetime=self.source_config.lineage_end_time,
+                report=self.report,
+                page_size=self.source_config.datahub_page_size,
+            )
 
     @classmethod
     def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "HexSource":
@@ -143,25 +238,58 @@ class HexSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_report(self) ->
+    def get_report(self) -> HexReport:
         return self.report
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-
-
-
-
-                if self.source_config.project_title_pattern.allowed(
-                    project_or_component.title
-                ):
-                    yield from self.mapper.map_project(project=project_or_component)
-            elif isinstance(project_or_component, Component):
-                if (
-                    self.source_config.include_components
-                    and self.source_config.component_title_pattern.allowed(
+        with self.report.new_stage("Fetch Hex assets from Hex API"):
+            for project_or_component in self.hex_api.fetch_projects():
+                if isinstance(project_or_component, Project):
+                    if self.source_config.project_title_pattern.allowed(
                         project_or_component.title
-                    )
-
-
-
-
+                    ):
+                        self.project_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                elif isinstance(project_or_component, Component):
+                    if (
+                        self.source_config.include_components
+                        and self.source_config.component_title_pattern.allowed(
+                            project_or_component.title
+                        )
+                    ):
+                        self.component_registry[project_or_component.id] = (
+                            project_or_component
+                        )
+                else:
+                    assert_never(project_or_component)
+
+        if self.source_config.include_lineage:
+            assert self.datahub_client and self.query_fetcher
+
+            with self.report.new_stage(
+                "Fetch Hex lineage from existing Queries in DataHub"
+            ):
+                for query_metadata in self.query_fetcher.fetch():
+                    project = self.project_registry.get(query_metadata.hex_project_id)
+                    if project:
+                        project.upstream_datasets.extend(
+                            query_metadata.dataset_subjects
+                        )
+                        project.upstream_schema_fields.extend(
+                            query_metadata.schema_field_subjects
+                        )
+                    else:
+                        self.report.report_warning(
+                            title="Missing project for lineage",
+                            message="Lineage missed because missed project, likely due to filter patterns or deleted project.",
+                            context=str(query_metadata),
+                        )
+
+        with self.report.new_stage("Emit"):
+            yield from self.mapper.map_workspace()
+
+            for project in self.project_registry.values():
+                yield from self.mapper.map_project(project=project)
+            for component in self.component_registry.values():
+                yield from self.mapper.map_component(component=component)