acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (51)
  1. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
  2. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataproduct/dataproduct.py +26 -0
  5. datahub/cli/config_utils.py +18 -10
  6. datahub/cli/docker_check.py +2 -1
  7. datahub/cli/docker_cli.py +4 -2
  8. datahub/cli/graphql_cli.py +1422 -0
  9. datahub/cli/quickstart_versioning.py +2 -2
  10. datahub/cli/specific/dataproduct_cli.py +2 -4
  11. datahub/cli/specific/user_cli.py +172 -1
  12. datahub/configuration/env_vars.py +331 -0
  13. datahub/configuration/kafka.py +6 -4
  14. datahub/emitter/mce_builder.py +2 -4
  15. datahub/emitter/rest_emitter.py +15 -15
  16. datahub/entrypoints.py +2 -0
  17. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  18. datahub/ingestion/api/source.py +5 -0
  19. datahub/ingestion/graph/client.py +197 -0
  20. datahub/ingestion/graph/config.py +2 -2
  21. datahub/ingestion/sink/datahub_rest.py +6 -5
  22. datahub/ingestion/source/aws/aws_common.py +20 -13
  23. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
  24. datahub/ingestion/source/grafana/models.py +5 -0
  25. datahub/ingestion/source/iceberg/iceberg.py +39 -19
  26. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
  27. datahub/ingestion/source/mode.py +13 -0
  28. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  29. datahub/ingestion/source/schema_inference/object.py +22 -6
  30. datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
  31. datahub/ingestion/source/sql/mssql/source.py +7 -1
  32. datahub/ingestion/source/sql/teradata.py +80 -65
  33. datahub/ingestion/source/unity/config.py +31 -0
  34. datahub/ingestion/source/unity/proxy.py +73 -0
  35. datahub/ingestion/source/unity/source.py +27 -70
  36. datahub/ingestion/source/unity/usage.py +46 -4
  37. datahub/metadata/_internal_schema_classes.py +544 -544
  38. datahub/metadata/_urns/urn_defs.py +1728 -1728
  39. datahub/metadata/schema.avsc +15157 -15157
  40. datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
  41. datahub/sql_parsing/sqlglot_lineage.py +7 -0
  42. datahub/telemetry/telemetry.py +8 -3
  43. datahub/utilities/file_backed_collections.py +2 -2
  44. datahub/utilities/is_pytest.py +3 -2
  45. datahub/utilities/logging_manager.py +22 -6
  46. datahub/utilities/sample_data.py +5 -4
  47. datahub/emitter/sql_parsing_builder.py +0 -306
  48. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
  49. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
  50. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
  51. {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0

datahub/sql_parsing/sql_parsing_aggregator.py

@@ -4,7 +4,6 @@ import enum
 import functools
 import json
 import logging
-import os
 import pathlib
 import tempfile
 import uuid
@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast
 
 import datahub.emitter.mce_builder as builder
 import datahub.metadata.schema_classes as models
+from datahub.configuration.env_vars import get_sql_agg_query_log
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import compute_upstream_fields
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -84,7 +83,7 @@ class QueryLogSetting(enum.Enum):
 _DEFAULT_USER_URN = CorpUserUrn("_ingestion")
 _MISSING_SESSION_ID = "__MISSING_SESSION_ID"
 _DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
-    os.getenv("DATAHUB_SQL_AGG_QUERY_LOG") or QueryLogSetting.DISABLED.name
+    get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
 ]
 MAX_UPSTREAM_TABLES_COUNT = 300
 MAX_FINEGRAINEDLINEAGE_COUNT = 2000
@@ -868,7 +867,7 @@ class SqlParsingAggregator(Closeable):
                 downstream=parsed.out_tables[0] if parsed.out_tables else None,
                 column_lineage=parsed.column_lineage,
                 # TODO: We need a full list of columns referenced, not just the out tables.
-                column_usage=compute_upstream_fields(parsed),
+                column_usage=self._compute_upstream_fields(parsed),
                 inferred_schema=infer_output_schema(parsed),
                 confidence_score=parsed.debug_info.confidence,
                 extra_info=observed.extra_info,
@@ -1157,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
                     actor=None,
                     upstreams=parsed.in_tables,
                     column_lineage=parsed.column_lineage or [],
-                    column_usage=compute_upstream_fields(parsed),
+                    column_usage=self._compute_upstream_fields(parsed),
                     confidence_score=parsed.debug_info.confidence,
                 )
             )
@@ -1741,6 +1740,16 @@ class SqlParsingAggregator(Closeable):
 
         return resolved_query
 
+    @staticmethod
+    def _compute_upstream_fields(
+        result: SqlParsingResult,
+    ) -> Dict[UrnStr, Set[UrnStr]]:
+        upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
+        for cl in result.column_lineage or []:
+            for upstream in cl.upstreams:
+                upstream_fields[upstream.table].add(upstream.column)
+        return upstream_fields
+
     def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
         if not self._usage_aggregator:
             return
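
Note on the new env-var accessors: this hunk and several below replace direct os.environ / os.getenv lookups with helpers from the new datahub/configuration/env_vars.py module (added in this release but not shown in this diff). Based only on how the call sites use it, an accessor such as get_sql_agg_query_log presumably returns the raw value of DATAHUB_SQL_AGG_QUERY_LOG and leaves the fallback to the caller. A minimal, hypothetical sketch (not the actual module contents):

    import os
    from typing import Optional

    # Hypothetical sketch - the real datahub/configuration/env_vars.py is not
    # shown in this diff. The only behavior assumed here is what the call site
    # implies: return the raw environment value (or None when unset) so the
    # caller can apply its own default.
    def get_sql_agg_query_log() -> Optional[str]:
        return os.environ.get("DATAHUB_SQL_AGG_QUERY_LOG")

The call site above then evaluates QueryLogSetting[get_sql_agg_query_log() or QueryLogSetting.DISABLED.name], preserving the previous os.getenv(...)-or-default behavior.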

datahub/sql_parsing/sqlglot_lineage.py

@@ -691,6 +691,13 @@ def _column_level_lineage(
         select_statement=select_statement,
     )
 
+    # Handle VALUES expressions separately - they have no upstream tables and no column lineage
+    if isinstance(select_statement, sqlglot.exp.Values):
+        return _ColumnLineageWithDebugInfo(
+            column_lineage=[],
+            select_statement=select_statement,
+        )
+
     assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
     try:
         root_scope = sqlglot.optimizer.build_scope(select_statement)
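
For context on the VALUES branch above: a literal VALUES list has no source tables, so there is nothing to trace column lineage from. A minimal illustration with sqlglot (not part of the diff; exact tree shape may vary by sqlglot version and dialect):

    import sqlglot
    from sqlglot import exp

    # In an INSERT ... VALUES statement, the inserted rows parse to an
    # exp.Values node, and that node contains no exp.Table references -
    # hence the early return with empty column lineage.
    stmt = sqlglot.parse_one("INSERT INTO db.tbl (a, b) VALUES (1, 'x'), (2, 'y')")
    values = stmt.find(exp.Values)
    print(type(values))                      # <class 'sqlglot.expressions.Values'>
    print(list(values.find_all(exp.Table)))  # [] - no upstream tables to resolve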

datahub/telemetry/telemetry.py

@@ -16,6 +16,11 @@ from datahub._version import __version__, nice_version_name
 from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ExceptionWithProps
+from datahub.configuration.env_vars import (
+    get_sentry_dsn,
+    get_sentry_environment,
+    get_telemetry_timeout,
+)
 from datahub.metadata.schema_classes import _custom_package_path
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -97,11 +102,11 @@ if any(var in os.environ for var in CI_ENV_VARS):
 if _custom_package_path:
     ENV_ENABLED = False
 
-TIMEOUT = int(os.environ.get("DATAHUB_TELEMETRY_TIMEOUT", "10"))
+TIMEOUT = int(get_telemetry_timeout())
 MIXPANEL_ENDPOINT = "track.datahubproject.io/mp"
 MIXPANEL_TOKEN = "5ee83d940754d63cacbf7d34daa6f44a"
-SENTRY_DSN: Optional[str] = os.environ.get("SENTRY_DSN", None)
-SENTRY_ENVIRONMENT: str = os.environ.get("SENTRY_ENVIRONMENT", "dev")
+SENTRY_DSN: Optional[str] = get_sentry_dsn()
+SENTRY_ENVIRONMENT: str = get_sentry_environment()
 
 
 def _default_global_properties() -> Dict[str, Any]:

datahub/utilities/file_backed_collections.py

@@ -1,7 +1,6 @@
 import collections
 import gzip
 import logging
-import os
 import pathlib
 import pickle
 import shutil
@@ -28,6 +27,7 @@ from typing import (
     Union,
 )
 
+from datahub.configuration.env_vars import get_override_sqlite_version_req
 from datahub.ingestion.api.closeable import Closeable
 from datahub.utilities.sentinels import Unset, unset
 
@@ -36,7 +36,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 
 def _get_sqlite_version_override() -> bool:
     """Check if SQLite version requirement should be overridden at runtime."""
-    override_str = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
+    override_str = get_override_sqlite_version_req()
     return bool(override_str and override_str.lower() != "false")
 
 

datahub/utilities/is_pytest.py

@@ -1,6 +1,7 @@
-import os
 import sys
 
+from datahub.configuration.env_vars import get_test_mode
+
 
 def is_pytest_running() -> bool:
-    return "pytest" in sys.modules and os.environ.get("DATAHUB_TEST_MODE") == "1"
+    return "pytest" in sys.modules and get_test_mode() == "1"

datahub/utilities/logging_manager.py

@@ -15,13 +15,13 @@ import collections
 import contextlib
 import itertools
 import logging
-import os
 import pathlib
 import sys
 from typing import Deque, Iterator, Optional
 
 import click
 
+from datahub.configuration.env_vars import get_no_color, get_suppress_logging_manager
 from datahub.utilities.tee_io import TeeIO
 
 BASE_LOGGING_FORMAT = (
@@ -38,7 +38,7 @@ IN_MEMORY_LOG_BUFFER_SIZE = 2000  # lines
 IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH = 2000  # characters
 
 
-NO_COLOR = os.environ.get("NO_COLOR", False)
+NO_COLOR = get_no_color()
 
 
 def extract_name_from_filename(filename: str, fallback_name: str) -> str:
@@ -179,6 +179,18 @@ class _LogBuffer:
         return text
 
 
+class _ResilientStreamHandler(logging.StreamHandler):
+    """StreamHandler that gracefully handles closed streams."""
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            super().emit(record)
+        except (ValueError, OSError):
+            # Stream was closed (e.g., during pytest teardown)
+            # Silently ignore to prevent test failures
+            pass
+
+
 class _BufferLogHandler(logging.Handler):
     def __init__(self, storage: _LogBuffer) -> None:
         super().__init__()
@@ -201,7 +213,11 @@ class _BufferLogHandler(logging.Handler):
 def _remove_all_handlers(logger: logging.Logger) -> None:
     for handler in logger.handlers[:]:
         logger.removeHandler(handler)
-        handler.close()
+        try:
+            handler.close()
+        except (ValueError, OSError):
+            # Handler stream may already be closed (e.g., during pytest teardown)
+            pass
 
 
 _log_buffer = _LogBuffer(maxlen=IN_MEMORY_LOG_BUFFER_SIZE)
@@ -219,14 +235,14 @@ _default_formatter = logging.Formatter(BASE_LOGGING_FORMAT)
 def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
     _log_buffer.clear()
 
-    if os.environ.get("DATAHUB_SUPPRESS_LOGGING_MANAGER") == "1":
+    if get_suppress_logging_manager() == "1":
         # If we're running in pytest, we don't want to configure logging.
         yield
         return
 
     with contextlib.ExitStack() as stack:
         # Create stdout handler.
-        stream_handler = logging.StreamHandler()
+        stream_handler = _ResilientStreamHandler()
         stream_handler.addFilter(_DatahubLogFilter(debug=debug))
         stream_handler.setFormatter(_stream_formatter)
 
@@ -237,7 +253,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[N
             tee = TeeIO(sys.stdout, file)
             stack.enter_context(contextlib.redirect_stdout(tee))  # type: ignore
 
-            file_handler = logging.StreamHandler(file)
+            file_handler = _ResilientStreamHandler(file)
             file_handler.addFilter(_DatahubLogFilter(debug=True))
             file_handler.setFormatter(_default_formatter)
         else:
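
For context on _ResilientStreamHandler and the guarded handler.close() above: once a Python stream has been closed (as can happen to log file handles or captured stdout during pytest teardown), further writes or flushes raise ValueError, and OS-level stream failures surface as OSError. A minimal, self-contained illustration of the failure mode being absorbed (not part of the diff):

    import io

    stream = io.StringIO()
    stream.close()

    try:
        stream.write("late log line\n")
    except ValueError as exc:
        # Writing to (or flushing/closing) an already-closed stream raises
        # ValueError - exactly what the except (ValueError, OSError) blocks
        # in logging_manager.py now swallow.
        print(exc)  # I/O operation on closed file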

datahub/utilities/sample_data.py

@@ -1,12 +1,13 @@
-import os
 import pathlib
 import tempfile
 
 import requests
 
-DOCKER_COMPOSE_BASE = os.getenv(
-    "DOCKER_COMPOSE_BASE",
-    "https://raw.githubusercontent.com/datahub-project/datahub/master",
+from datahub.configuration.env_vars import get_docker_compose_base
+
+DOCKER_COMPOSE_BASE = (
+    get_docker_compose_base()
+    or "https://raw.githubusercontent.com/datahub-project/datahub/master"
 )
 BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
 BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"

datahub/emitter/sql_parsing_builder.py (removed)

@@ -1,306 +0,0 @@
-import logging
-import time
-from collections import defaultdict
-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import Collection, Dict, Iterable, List, Optional, Set
-
-from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator
-from datahub.metadata.schema_classes import (
-    AuditStampClass,
-    DatasetLineageTypeClass,
-    FineGrainedLineageClass,
-    FineGrainedLineageDownstreamTypeClass,
-    FineGrainedLineageUpstreamTypeClass,
-    OperationClass,
-    OperationTypeClass,
-    UpstreamClass,
-    UpstreamLineageClass,
-)
-from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
-from datahub.utilities.file_backed_collections import FileBackedDict
-
-logger = logging.getLogger(__name__)
-
-# TODO: Use this over other sources' equivalent code, if possible
-
-DatasetUrn = str
-FieldUrn = str
-UserUrn = str
-
-
-@dataclass
-class LineageEdge:
-    """Stores information about a single lineage edge, from an upstream table to a downstream table."""
-
-    downstream_urn: DatasetUrn
-    upstream_urn: DatasetUrn
-    audit_stamp: Optional[datetime]
-    actor: Optional[UserUrn]
-    type: str = DatasetLineageTypeClass.TRANSFORMED
-
-    # Maps downstream_col -> {upstream_col}
-    column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set))
-
-    def gen_upstream_aspect(self) -> UpstreamClass:
-        return UpstreamClass(
-            auditStamp=(
-                AuditStampClass(
-                    time=int(self.audit_stamp.timestamp() * 1000),
-                    actor=self.actor or "",
-                )
-                if self.audit_stamp
-                else None
-            ),
-            dataset=self.upstream_urn,
-            type=self.type,
-        )
-
-    def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]:
-        for downstream_col, upstream_cols in self.column_map.items():
-            yield FineGrainedLineageClass(
-                upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
-                # Sort to avoid creating multiple aspects in backend with same lineage but different order
-                upstreams=sorted(
-                    make_schema_field_urn(self.upstream_urn, col)
-                    for col in upstream_cols
-                ),
-                downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
-                downstreams=[
-                    make_schema_field_urn(self.downstream_urn, downstream_col)
-                ],
-            )
-
-
-@dataclass
-class SqlParsingBuilder:
-    # Open question: does it make sense to iterate over out_tables? When will we have multiple?
-
-    generate_lineage: bool = True
-    generate_usage_statistics: bool = True
-    generate_operations: bool = True
-    usage_config: Optional[BaseUsageConfig] = None
-
-    # Maps downstream urn -> upstream urn -> LineageEdge
-    # Builds up a single LineageEdge for each upstream -> downstream pair
-    _lineage_map: FileBackedDict[Dict[DatasetUrn, LineageEdge]] = field(
-        default_factory=FileBackedDict, init=False
-    )
-
-    # TODO: Replace with FileBackedDict approach like in BigQuery usage
-    _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False)
-
-    def __post_init__(self) -> None:
-        if self.usage_config:
-            self._usage_aggregator = UsageAggregator(self.usage_config)
-        elif self.generate_usage_statistics:
-            logger.info("No usage config provided, not generating usage statistics")
-            self.generate_usage_statistics = False
-
-    def process_sql_parsing_result(
-        self,
-        result: SqlParsingResult,
-        *,
-        query: str,
-        query_timestamp: Optional[datetime] = None,
-        is_view_ddl: bool = False,
-        user: Optional[UserUrn] = None,
-        custom_operation_type: Optional[str] = None,
-        include_urns: Optional[Set[DatasetUrn]] = None,
-        include_column_lineage: bool = True,
-    ) -> Iterable[MetadataWorkUnit]:
-        """Process a single query and yield any generated workunits.
-
-        Args:
-            result: The result of parsing the query, or a mock result if parsing failed.
-            query: The SQL query to parse and process.
-            query_timestamp: When the query was run.
-            is_view_ddl: Whether the query is a DDL statement that creates a view.
-            user: The urn of the user who ran the query.
-            custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed.
-            include_urns: If provided, only generate workunits for these urns.
-        """
-        downstreams_to_ingest = result.out_tables
-        upstreams_to_ingest = result.in_tables
-        if include_urns:
-            logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}")
-            downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns)
-            upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns)
-
-        if self.generate_lineage:
-            for downstream_urn in downstreams_to_ingest:
-                # Set explicitly so that FileBackedDict registers any mutations
-                self._lineage_map[downstream_urn] = _merge_lineage_data(
-                    downstream_urn=downstream_urn,
-                    upstream_urns=result.in_tables,
-                    column_lineage=(
-                        result.column_lineage if include_column_lineage else None
-                    ),
-                    upstream_edges=self._lineage_map.get(downstream_urn, {}),
-                    query_timestamp=query_timestamp,
-                    is_view_ddl=is_view_ddl,
-                    user=user,
-                )
-
-        if self.generate_usage_statistics and query_timestamp is not None:
-            upstream_fields = compute_upstream_fields(result)
-            for upstream_urn in upstreams_to_ingest:
-                self._usage_aggregator.aggregate_event(
-                    resource=upstream_urn,
-                    start_time=query_timestamp,
-                    query=query,
-                    user=user,
-                    fields=sorted(upstream_fields.get(upstream_urn, [])),
-                )
-
-        if self.generate_operations and query_timestamp is not None:
-            for downstream_urn in downstreams_to_ingest:
-                yield from _gen_operation_workunit(
-                    result,
-                    downstream_urn=downstream_urn,
-                    query_timestamp=query_timestamp,
-                    user=user,
-                    custom_operation_type=custom_operation_type,
-                )
-
-    def add_lineage(
-        self,
-        downstream_urn: DatasetUrn,
-        upstream_urns: Collection[DatasetUrn],
-        timestamp: Optional[datetime] = None,
-        is_view_ddl: bool = False,
-        user: Optional[UserUrn] = None,
-    ) -> None:
-        """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails."""
-        # Set explicitly so that FileBackedDict registers any mutations
-        self._lineage_map[downstream_urn] = _merge_lineage_data(
-            downstream_urn=downstream_urn,
-            upstream_urns=upstream_urns,
-            column_lineage=None,
-            upstream_edges=self._lineage_map.get(downstream_urn, {}),
-            query_timestamp=timestamp,
-            is_view_ddl=is_view_ddl,
-            user=user,
-        )
-
-    def gen_workunits(self) -> Iterable[MetadataWorkUnit]:
-        if self.generate_lineage:
-            for mcp in self._gen_lineage_mcps():
-                yield mcp.as_workunit()
-        if self.generate_usage_statistics:
-            yield from self._gen_usage_statistics_workunits()
-
-    def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
-        for downstream_urn in self._lineage_map:
-            upstreams: List[UpstreamClass] = []
-            fine_upstreams: List[FineGrainedLineageClass] = []
-            for edge in self._lineage_map[downstream_urn].values():
-                upstreams.append(edge.gen_upstream_aspect())
-                fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects())
-
-            if not upstreams:
-                continue
-
-            upstream_lineage = UpstreamLineageClass(
-                upstreams=sorted(upstreams, key=lambda x: x.dataset),
-                fineGrainedLineages=sorted(
-                    fine_upstreams,
-                    key=lambda x: (x.downstreams, x.upstreams),
-                )
-                or None,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=downstream_urn, aspect=upstream_lineage
-            )
-
-    def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]:
-        yield from self._usage_aggregator.generate_workunits(
-            resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn
-        )
-
-
-def _merge_lineage_data(
-    downstream_urn: DatasetUrn,
-    *,
-    upstream_urns: Collection[DatasetUrn],
-    column_lineage: Optional[List[ColumnLineageInfo]],
-    upstream_edges: Dict[DatasetUrn, LineageEdge],
-    query_timestamp: Optional[datetime],
-    is_view_ddl: bool,
-    user: Optional[UserUrn],
-) -> Dict[str, LineageEdge]:
-    for upstream_urn in upstream_urns:
-        edge = upstream_edges.setdefault(
-            upstream_urn,
-            LineageEdge(
-                downstream_urn=downstream_urn,
-                upstream_urn=upstream_urn,
-                audit_stamp=query_timestamp,
-                actor=user,
-                type=(
-                    DatasetLineageTypeClass.VIEW
-                    if is_view_ddl
-                    else DatasetLineageTypeClass.TRANSFORMED
-                ),
-            ),
-        )
-        if query_timestamp and (  # Use the most recent query
-            edge.audit_stamp is None or query_timestamp > edge.audit_stamp
-        ):
-            edge.audit_stamp = query_timestamp
-            if user:
-                edge.actor = user
-
-    # Note: Inefficient as we loop through all column_lineage entries for each downstream table
-    for cl in column_lineage or []:
-        if cl.downstream.table == downstream_urn:
-            for upstream_column_info in cl.upstreams:
-                if upstream_column_info.table not in upstream_urns:
-                    continue
-                column_map = upstream_edges[upstream_column_info.table].column_map
-                column_map[cl.downstream.column].add(upstream_column_info.column)
-
-    return upstream_edges
-
-
-def compute_upstream_fields(
-    result: SqlParsingResult,
-) -> Dict[DatasetUrn, Set[DatasetUrn]]:
-    upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set)
-    for cl in result.column_lineage or []:
-        for upstream in cl.upstreams:
-            upstream_fields[upstream.table].add(upstream.column)
-    return upstream_fields
-
-
-def _gen_operation_workunit(
-    result: SqlParsingResult,
-    *,
-    downstream_urn: DatasetUrn,
-    query_timestamp: datetime,
-    user: Optional[UserUrn],
-    custom_operation_type: Optional[str],
-) -> Iterable[MetadataWorkUnit]:
-    operation_type = result.query_type.to_operation_type()
-    # Filter out SELECT and other undesired statements
-    if operation_type is None:
-        return
-    elif operation_type == OperationTypeClass.UNKNOWN:
-        if custom_operation_type is None:
-            return
-        else:
-            operation_type = OperationTypeClass.CUSTOM
-
-    aspect = OperationClass(
-        timestampMillis=int(time.time() * 1000),
-        operationType=operation_type,
-        lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000),
-        actor=user,
-        customOperationType=custom_operation_type,
-    )
-    yield MetadataChangeProposalWrapper(
-        entityUrn=downstream_urn, aspect=aspect
-    ).as_workunit()