acryl-datahub 0.15.0.4rc3__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (95)
  1. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2507 -2470
  2. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +95 -86
  3. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/__init__.py +1 -25
  5. datahub/_version.py +13 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  7. datahub/cli/check_cli.py +1 -1
  8. datahub/cli/cli_utils.py +3 -3
  9. datahub/cli/container_cli.py +1 -64
  10. datahub/cli/iceberg_cli.py +707 -0
  11. datahub/cli/ingest_cli.py +2 -2
  12. datahub/emitter/composite_emitter.py +36 -0
  13. datahub/emitter/rest_emitter.py +1 -1
  14. datahub/entrypoints.py +26 -5
  15. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  16. datahub/ingestion/api/registry.py +1 -1
  17. datahub/ingestion/glossary/classification_mixin.py +6 -0
  18. datahub/ingestion/glossary/classifier.py +3 -2
  19. datahub/ingestion/graph/client.py +2 -1
  20. datahub/ingestion/graph/entity_versioning.py +201 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  22. datahub/ingestion/run/connection.py +1 -1
  23. datahub/ingestion/run/pipeline.py +3 -3
  24. datahub/ingestion/source/abs/report.py +2 -2
  25. datahub/ingestion/source/apply/__init__.py +0 -0
  26. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  27. datahub/ingestion/source/aws/glue.py +5 -2
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  29. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  30. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  31. datahub/ingestion/source/delta_lake/report.py +2 -2
  32. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  33. datahub/ingestion/source/elastic_search.py +2 -1
  34. datahub/ingestion/source/ge_profiling_config.py +11 -7
  35. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  36. datahub/ingestion/source/identity/azure_ad.py +6 -14
  37. datahub/ingestion/source/identity/okta.py +2 -1
  38. datahub/ingestion/source/kafka/kafka.py +2 -1
  39. datahub/ingestion/source/kafka_connect/common.py +2 -1
  40. datahub/ingestion/source/ldap.py +2 -1
  41. datahub/ingestion/source/looker/looker_config.py +3 -1
  42. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  43. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  44. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  45. datahub/ingestion/source/looker/lookml_config.py +29 -8
  46. datahub/ingestion/source/looker/lookml_source.py +110 -22
  47. datahub/ingestion/source/mode.py +2 -4
  48. datahub/ingestion/source/mongodb.py +2 -1
  49. datahub/ingestion/source/nifi.py +2 -1
  50. datahub/ingestion/source/powerbi/config.py +2 -2
  51. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  52. datahub/ingestion/source/redash.py +5 -5
  53. datahub/ingestion/source/salesforce.py +4 -1
  54. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  55. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  56. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  57. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  58. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  59. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  60. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  61. datahub/ingestion/source/sql/clickhouse.py +5 -43
  62. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  63. datahub/ingestion/source/sql/mssql/source.py +17 -0
  64. datahub/ingestion/source/sql/sql_config.py +0 -10
  65. datahub/ingestion/source/tableau/tableau.py +16 -13
  66. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  67. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  68. datahub/ingestion/source/unity/proxy.py +2 -2
  69. datahub/ingestion/source/unity/report.py +1 -0
  70. datahub/ingestion/source_config/operation_config.py +9 -0
  71. datahub/ingestion/source_report/pulsar.py +5 -4
  72. datahub/metadata/_schema_classes.py +304 -6
  73. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  74. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  75. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  76. datahub/metadata/schema.avsc +211 -12
  77. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  78. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  79. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  80. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  81. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  82. datahub/metadata/schemas/Deprecation.avsc +12 -0
  83. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  84. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  85. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  86. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  87. datahub/metadata/schemas/PostInfo.avsc +28 -2
  88. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  89. datahub/specific/dashboard.py +43 -1
  90. datahub/telemetry/telemetry.py +4 -4
  91. datahub/testing/check_imports.py +28 -0
  92. datahub/upgrade/upgrade.py +17 -9
  93. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/LICENSE +0 -0
  94. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  95. {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/PostInfo.avsc
@@ -7,15 +7,18 @@
   "namespace": "com.linkedin.pegasus2avro.post",
   "fields": [
     {
+      "Searchable": {},
       "type": {
         "type": "enum",
         "symbolDocs": {
+          "ENTITY_ANNOUNCEMENT": "The Post is an Entity level announcement.",
           "HOME_PAGE_ANNOUNCEMENT": "The Post is an Home Page announcement."
         },
         "name": "PostType",
         "namespace": "com.linkedin.pegasus2avro.post",
         "symbols": [
-          "HOME_PAGE_ANNOUNCEMENT"
+          "HOME_PAGE_ANNOUNCEMENT",
+          "ENTITY_ANNOUNCEMENT"
         ],
         "doc": "Enum defining types of Posts."
       },
@@ -203,13 +206,25 @@
       "dataset",
       "schemaField",
       "chart",
+      "container",
       "dashboard",
       "dataFlow",
       "dataJob",
+      "dataProduct",
+      "glossaryTerm",
+      "glossaryNode",
+      "mlModel",
+      "mlFeature",
+      "notebook",
+      "mlFeatureTable",
+      "mlPrimaryKey",
+      "mlModelGroup",
+      "domain",
       "dataProduct"
     ],
     "name": "PostTarget"
   },
+  "Searchable": {},
   "java": {
     "class": "com.linkedin.pegasus2avro.common.urn.Urn"
   },
@@ -219,15 +234,26 @@
     ],
     "name": "target",
     "default": null,
-    "doc": "Optional URN that the post is associated with.",
+    "doc": "Optional Entity URN that the post is associated with.",
     "Urn": "Urn",
     "entityTypes": [
       "dataset",
       "schemaField",
       "chart",
+      "container",
       "dashboard",
       "dataFlow",
       "dataJob",
+      "dataProduct",
+      "glossaryTerm",
+      "glossaryNode",
+      "mlModel",
+      "mlFeature",
+      "notebook",
+      "mlFeatureTable",
+      "mlPrimaryKey",
+      "mlModelGroup",
+      "domain",
       "dataProduct"
     ]
   }
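
Taken together, the PostInfo.avsc hunks above add the ENTITY_ANNOUNCEMENT post type and let a post target a much wider set of entity URNs (containers, domains, glossary nodes, ML entities, and so on). A minimal sketch of emitting such a post with the generated schema classes follows; the exact constructor fields of PostInfoClass/PostContentsClass, the post id, and the GMS endpoint are assumptions, not taken from this diff.

import time

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    PostContentsClass,
    PostContentTypeClass,
    PostInfoClass,
    PostTypeClass,
)

# Hypothetical target: any of the newly allowed entity types would work here.
target_urn = make_dataset_urn(platform="snowflake", name="db.schema.table", env="PROD")

now_ms = int(time.time() * 1000)
post_info = PostInfoClass(
    type=PostTypeClass.ENTITY_ANNOUNCEMENT,  # new enum symbol in 0.15.0.5
    content=PostContentsClass(
        title="Scheduled maintenance",
        type=PostContentTypeClass.TEXT,
        description="This table will be unavailable on Saturday.",
    ),
    created=now_ms,
    lastModified=now_ms,
    target=target_urn,  # new optional field surfaced by the entityTypes expansion
)

# Post entities use an opaque id in their key; the id below is illustrative only.
mcp = MetadataChangeProposalWrapper(entityUrn="urn:li:post:my-announcement-1", aspect=post_info)
DatahubRestEmitter(gms_server="http://localhost:8080").emit(mcp)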
datahub/metadata/schemas/SchemaFieldKey.avsc
@@ -12,7 +12,8 @@
       "status",
       "schemaFieldAliases",
       "documentation",
-      "testResults"
+      "testResults",
+      "deprecation"
     ]
   },
   "name": "SchemaFieldKey",
datahub/specific/dashboard.py
@@ -161,7 +161,7 @@ class DashboardPatchBuilder(
             lastModified=self._mint_auditstamp(),
         )

-        self._ensure_urn_type("dataset", [chart_edge], "add_chart_edge")
+        self._ensure_urn_type("chart", [chart_edge], "add_chart_edge")
         self._add_patch(
             DashboardInfo.ASPECT_NAME,
             "add",
@@ -271,6 +271,48 @@ class DashboardPatchBuilder(

         return self

+    def add_dashboard(
+        self, dashboard: Union[Edge, Urn, str]
+    ) -> "DashboardPatchBuilder":
+        """
+        Adds an dashboard to the DashboardPatchBuilder.
+
+        Args:
+            dashboard: The dashboard, which can be an Edge object, Urn object, or a string.
+
+        Returns:
+            The DashboardPatchBuilder instance.
+
+        Raises:
+            ValueError: If the dashboard is not a Dashboard urn.
+
+        Notes:
+            If `dashboard` is an Edge object, it is used directly. If `dashboard` is a Urn object or string,
+            it is converted to an Edge object and added with default audit stamps.
+        """
+        if isinstance(dashboard, Edge):
+            dashboard_urn: str = dashboard.destinationUrn
+            dashboard_edge: Edge = dashboard
+        elif isinstance(dashboard, (Urn, str)):
+            dashboard_urn = str(dashboard)
+            if not dashboard_urn.startswith("urn:li:dashboard:"):
+                raise ValueError(f"Input {dashboard} is not a Dashboard urn")
+
+            dashboard_edge = Edge(
+                destinationUrn=dashboard_urn,
+                created=self._mint_auditstamp(),
+                lastModified=self._mint_auditstamp(),
+            )
+
+        self._ensure_urn_type("dashboard", [dashboard_edge], "add_dashboard")
+        self._add_patch(
+            DashboardInfo.ASPECT_NAME,
+            "add",
+            path=("dashboards", dashboard_urn),
+            value=dashboard_edge,
+        )
+        return self
+
     def set_dashboard_url(
         self, dashboard_url: Optional[str]
     ) -> "DashboardPatchBuilder":
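
A short usage sketch for the new add_dashboard patch method follows, assuming a dashboard-to-dashboard relationship and a locally running GMS at http://localhost:8080; the URNs are illustrative only.

from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.specific.dashboard import DashboardPatchBuilder

parent_urn = "urn:li:dashboard:(looker,dashboards.999)"  # dashboard being patched
child_urn = "urn:li:dashboard:(looker,dashboards.1000)"  # dashboard to attach

patch_builder = DashboardPatchBuilder(parent_urn).add_dashboard(child_urn)

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
for mcp in patch_builder.build():  # yields JSON-patch MCPs against the DashboardInfo aspect
    emitter.emit(mcp)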
datahub/telemetry/telemetry.py
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar
 from mixpanel import Consumer, Mixpanel
 from typing_extensions import ParamSpec

-import datahub as datahub_package
+from datahub._version import __version__, nice_version_name
 from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ExceptionWithProps
@@ -106,7 +106,7 @@ SENTRY_ENVIRONMENT: str = os.environ.get("SENTRY_ENVIRONMENT", "dev")

 def _default_telemetry_properties() -> Dict[str, Any]:
     return {
-        "datahub_version": datahub_package.nice_version_name(),
+        "datahub_version": nice_version_name(),
         "python_version": platform.python_version(),
         "os": platform.system(),
         "arch": platform.machine(),
@@ -132,7 +132,7 @@ class Telemetry:
                 sentry_sdk.init(
                     dsn=SENTRY_DSN,
                     environment=SENTRY_ENVIRONMENT,
-                    release=datahub_package.__version__,
+                    release=__version__,
                 )
             except Exception as e:
                 # We need to print initialization errors to stderr, since logger is not initialized yet
@@ -277,7 +277,7 @@ class Telemetry:
                 "environment",
                 {
                     "environment": SENTRY_ENVIRONMENT,
-                    "datahub_version": datahub_package.nice_version_name(),
+                    "datahub_version": nice_version_name(),
                     "os": platform.system(),
                     "python_version": platform.python_version(),
                 },
datahub/testing/check_imports.py
@@ -1,4 +1,5 @@
 import pathlib
+import re
 from typing import List


@@ -32,3 +33,30 @@ def ensure_no_indirect_model_imports(dirs: List[pathlib.Path]) -> None:
                     f"Disallowed import found in {file}: `{line.rstrip()}`. "
                     f"Import from {replacement} instead."
                 )
+
+
+def ban_direct_datahub_imports(dirs: List[pathlib.Path]) -> None:
+    # We also want to ban all direct imports of datahub.
+    # The base `datahub` package is used to export public-facing classes.
+    # If we import it directly, we'll likely end up with circular imports.
+
+    banned_strings = [
+        r"^import datahub[\s$]",
+        r"^from datahub import",
+    ]
+    ignored_files = {
+        __file__,
+    }
+    for dir in dirs:
+        for file in dir.rglob("*.py"):
+            if str(file) in ignored_files:
+                continue
+
+            file_contents = file.read_text()
+
+            for banned_string in banned_strings:
+                if re.search(banned_string, file_contents, re.MULTILINE):
+                    raise ValueError(
+                        f"Disallowed bare datahub import found in {file}. "
+                        f"Do not import datahub directly; instead import from the underlying file."
+                    )
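
For context on what this new check rejects, the first import style in the snippet below is what ban_direct_datahub_imports flags, and the second is the replacement pattern adopted elsewhere in this release (see the telemetry.py and upgrade.py hunks). This is an illustrative snippet, not code from the package.

# Flagged: a bare import of the top-level package (matches r"^from datahub import").
# from datahub import __version__

# Preferred: import from the module that actually defines the symbol.
from datahub._version import __version__, nice_version_name

print(nice_version_name())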
datahub/upgrade/upgrade.py
@@ -10,7 +10,7 @@ import humanfriendly
 from packaging.version import Version
 from pydantic import BaseModel

-from datahub import __version__
+from datahub._version import __version__
 from datahub.cli.config_utils import load_client_config
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.utilities.perf_timer import PerfTimer
@@ -55,11 +55,19 @@ async def get_client_version_stats():
         async with session.get(pypi_url) as resp:
             response_json = await resp.json()
             try:
-                releases = response_json.get("releases", [])
-                sorted_releases = sorted(releases.keys(), key=lambda x: Version(x))
-                latest_cli_release_string = [
-                    x for x in sorted_releases if "rc" not in x
-                ][-1]
+                releases = response_json.get("releases", {})
+                filtered_releases = {
+                    version: release_files
+                    for version, release_files in releases.items()
+                    if not all(
+                        release_file.get("yanked") for release_file in release_files
+                    )
+                    and "rc" not in version
+                }
+                sorted_releases = sorted(
+                    filtered_releases.keys(), key=lambda x: Version(x)
+                )
+                latest_cli_release_string = sorted_releases[-1]
                 latest_cli_release = Version(latest_cli_release_string)
                 current_version_info = releases.get(current_version_string)
                 current_version_date = None
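
The upgrade-check change above now skips PyPI versions whose uploaded files are all yanked, as well as release candidates, before picking the latest. A standalone sketch of the same filtering logic against the public PyPI JSON API is below, using synchronous requests instead of the aiohttp session in upgrade.py so the shape of the data is easier to see.

import requests
from packaging.version import Version

resp = requests.get("https://pypi.org/pypi/acryl-datahub/json", timeout=10)
releases = resp.json().get("releases", {})  # {"0.15.0.5": [{"yanked": false, ...}, ...], ...}

candidates = {
    version: files
    for version, files in releases.items()
    if not all(f.get("yanked") for f in files)  # drop versions whose files are all yanked
    and "rc" not in version  # drop release candidates
}

latest = sorted(candidates, key=Version)[-1]
print(latest)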
@@ -93,11 +101,11 @@ async def get_github_stats():
     async with aiohttp.ClientSession(
         headers={"Accept": "application/vnd.github.v3+json"}
     ) as session:
-        gh_url = "https://api.github.com/repos/datahub-project/datahub/releases"
+        gh_url = "https://api.github.com/repos/datahub-project/datahub/releases/latest"
         async with session.get(gh_url) as gh_response:
             gh_response_json = await gh_response.json()
-            latest_server_version = Version(gh_response_json[0].get("tag_name"))
-            latest_server_date = gh_response_json[0].get("published_at")
+            latest_server_version = Version(gh_response_json.get("tag_name"))
+            latest_server_date = gh_response_json.get("published_at")
             return (latest_server_version, latest_server_date)

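
The GitHub change switches from listing all releases to the /releases/latest endpoint, which returns a single release object rather than an array. A minimal async sketch against that endpoint, mirroring the aiohttp usage in upgrade.py (the repository URL is real; error handling is omitted and the function name is made up):

import asyncio
from typing import Tuple

import aiohttp
from packaging.version import Version


async def get_latest_server_release() -> Tuple[Version, str]:
    # /releases/latest returns one release object, unlike /releases which returns a list.
    url = "https://api.github.com/repos/datahub-project/datahub/releases/latest"
    async with aiohttp.ClientSession(
        headers={"Accept": "application/vnd.github.v3+json"}
    ) as session:
        async with session.get(url) as resp:
            data = await resp.json()
    return Version(data["tag_name"]), data["published_at"]


print(asyncio.run(get_latest_server_release()))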