acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (78)
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
datahub/cli/delete_cli.py CHANGED
@@ -18,8 +18,6 @@ from datahub.emitter.aspect import ASPECT_MAP, TIMESERIES_ASPECT_MAP
18
18
  from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
19
19
  from datahub.ingestion.graph.config import ClientMode
20
20
  from datahub.ingestion.graph.filters import RemovedStatusFilter
21
- from datahub.telemetry import telemetry
22
- from datahub.upgrade import upgrade
23
21
  from datahub.utilities.perf_timer import PerfTimer
24
22
  from datahub.utilities.urns.urn import guess_entity_type
25
23
 
@@ -116,7 +114,6 @@ class DeletionResult:
116
114
  help="specifies soft/hard deletion",
117
115
  )
118
116
  @click.option("-n", "--dry-run", required=False, is_flag=True)
119
- @telemetry.with_telemetry()
120
117
  def by_registry(
121
118
  registry_id: str,
122
119
  soft: bool,
@@ -171,7 +168,6 @@ def by_registry(
171
168
  @click.option(
172
169
  "-f", "--force", required=False, is_flag=True, help="force the delete if set"
173
170
  )
174
- @telemetry.with_telemetry()
175
171
  def references(urn: str, dry_run: bool, force: bool) -> None:
176
172
  """
177
173
  Delete all references to an entity (but not the entity itself).
@@ -369,8 +365,6 @@ def undo_by_filter(
369
365
  @click.option(
370
366
  "--workers", type=int, default=1, help="Num of workers to use for deletion."
371
367
  )
372
- @upgrade.check_upgrade
373
- @telemetry.with_telemetry()
374
368
  def by_filter(
375
369
  urn: Optional[str],
376
370
  urn_file: Optional[str],
@@ -1,8 +1,9 @@
1
1
  import enum
2
2
  import os
3
+ import pathlib
3
4
  from contextlib import contextmanager
4
5
  from dataclasses import dataclass
5
- from typing import Any, Dict, Iterator, List, Optional
6
+ from typing import Any, Dict, Iterator, List, Optional, Set
6
7
 
7
8
  import docker
8
9
  import docker.errors
@@ -13,6 +14,7 @@ from datahub.configuration.common import ExceptionWithProps
13
14
 
14
15
  # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
15
16
  MIN_MEMORY_NEEDED = 3.8 # GB
17
+ MIN_DISK_SPACE_NEEDED = 12 # GB
16
18
 
17
19
  DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
18
20
  DATAHUB_COMPOSE_PROJECT_FILTER = {
@@ -37,6 +39,10 @@ class DockerLowMemoryError(Exception):
37
39
  SHOW_STACK_TRACE = False
38
40
 
39
41
 
42
+ class DockerLowDiskSpaceError(Exception):
43
+ SHOW_STACK_TRACE = False
44
+
45
+
40
46
  class DockerComposeVersionError(Exception):
41
47
  SHOW_STACK_TRACE = False
42
48
 
@@ -102,6 +108,24 @@ def run_quickstart_preflight_checks(client: docker.DockerClient) -> None:
102
108
  "You can increase the memory allocated to Docker in the Docker settings."
103
109
  )
104
110
 
111
+ result = client.containers.run(
112
+ "alpine:latest",
113
+ "sh -c \"df -B1 / | tail -1 | awk '{print $2, $4}'\"", # total, available
114
+ remove=True,
115
+ stdout=True,
116
+ stderr=True,
117
+ )
118
+
119
+ output = result.decode("utf-8").strip()
120
+ total_bytes, available_bytes = map(int, output.split())
121
+
122
+ available_gb = available_bytes / (1024**3)
123
+ if available_gb < MIN_DISK_SPACE_NEEDED:
124
+ raise DockerLowDiskSpaceError(
125
+ f"Total Docker disk space available {available_gb:.2f}GB is below the minimum threshold {MIN_DISK_SPACE_NEEDED}GB. "
126
+ "You can increase the disk space allocated to Docker in the Docker settings or free up disk space`"
127
+ )
128
+
105
129
 
106
130
  class ContainerStatus(enum.Enum):
107
131
  OK = "is ok"
@@ -126,10 +150,24 @@ class DockerContainerStatus:
126
150
  @dataclass
127
151
  class QuickstartStatus:
128
152
  containers: List[DockerContainerStatus]
153
+ volumes: Set[str]
154
+ # On moving to compose profiles, this CLI will no longer support running quickstart instances from earlier versions.
155
+ # While the check command can still work, upgrades of such instances are not supported.
156
+ running_unsupported_version: bool
157
+
158
+ def __init__(
159
+ self,
160
+ containers: List[DockerContainerStatus],
161
+ volumes: List[str],
162
+ running_unsupported_version: bool = False,
163
+ ):
164
+ self.containers = containers
165
+ self.running_unsupported_version = running_unsupported_version
166
+ self.volumes = set(volumes)
129
167
 
130
168
  def errors(self) -> List[str]:
131
169
  if not self.containers:
132
- return ["quickstart.sh or dev.sh is not running"]
170
+ return ["datahub is not running"]
133
171
 
134
172
  return [
135
173
  f"{container.name} {container.status.value}"
@@ -176,6 +214,26 @@ class QuickstartStatus:
176
214
  },
177
215
  )
178
216
 
217
+ def get_containers(self) -> Set[str]:
218
+ if self.containers:
219
+ return {container.name for container in self.containers}
220
+ else:
221
+ return set()
222
+
223
+
224
+ def detect_legacy_quickstart_compose(containers: Set[str]) -> bool:
225
+ return "zookeeper" in containers
226
+
227
+
228
+ def _get_services_from_compose(compose_file: str) -> Set[str]:
229
+ with open(compose_file) as config_file:
230
+ return yaml.safe_load(config_file).get("services", {}).keys()
231
+
232
+
233
+ def _get_volumes_from_compose(compose_file: str) -> Set[str]:
234
+ with open(compose_file) as config_file:
235
+ return yaml.safe_load(config_file).get("volumes", {}).keys()
236
+
179
237
 
180
238
  def check_docker_quickstart() -> QuickstartStatus:
181
239
  container_statuses: List[DockerContainerStatus] = []
@@ -188,7 +246,7 @@ def check_docker_quickstart() -> QuickstartStatus:
188
246
  ignore_removed=True,
189
247
  )
190
248
  if len(containers) == 0:
191
- return QuickstartStatus([])
249
+ return QuickstartStatus([], [], running_unsupported_version=False)
192
250
 
193
251
  # load the expected containers from the docker-compose file
194
252
  config_files = (
@@ -197,16 +255,17 @@ def check_docker_quickstart() -> QuickstartStatus:
197
255
  .split(",")
198
256
  )
199
257
 
200
- # If using profiles, alternative check
258
+ # If using profiles, alternative check ##TODO: Does this really work? Check mixpanel for usage of this.
201
259
  if config_files and "/profiles/" in config_files[0]:
202
260
  return check_docker_quickstart_profiles(client)
203
261
 
204
262
  all_containers = set()
205
263
  for config_file in config_files:
206
- with open(config_file) as config_file:
207
- all_containers.update(
208
- yaml.safe_load(config_file).get("services", {}).keys()
209
- )
264
+ all_containers.update(_get_services_from_compose(config_file))
265
+
266
+ all_volumes = set()
267
+ for config_file in config_files:
268
+ all_volumes.update(_get_volumes_from_compose(config_file))
210
269
 
211
270
  existing_containers = set()
212
271
  # Check that the containers are running and healthy.
@@ -240,8 +299,12 @@ def check_docker_quickstart() -> QuickstartStatus:
240
299
  container_statuses.append(
241
300
  DockerContainerStatus(missing, ContainerStatus.MISSING)
242
301
  )
243
-
244
- return QuickstartStatus(container_statuses)
302
+ running_unsupported_version = detect_legacy_quickstart_compose(all_containers)
303
+ return QuickstartStatus(
304
+ containers=container_statuses,
305
+ volumes=list(all_volumes),
306
+ running_unsupported_version=running_unsupported_version,
307
+ )
245
308
 
246
309
 
247
310
  def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartStatus:
@@ -254,7 +317,7 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
254
317
  ignore_removed=True,
255
318
  )
256
319
  if len(containers) == 0:
257
- return QuickstartStatus([])
320
+ return QuickstartStatus([], [], running_unsupported_version=False)
258
321
 
259
322
  existing_containers = set()
260
323
  # Check that the containers are running and healthy.
@@ -273,4 +336,36 @@ def check_docker_quickstart_profiles(client: docker.DockerClient) -> QuickstartS
273
336
 
274
337
  container_statuses.append(DockerContainerStatus(name, status))
275
338
 
276
- return QuickstartStatus(container_statuses)
339
+ # TODO: Can this be handled with older versions?
340
+ return QuickstartStatus(
341
+ container_statuses, volumes=[], running_unsupported_version=False
342
+ )
343
+
344
+
345
+ def check_upgrade_supported(
346
+ quickstart_compose_file: List[pathlib.Path], quickstart_status: QuickstartStatus
347
+ ) -> bool:
348
+ if (
349
+ quickstart_status.running_unsupported_version
350
+ ): # we detected a legacy quickstart service
351
+ return False
352
+
353
+ if not quickstart_status.get_containers(): # no containers are running
354
+ return True
355
+
356
+ compose_services = set()
357
+ compose_volumes = set()
358
+
359
+ for compose_file in quickstart_compose_file:
360
+ compose_services.update(_get_services_from_compose(str(compose_file)))
361
+ compose_volumes.update(_get_volumes_from_compose(str(compose_file)))
362
+
363
+ # if all services and volumes are not the same, the state in the volumes may not be compatible with the new services.
364
+ # We are checking for containers and volumes per the compose file, not necessarily all of them being present
365
+ if (
366
+ compose_services == quickstart_status.get_containers()
367
+ and compose_volumes == quickstart_status.volumes
368
+ ):
369
+ return True
370
+ else:
371
+ return False