acryl-datahub 1.0.0rc11__py3-none-any.whl → 1.0.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

acryl_datahub-1.0.0rc13.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=m3vMOf1XXwW_i72T14wHeXSyYmTku5A-KQz7nxQXArM,322
+ datahub/_version.py,sha256=W5gCw-PvGPeNClWZ5wYkU1EO0af_2kWCyO3nFe4JtkE,322
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,7 +60,7 @@ datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0
  datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
  datahub/api/graphql/operation.py,sha256=h7OXbVRrpJgoth1X4cgeIFhD5JY1MGKg2KjVlQK1gqE,5116
  datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/cli/check_cli.py,sha256=ajrWVMAHYbgvYi4OFitFXx7Y6oigvZFgIeUiKV9ECik,12859
+ datahub/cli/check_cli.py,sha256=GpAM7k1GMIIE7zQ6GHnu_78aSc4mPiEu2BaKAsYGPkA,14310
  datahub/cli/cli_utils.py,sha256=2uvPv6WqxbRdH7UteHwhRash4E0ncU5P6XebrFLeECo,13584
  datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
  datahub/cli/container_cli.py,sha256=uDOwewGEPYHQt-ppYEb8ESXhZjPNIZG0Rt3cm2FzPqc,1569
@@ -204,7 +204,7 @@ datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suab
  datahub/ingestion/source/ldap.py,sha256=CNr3foofIpoCXu_GGqfcajlQE2qkHr5isYwVcDutdkk,18695
  datahub/ingestion/source/metabase.py,sha256=j8DRV2GvisezidL1JZ5HJLF_hdFdtvaoyDoEdEyh0Ks,32603
  datahub/ingestion/source/mlflow.py,sha256=cqQivSyrptm15vn--xbT7eTRHJJVKMmQpoVqfzuDIDU,12858
- datahub/ingestion/source/mode.py,sha256=w85zCIZicfABx5dKCupsGpH1tgUMhS1El-jIWa2gwNU,63632
+ datahub/ingestion/source/mode.py,sha256=6WJKukK4VbNZwc5UM200iMlP_Chiwx8y2jFoclWgy0U,64044
  datahub/ingestion/source/mongodb.py,sha256=2C2Cxn8DXL53IbNiywIuKt8UT_EMcPg9f8su-OPSNGU,21237
  datahub/ingestion/source/nifi.py,sha256=w5TPnqPmpotvzSsJROi6nUiHWPUVC6u1g0CzXIE6FNs,56903
  datahub/ingestion/source/openapi.py,sha256=39ep3etbWh8NBPjTXXwH3mieC5P6bMVAjhvK7UvcTis,17372
@@ -215,7 +215,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
  datahub/ingestion/source/salesforce.py,sha256=d56tfYqg1rGDvMkLznmBJII55B1Zs8XTaQrrW-wHdLo,32679
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=jrYsmLHR1ZZpl0JBcNjETt6QODmSdtP94HXVWt6pqn8,31004
+ datahub/ingestion/source/superset.py,sha256=zPUeVMCEhFXFY2PFOpgmZvhzELdXzKPGYvbs6gAZfWs,31019
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -1022,9 +1022,9 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0rc11.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
- acryl_datahub-1.0.0rc11.dist-info/METADATA,sha256=hZCrduEZ7Qqkr76OUpdPLHm7AApR7AQHEaKKYq9uJZE,175337
- acryl_datahub-1.0.0rc11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- acryl_datahub-1.0.0rc11.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
- acryl_datahub-1.0.0rc11.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0rc11.dist-info/RECORD,,
+ acryl_datahub-1.0.0rc13.dist-info/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0rc13.dist-info/METADATA,sha256=WIJxT5ufCOmAm3SU_GD0eN2OnkHcEc1_RZfGGEC1S7U,175337
+ acryl_datahub-1.0.0rc13.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+ acryl_datahub-1.0.0rc13.dist-info/entry_points.txt,sha256=U1e5ZwqPX1OaIbvGrwvozcdB8SbzFYXQM7plpdLKKeo,9592
+ acryl_datahub-1.0.0rc13.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0rc13.dist-info/RECORD,,
acryl_datahub-1.0.0rc13.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.2)
+ Generator: setuptools (76.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0rc11"
+ __version__ = "1.0.0rc13"



  def is_dev_mode() -> bool:
datahub/cli/check_cli.py CHANGED
@@ -5,7 +5,8 @@ import pathlib
  import pprint
  import shutil
  import tempfile
- from typing import Dict, List, Optional, Union
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Union

  import click

@@ -20,7 +21,10 @@ from datahub.ingestion.sink.sink_registry import sink_registry
  from datahub.ingestion.source.source_registry import source_registry
  from datahub.ingestion.transformer.transform_registry import transform_registry
  from datahub.telemetry import telemetry
- from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
+ from datahub.utilities.file_backed_collections import (
+     ConnectionWrapper,
+     FileBackedDict,
+ )

  logger = logging.getLogger(__name__)

@@ -391,29 +395,78 @@ def test_path_spec(config: str, input: str, path_spec_key: str) -> None:
          raise e


+ def _jsonify(data: Any) -> Any:
+     if dataclasses.is_dataclass(data):
+         # dataclasses.asdict() is recursive. We're doing the recursion
+         # manually here via _jsonify calls, so we can't use
+         # dataclasses.asdict() here.
+         return {
+             f.name: _jsonify(getattr(data, f.name)) for f in dataclasses.fields(data)
+         }
+     elif isinstance(data, list):
+         return [_jsonify(item) for item in data]
+     elif isinstance(data, dict):
+         return {_jsonify(k): _jsonify(v) for k, v in data.items()}
+     elif isinstance(data, datetime):
+         return data.isoformat()
+     else:
+         return data
+
+
  @check.command()
- @click.argument("query-log-file", type=click.Path(exists=True, dir_okay=False))
- @click.option("--output", type=click.Path())
- def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
+ @click.argument("db-file", type=click.Path(exists=True, dir_okay=False))
+ def extract_sql_agg_log(db_file: str) -> None:
      """Convert a sqlite db generated by the SqlParsingAggregator into a JSON."""

-     from datahub.sql_parsing.sql_parsing_aggregator import LoggedQuery
+     if pathlib.Path(db_file).suffix != ".db":
+         raise click.UsageError("DB file must be a sqlite db")
+
+     output_dir = pathlib.Path(db_file).with_suffix("")
+     output_dir.mkdir(exist_ok=True)
+
+     shared_connection = ConnectionWrapper(pathlib.Path(db_file))
+
+     tables: List[str] = [
+         row[0]
+         for row in shared_connection.execute(
+             """\
+             SELECT
+                 name
+             FROM
+                 sqlite_schema
+             WHERE
+                 type ='table' AND
+                 name NOT LIKE 'sqlite_%';
+             """,
+             parameters={},
+         )
+     ]
+     logger.info(f"Extracting {len(tables)} tables from {db_file}: {tables}")
+
+     for table in tables:
+         table_output_path = output_dir / f"{table}.json"
+         if table_output_path.exists():
+             logger.info(f"Skipping {table_output_path} because it already exists")
+             continue

-     assert dataclasses.is_dataclass(LoggedQuery)
+         # Some of the tables might actually be FileBackedList. Because
+         # the list is built on top of the FileBackedDict, we don't
+         # need to distinguish between the two cases.

-     shared_connection = ConnectionWrapper(pathlib.Path(query_log_file))
-     query_log = FileBackedList[LoggedQuery](
-         shared_connection=shared_connection, tablename="stored_queries"
-     )
-     logger.info(f"Extracting {len(query_log)} queries from {query_log_file}")
-     queries = [dataclasses.asdict(query) for query in query_log]
+         table_data: FileBackedDict[Any] = FileBackedDict(
+             shared_connection=shared_connection, tablename=table
+         )

-     if output:
-         with open(output, "w") as f:
-             json.dump(queries, f, indent=2, default=str)
-         logger.info(f"Extracted {len(queries)} queries to {output}")
-     else:
-         click.echo(json.dumps(queries, indent=2))
+         data = {}
+         with click.progressbar(
+             table_data.items(), length=len(table_data), label=f"Extracting {table}"
+         ) as items:
+             for k, v in items:
+                 data[k] = _jsonify(v)
+
+         with open(table_output_path, "w") as f:
+             json.dump(data, f, indent=2, default=str)
+         logger.info(f"Extracted {len(data)} entries to {table_output_path}")


  @check.command()
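
The reworked extract_sql_agg_log command above now walks every user table in the aggregator's sqlite file instead of a single hard-coded query log, and serializes each entry with the _jsonify helper. For readers who want to poke at such a file without DataHub's FileBackedDict, here is a minimal stdlib-only sketch of the same pattern: enumerate the tables, then dump each one to JSON, recursing through dataclasses and datetimes the way _jsonify does. It reads raw rows rather than the deserialized objects the real command produces, and the db path at the bottom is hypothetical.

# Minimal stdlib-only sketch (not DataHub code): list the user tables in a
# sqlite file and write one JSON file per table, mirroring the _jsonify
# recursion from the diff above.
import dataclasses
import json
import pathlib
import sqlite3
from datetime import datetime
from typing import Any


def jsonify(value: Any) -> Any:
    # Recurse through dataclasses, lists, and dicts; render datetimes as ISO strings.
    if dataclasses.is_dataclass(value) and not isinstance(value, type):
        return {f.name: jsonify(getattr(value, f.name)) for f in dataclasses.fields(value)}
    if isinstance(value, list):
        return [jsonify(item) for item in value]
    if isinstance(value, dict):
        return {jsonify(k): jsonify(v) for k, v in value.items()}
    if isinstance(value, datetime):
        return value.isoformat()
    return value


def dump_tables(db_file: str) -> None:
    db_path = pathlib.Path(db_file)
    output_dir = db_path.with_suffix("")
    output_dir.mkdir(exist_ok=True)

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row  # so each row can be turned into a dict
    # sqlite_master is the portable spelling; sqlite_schema (used in the diff)
    # is an alias available from SQLite 3.33 onwards.
    tables = [
        row["name"]
        for row in conn.execute(
            "SELECT name FROM sqlite_master WHERE type = 'table' AND name NOT LIKE 'sqlite_%'"
        )
    ]
    for table in tables:
        rows = [jsonify(dict(row)) for row in conn.execute(f'SELECT * FROM "{table}"')]
        (output_dir / f"{table}.json").write_text(json.dumps(rows, indent=2, default=str))


if __name__ == "__main__":
    dump_tables("sql_agg_log.db")  # hypothetical path
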
datahub/ingestion/source/mode.py CHANGED
@@ -159,7 +159,12 @@ class ModeConfig(
      )

      workspace: str = Field(
-         description="The Mode workspace name. Find it in Settings > Workspace > Details."
+         description="The Mode workspace username. If you navigate to Workspace Settings > Details, "
+         "the url will be `https://app.mode.com/organizations/<workspace-username>`. "
+         # The lowercase comment is derived from a comment in a Mode API example.
+         # https://mode.com/developer/api-cookbook/management/get-all-reports/
+         # > "Note: workspace_name value should be all lowercase"
+         "This is distinct from the workspace's display name, and should be all lowercase."
      )
      _default_schema = pydantic_removed_field("default_schema")

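
As a small illustration of the clarified description, the hypothetical helper below (not part of the connector) derives a `workspace` value from the Workspace Settings URL that the docstring points at and lowercases it, per the Mode API note quoted in the comments above.

# Hypothetical helper, stdlib only: turn the Workspace Settings URL into the
# lowercase workspace username that ModeConfig.workspace expects.
from urllib.parse import urlparse


def workspace_from_settings_url(url: str) -> str:
    # e.g. "https://app.mode.com/organizations/Acme-Analytics" -> "acme-analytics"
    slug = urlparse(url).path.rstrip("/").rsplit("/", 1)[-1]
    return slug.lower()  # the workspace username, all lowercase (not the display name)


assert (
    workspace_from_settings_url("https://app.mode.com/organizations/Acme-Analytics")
    == "acme-analytics"
)
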
datahub/ingestion/source/superset.py CHANGED
@@ -384,7 +384,7 @@ class SupersetSource(StatefulIngestionSourceBase):
      ) -> DashboardSnapshot:
          dashboard_urn = make_dashboard_urn(
              platform=self.platform,
-             name=dashboard_data["id"],
+             name=str(dashboard_data["id"]),
              platform_instance=self.config.platform_instance,
          )
          dashboard_snapshot = DashboardSnapshot(
@@ -416,7 +416,7 @@ class SupersetSource(StatefulIngestionSourceBase):
          chart_urns.append(
              make_chart_urn(
                  platform=self.platform,
-                 name=value.get("meta", {}).get("chartId", "unknown"),
+                 name=str(value.get("meta", {}).get("chartId", "unknown")),
                  platform_instance=self.config.platform_instance,
              )
          )
@@ -499,7 +499,7 @@ class SupersetSource(StatefulIngestionSourceBase):
      def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
          chart_urn = make_chart_urn(
              platform=self.platform,
-             name=chart_data["id"],
+             name=str(chart_data["id"]),
              platform_instance=self.config.platform_instance,
          )
          chart_snapshot = ChartSnapshot(
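
The three superset.py hunks are the same one-character fix: Superset's REST API returns dashboard and chart ids as integers, and the urn builders take the entity name as a string, so the ids are now stringified before the urn is constructed. Below is a minimal sketch of the effect, assuming the builders come from datahub.emitter.mce_builder (as in other sources) and using made-up payload fragments.

# Sketch only: hypothetical payload fragments showing why the str() casts matter.
from datahub.emitter.mce_builder import make_chart_urn, make_dashboard_urn

dashboard_data = {"id": 42}                # Superset returns numeric ids
position_value = {"meta": {"chartId": 7}}  # hypothetical dashboard position entry

dashboard_urn = make_dashboard_urn(
    platform="superset",
    name=str(dashboard_data["id"]),        # previously the raw int was passed through
    platform_instance=None,
)
chart_urn = make_chart_urn(
    platform="superset",
    name=str(position_value.get("meta", {}).get("chartId", "unknown")),
    platform_instance=None,
)
print(dashboard_urn)
print(chart_urn)
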