datacontract-cli 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datacontract-cli might be problematic.

Files changed (36)
  1. datacontract/breaking/breaking_rules.py +4 -0
  2. datacontract/catalog/catalog.py +76 -0
  3. datacontract/cli.py +39 -3
  4. datacontract/data_contract.py +12 -1
  5. datacontract/engines/fastjsonschema/check_jsonschema.py +1 -2
  6. datacontract/engines/soda/check_soda_execute.py +9 -15
  7. datacontract/engines/soda/connections/duckdb.py +83 -14
  8. datacontract/engines/soda/connections/kafka.py +108 -105
  9. datacontract/export/avro_idl_converter.py +1 -2
  10. datacontract/export/dbt_converter.py +1 -2
  11. datacontract/export/great_expectations_converter.py +1 -2
  12. datacontract/export/html_export.py +3 -2
  13. datacontract/export/jsonschema_converter.py +1 -2
  14. datacontract/export/odcs_converter.py +1 -2
  15. datacontract/export/rdf_converter.py +1 -1
  16. datacontract/export/sodacl_converter.py +1 -2
  17. datacontract/export/terraform_converter.py +1 -2
  18. datacontract/imports/avro_importer.py +1 -2
  19. datacontract/imports/glue_importer.py +183 -0
  20. datacontract/imports/sql_importer.py +20 -9
  21. datacontract/integration/publish_opentelemetry.py +3 -6
  22. datacontract/lint/linters/example_model_linter.py +1 -2
  23. datacontract/lint/linters/field_pattern_linter.py +1 -2
  24. datacontract/lint/linters/notice_period_linter.py +1 -2
  25. datacontract/lint/linters/quality_schema_linter.py +1 -2
  26. datacontract/lint/resolve.py +9 -6
  27. datacontract/model/data_contract_specification.py +2 -0
  28. datacontract/templates/datacontract.html +76 -21
  29. datacontract/templates/index.html +168 -0
  30. datacontract/templates/style/output.css +113 -4
  31. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/METADATA +180 -102
  32. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/RECORD +36 -33
  33. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/LICENSE +0 -0
  34. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/WHEEL +0 -0
  35. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/entry_points.txt +0 -0
  36. {datacontract_cli-0.10.1.dist-info → datacontract_cli-0.10.3.dist-info}/top_level.txt +0 -0
datacontract/breaking/breaking_rules.py CHANGED
@@ -20,6 +20,10 @@ class BreakingRules:
     field_ref_removed = Severity.WARNING
     field_ref_updated = Severity.WARNING
 
+    field_title_added = Severity.INFO
+    field_title_removed = Severity.INFO
+    field_title_updated = Severity.INFO
+
     field_type_added = Severity.WARNING
     field_type_removed = Severity.WARNING
     field_type_updated = Severity.ERROR
datacontract/catalog/catalog.py ADDED
@@ -0,0 +1,76 @@
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+
+import pytz
+from jinja2 import PackageLoader, Environment, select_autoescape
+
+from datacontract.data_contract import DataContract
+from datacontract.export.html_export import get_version
+from datacontract.model.data_contract_specification import \
+    DataContractSpecification
+
+
+def create_data_contract_html(contracts, file: Path, path: Path):
+    data_contract = DataContract(data_contract_file=f"{file.absolute()}", inline_definitions=True, inline_quality=True)
+    html = data_contract.export(export_format="html")
+    spec = data_contract.get_data_contract_specification()
+    file_without_suffix = file.with_suffix(".html")
+    html_filepath = path / file_without_suffix
+    html_filepath.parent.mkdir(parents=True, exist_ok=True)
+    with open(html_filepath, "w") as f:
+        f.write(html)
+    contracts.append(
+        DataContractView(
+            html_filepath=html_filepath,
+            html_link=file_without_suffix,
+            spec=spec,
+        )
+    )
+    print(f"Created {html_filepath}")
+
+
+@dataclass
+class DataContractView:
+    """Class for keeping track of an item in inventory."""
+
+    html_filepath: Path
+    html_link: Path
+    spec: DataContractSpecification
+
+
+def create_index_html(contracts, path):
+    index_filepath = path / "index.html"
+    with open(index_filepath, "w") as f:
+        # Load templates from templates folder
+        package_loader = PackageLoader("datacontract", "templates")
+        env = Environment(
+            loader=package_loader,
+            autoescape=select_autoescape(
+                enabled_extensions="html",
+                default_for_string=True,
+            ),
+        )
+
+        # Load the required template
+        # needs to be included in /MANIFEST.in
+        template = env.get_template("index.html")
+
+        # needs to be included in /MANIFEST.in
+        style_content, _, _ = package_loader.get_source(env, "style/output.css")
+
+        tz = pytz.timezone("UTC")
+        now = datetime.now(tz)
+        formatted_date = now.strftime("%d %b %Y %H:%M:%S UTC")
+        datacontract_cli_version = get_version()
+
+        # Render the template with necessary data
+        html_string = template.render(
+            style=style_content,
+            formatted_date=formatted_date,
+            datacontract_cli_version=datacontract_cli_version,
+            contracts=contracts,
+            contracts_size=len(contracts),
+        )
+        f.write(html_string)
+    print(f"Created {index_filepath}")
datacontract/cli.py CHANGED
@@ -1,5 +1,6 @@
 from enum import Enum
 from importlib import metadata
+from pathlib import Path
 from typing import Iterable, Optional
 
 import typer
@@ -10,6 +11,8 @@ from rich.table import Table
 from typer.core import TyperGroup
 from typing_extensions import Annotated
 
+from datacontract.catalog.catalog import create_index_html, \
+    create_data_contract_html
 from datacontract.data_contract import DataContract
 from datacontract.init.download_datacontract_file import \
     download_datacontract_file, FileExistsException
@@ -160,6 +163,7 @@ class ExportFormat(str, Enum):
 @app.command()
 def export(
     format: Annotated[ExportFormat, typer.Option(help="The export format.")],
+    output: Annotated[Path, typer.Option(help="Specify the file path where the exported data will be saved. If no path is provided, the output will be printed to stdout.")] = None,
     server: Annotated[str, typer.Option(help="The server name to export.")] = None,
     model: Annotated[
         str,
@@ -169,10 +173,12 @@
             "models (default)."
         ),
     ] = "all",
+    # TODO: this should be a subcommand
     rdf_base: Annotated[
         Optional[str],
         typer.Option(help="[rdf] The base URI used to generate the RDF graph.", rich_help_panel="RDF Options"),
     ] = None,
+    # TODO: this should be a subcommand
     sql_server_type: Annotated[
         Optional[str],
         typer.Option(
@@ -195,26 +201,56 @@
         sql_server_type=sql_server_type,
     )
     # Don't interpret console markup in output.
-    console.print(result, markup=False)
+    if output is None:
+        console.print(result, markup=False)
+    else:
+        with output.open('w') as f:
+            f.write(result)
+        console.print(f"Written result to {output}")
 
 
 class ImportFormat(str, Enum):
     sql = "sql"
     avro = "avro"
+    glue = "glue"
 
 
 @app.command(name="import")
 def import_(
     format: Annotated[ImportFormat, typer.Option(help="The format of the source file.")],
-    source: Annotated[str, typer.Option(help="The path to the file that should be imported.")],
+    source: Annotated[str, typer.Option(help="The path to the file or Glue Database that should be imported.")],
 ):
     """
-    Create a data contract from the given source file. Prints to stdout.
+    Create a data contract from the given source location. Prints to stdout.
     """
     result = DataContract().import_from_source(format, source)
     console.print(result.to_yaml())
 
 
+@app.command(name="catalog")
+def catalog(
+    files: Annotated[
+        Optional[str], typer.Option(help="Glob pattern for the data contract files to include in the catalog.")
+    ] = "*.yaml",
+    output: Annotated[Optional[str], typer.Option(help="Output directory for the catalog html files.")] = "catalog/",
+):
+    """
+    Create an html catalog of data contracts.
+    """
+    path = Path(output)
+    path.mkdir(parents=True, exist_ok=True)
+    console.print(f"Created {output}")
+
+    contracts = []
+    for file in Path().glob(files):
+        try:
+            create_data_contract_html(contracts, file, path)
+        except Exception as e:
+            console.print(f"Skipped {file} due to error: {e}")
+
+    create_index_html(contracts, path)
+
+
 @app.command()
 def breaking(
     location_old: Annotated[str, typer.Argument(help="The location (url or path) of the old data contract yaml.")],
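
Taken together, the CLI changes add an --output option to export, a glue import format, and the new catalog command shown above. The import path is a thin wrapper around DataContract.import_from_source; a hedged sketch of calling it programmatically ("my_glue_database" is a placeholder name, and AWS credentials are assumed to be available in the environment):

    from datacontract.data_contract import DataContract

    # "glue" routes to import_glue() in data_contract.py (next section);
    # "my_glue_database" is a hypothetical AWS Glue database name
    spec = DataContract().import_from_source("glue", "my_glue_database")
    print(spec.to_yaml())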
datacontract/data_contract.py CHANGED
@@ -30,6 +30,7 @@ from datacontract.export.sodacl_converter import to_sodacl_yaml
 from datacontract.export.sql_converter import to_sql_ddl, to_sql_query
 from datacontract.export.terraform_converter import to_terraform
 from datacontract.imports.avro_importer import import_avro
+from datacontract.imports.glue_importer import import_glue
 from datacontract.imports.sql_importer import import_sql
 from datacontract.integration.publish_datamesh_manager import \
     publish_datamesh_manager
@@ -66,6 +67,7 @@ class DataContract:
         publish_to_opentelemetry: bool = False,
         spark: SparkSession = None,
         inline_definitions: bool = False,
+        inline_quality: bool = False,
     ):
         self._data_contract_file = data_contract_file
         self._data_contract_str = data_contract_str
@@ -77,6 +79,7 @@
         self._publish_to_opentelemetry = publish_to_opentelemetry
         self._spark = spark
         self._inline_definitions = inline_definitions
+        self._inline_quality = inline_quality
         self.all_linters = {
             ExampleModelLinter(),
             QualityUsesSchemaLinter(),
@@ -105,6 +108,7 @@
             self._data_contract,
             self._schema_location,
             inline_definitions=True,
+            inline_quality=True,
         )
         run.checks.append(
             Check(type="lint", result="passed", name="Data contract is syntactically valid", engine="datacontract")
@@ -273,11 +277,16 @@
             data_contract=self._data_contract,
             schema_location=self._schema_location,
             inline_definitions=self._inline_definitions,
+            inline_quality=self._inline_quality,
         )
 
     def export(self, export_format, model: str = "all", rdf_base: str = None, sql_server_type: str = "auto") -> str:
         data_contract = resolve.resolve_data_contract(
-            self._data_contract_file, self._data_contract_str, self._data_contract, inline_definitions=True
+            self._data_contract_file,
+            self._data_contract_str,
+            self._data_contract,
+            inline_definitions=True,
+            inline_quality=True,
         )
         if export_format == "jsonschema":
             if data_contract.models is None:
@@ -482,6 +491,8 @@
             data_contract_specification = import_sql(data_contract_specification, format, source)
         elif format == "avro":
             data_contract_specification = import_avro(data_contract_specification, source)
+        elif format == "glue":
+            data_contract_specification = import_glue(data_contract_specification, source)
         else:
             print(f"Import format {format} not supported.")
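
inline_quality follows the same route as inline_definitions throughout: accepted by the constructor, forwarded to resolve.resolve_data_contract, and forced on for lint and export. A short sketch of the flag, in the same form the new catalog module uses it ("datacontract.yaml" is a placeholder path):

    from datacontract.data_contract import DataContract

    # "datacontract.yaml" is a placeholder; inline_quality resolves referenced
    # quality definitions into the specification before it is processed
    dc = DataContract(data_contract_file="datacontract.yaml", inline_definitions=True, inline_quality=True)
    print(dc.export(export_format="html"))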
 
datacontract/engines/fastjsonschema/check_jsonschema.py CHANGED
@@ -6,8 +6,7 @@ import fastjsonschema
 from datacontract.engines.fastjsonschema.s3.s3_read_files import yield_s3_files
 from datacontract.export.jsonschema_converter import to_jsonschema
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.exceptions import DataContractException
 from datacontract.model.run import Run, Check
 
datacontract/engines/soda/check_soda_execute.py CHANGED
@@ -3,20 +3,14 @@ import logging
 from pyspark.sql import SparkSession
 from soda.scan import Scan
 
-from datacontract.engines.soda.connections.bigquery import \
-    to_bigquery_soda_configuration
-from datacontract.engines.soda.connections.databricks import \
-    to_databricks_soda_configuration
+from datacontract.engines.soda.connections.bigquery import to_bigquery_soda_configuration
+from datacontract.engines.soda.connections.databricks import to_databricks_soda_configuration
 from datacontract.engines.soda.connections.duckdb import get_duckdb_connection
-from datacontract.engines.soda.connections.kafka import create_spark_session, \
-    read_kafka_topic
-from datacontract.engines.soda.connections.postgres import \
-    to_postgres_soda_configuration
-from datacontract.engines.soda.connections.snowflake import \
-    to_snowflake_soda_configuration
+from datacontract.engines.soda.connections.kafka import create_spark_session, read_kafka_topic
+from datacontract.engines.soda.connections.postgres import to_postgres_soda_configuration
+from datacontract.engines.soda.connections.snowflake import to_snowflake_soda_configuration
 from datacontract.export.sodacl_converter import to_sodacl_yaml
-from datacontract.model.data_contract_specification import \
-    DataContractSpecification, Server
+from datacontract.model.data_contract_specification import DataContractSpecification, Server
 from datacontract.model.run import Run, Check, Log
 
 
@@ -30,9 +24,9 @@ def check_soda_execute(
     run.log_info("Running engine soda-core")
     scan = Scan()
 
-    if server.type == "s3" or server.type == "local":
-        if server.format in ["json", "parquet", "csv"]:
-            con = get_duckdb_connection(data_contract, server)
+    if server.type in ["s3", "azure", "local"]:
+        if server.format in ["json", "parquet", "csv", "delta"]:
+            con = get_duckdb_connection(data_contract, server, run)
             scan.add_duckdb_connection(duckdb_connection=con, data_source_name=server.type)
             scan.set_data_source_name(server.type)
         else:
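
With this change, azure servers and the delta format take the same DuckDB route as s3 and local, and the Run is passed down so connection setup can log into the test run. A hedged sketch of exercising the branch ("datacontract.yaml" is a placeholder and is assumed to define a server of type s3, azure, or local with format json, parquet, csv, or delta):

    from datacontract.data_contract import DataContract

    # placeholder contract file with a matching server entry
    run = DataContract(data_contract_file="datacontract.yaml").test()
    print(run.result)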
datacontract/engines/soda/connections/duckdb.py CHANGED
@@ -1,23 +1,28 @@
-import logging
 import os
 
+from deltalake import DeltaTable
+
 import duckdb
 from datacontract.export.csv_type_converter import convert_to_duckdb_csv_type
+from datacontract.model.run import Run
 
 
-def get_duckdb_connection(data_contract, server):
+def get_duckdb_connection(data_contract, server, run: Run):
     con = duckdb.connect(database=":memory:")
     path: str = ""
     if server.type == "local":
         path = server.path
     if server.type == "s3":
         path = server.location
-        setup_s3_connection(con, server)
+        setup_s3_connection(con, server)
+    if server.type == "azure":
+        path = server.location
+        setup_azure_connection(con, server)
     for model_name, model in data_contract.models.items():
         model_path = path
         if "{model}" in model_path:
             model_path = model_path.format(model=model_name)
-        logging.info(f"Creating table {model_name} for {model_path}")
+        run.log_info(f"Creating table {model_name} for {model_path}")
 
         if server.format == "json":
             format = "auto"
@@ -34,6 +39,7 @@ def get_duckdb_connection(data_contract, server):
             """)
         elif server.format == "csv":
             columns = to_csv_types(model)
+            run.log_info("Using columns: " + str(columns))
             if columns is None:
                 con.sql(
                     f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1);"""
@@ -42,6 +48,21 @@ def get_duckdb_connection(data_contract, server):
                 con.sql(
                     f"""CREATE VIEW "{model_name}" AS SELECT * FROM read_csv('{model_path}', hive_partitioning=1, columns={columns});"""
                 )
+        elif server.format == "delta":
+            if server.type == "azure":
+                raise NotImplementedError("Support for Delta Tables on Azure Storage is not implemented yet")
+
+            storage_options = {
+                "AWS_ENDPOINT_URL": server.endpointUrl,
+                "AWS_ACCESS_KEY_ID": os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID"),
+                "AWS_SECRET_ACCESS_KEY": os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY"),
+                "AWS_REGION": os.getenv("DATACONTRACT_S3_REGION", "us-east-1"),
+                "AWS_ALLOW_HTTP": "True" if server.endpointUrl.startswith("http://") else "False",
+            }
+
+            delta_table_arrow = DeltaTable(model_path, storage_options=storage_options).to_pyarrow_dataset()
+
+            con.register(model_name, delta_table_arrow)
     return con
 
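The delta branch bypasses DuckDB's file readers entirely: deltalake loads the table and hands it to DuckDB as a PyArrow dataset. The same pattern in isolation (the table URI and credentials are placeholders):

    import duckdb
    from deltalake import DeltaTable

    # placeholder URI and credentials, mirroring the storage_options built above
    storage_options = {
        "AWS_ACCESS_KEY_ID": "<key-id>",
        "AWS_SECRET_ACCESS_KEY": "<secret>",
        "AWS_REGION": "us-east-1",
    }
    con = duckdb.connect(database=":memory:")
    dataset = DeltaTable("s3://my-bucket/my-table", storage_options=storage_options).to_pyarrow_dataset()
    con.register("my_model", dataset)  # queryable like a view named "my_model"
    print(con.sql('SELECT COUNT(*) FROM "my_model"').fetchall())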
 
@@ -59,18 +80,66 @@ def setup_s3_connection(con, server):
     s3_region = os.getenv("DATACONTRACT_S3_REGION")
     s3_access_key_id = os.getenv("DATACONTRACT_S3_ACCESS_KEY_ID")
     s3_secret_access_key = os.getenv("DATACONTRACT_S3_SECRET_ACCESS_KEY")
-    # con.install_extension("httpfs")
-    # con.load_extension("httpfs")
+    s3_endpoint = "s3.amazonaws.com"
+    use_ssl = "true"
+    url_style = "vhost"
     if server.endpointUrl is not None:
         s3_endpoint = server.endpointUrl.removeprefix("http://").removeprefix("https://")
         if server.endpointUrl.startswith("http://"):
-            con.sql("SET s3_use_ssl = 0; SET s3_url_style = 'path';")
-    con.sql(f"""
-        SET s3_endpoint = '{s3_endpoint}';
-    """)
+            use_ssl = "false"
+            url_style = 'path'
+
+
     if s3_access_key_id is not None:
         con.sql(f"""
-            SET s3_region = '{s3_region}';
-            SET s3_access_key_id = '{s3_access_key_id}';
-            SET s3_secret_access_key = '{s3_secret_access_key}';
-        """)
+            CREATE OR REPLACE SECRET s3_secret (
+                TYPE S3,
+                PROVIDER CREDENTIAL_CHAIN,
+                REGION '{s3_region}',
+                KEY_ID '{s3_access_key_id}',
+                SECRET '{s3_secret_access_key}',
+                ENDPOINT '{s3_endpoint}',
+                USE_SSL '{use_ssl}',
+                URL_STYLE '{url_style}'
+            );
+        """)
+
+    # con.sql(f"""
+    #     SET s3_region = '{s3_region}';
+    #     SET s3_access_key_id = '{s3_access_key_id}';
+    #     SET s3_secret_access_key = '{s3_secret_access_key}';
+    # """)
+    # else:
+    #     con.sql("""
+    #         RESET s3_region;
+    #         RESET s3_access_key_id;
+    #         RESET s3_secret_access_key;
+    #     """)
+    # con.sql("RESET s3_session_token")
+    # print(con.sql("SELECT * FROM duckdb_settings() WHERE name like 's3%'"))
+
+
+def setup_azure_connection(con, server):
+    tenant_id = os.getenv("DATACONTRACT_AZURE_TENANT_ID")
+    client_id = os.getenv("DATACONTRACT_AZURE_CLIENT_ID")
+    client_secret = os.getenv("DATACONTRACT_AZURE_CLIENT_SECRET")
+
+    if tenant_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_TENANT_ID is not set")
+    if client_id is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_ID is not set")
+    if client_secret is None:
+        raise ValueError("Error: Environment variable DATACONTRACT_AZURE_CLIENT_SECRET is not set")
+
+    con.install_extension("azure")
+    con.load_extension("azure")
+
+    con.sql(f"""
+        CREATE SECRET azure_spn (
+            TYPE AZURE,
+            PROVIDER SERVICE_PRINCIPAL,
+            TENANT_ID '{tenant_id}',
+            CLIENT_ID '{client_id}',
+            CLIENT_SECRET '{client_secret}'
+        );
+    """)